├── .gitignore
├── AGAThA.sh
├── AGAThA
│   ├── LICENSE
│   ├── Makefile
│   ├── README.md
│   ├── build.sh
│   ├── configure.sh
│   ├── src
│   │   ├── Timer.h
│   │   ├── args_parser.cpp
│   │   ├── args_parser.h
│   │   ├── ctors.cpp
│   │   ├── ctors.h
│   │   ├── gasal.h
│   │   ├── gasal_align.cu
│   │   ├── gasal_align.h
│   │   ├── gasal_header.h
│   │   ├── gasal_kernels.h
│   │   ├── host_batch.cpp
│   │   ├── host_batch.h
│   │   ├── interfaces.cpp
│   │   ├── interfaces.h
│   │   ├── kernels
│   │   │   ├── agatha_kernel.h
│   │   │   └── pack_rc_seqs.h
│   │   ├── res.cpp
│   │   └── res.h
│   └── test_prog
│       ├── Makefile
│       ├── README.md
│       ├── Timer.h
│       └── test_prog.cpp
├── README.md
├── dataset
│   ├── query.fasta
│   └── ref.fasta
├── docker
│   ├── Dockerfile
│   ├── build.sh
│   └── launch.sh
└── misc
    └── avg_time.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | */lib/*
 2 | */include/*
 3 | */obj/*
 4 | .vscode/*
 5 | *.log
 6 | output/
 7 | 
 8 | *.o
 9 | *.cuo
10 | *.cppo
11 | *.out
12 | *.txt
13 | 
14 | *.json
15 | 
16 | #Temporary addition
17 | manual
--------------------------------------------------------------------------------
/AGAThA.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | MAIN_DIR="/agatha_ae/"                #the main directory
 3 | PROG_DIR=$MAIN_DIR"AGAThA/test_prog/" #the directory where the test program is
 4 | OUTPUT_DIR=$MAIN_DIR"output/"         #the directory for RAW_FILE, FINAL_FILE and SCORE_FILE
 5 | DATASET_DIR=$MAIN_DIR"dataset/"       #the directory where the input dataset is located
 6 | FINAL_DIR=$PWD
 7 | 
 8 | RAW_FILE=$OUTPUT_DIR"raw.log"     #stores the kernel execution times of all iterations
 9 | FINAL_FILE=$OUTPUT_DIR"time.json" #stores the average total kernel execution time of a single iteration
10 | SCORE_FILE=$OUTPUT_DIR"score.log" #stores the scores after alignment
11 | 
12 | ITER=1  #number of iterations of each program
13 | IDLE=5  #seconds to sleep between iterations
14 | DATASET_NAME="test" #the name of the current dataset (shown in FINAL_FILE)
15 | PROCESS="AGAThA"    #the process name (shown in FINAL_FILE)
16 | 
17 | while getopts "i:" opt
18 | do
19 |     case "$opt" in
20 |     i ) ITER="$OPTARG" ;;
21 |     esac
22 | done
23 | 
24 | mkdir -p "$OUTPUT_DIR" #create the output directory
25 | 
26 | echo ">>> Running $PROCESS for $ITER iterations."
27 | 
28 | if [ -f "$RAW_FILE" ]; then #remove the output files before running the program
29 |     rm "$RAW_FILE"
30 | fi
31 | 
32 | if [ -f "$SCORE_FILE" ]; then
33 |     rm "$SCORE_FILE"
34 | fi
35 | 
36 | if [ -f "$FINAL_FILE" ]; then
37 |     rm "$FINAL_FILE"
38 | fi
39 | 
40 | iter=0 #start the main program
41 | while [ "$iter" -lt "$ITER" ]
42 | do
43 |     echo ">> Iteration $(($iter+1))"
44 |     ${PROG_DIR}manual -p -m 1 -x 4 -q 6 -r 2 -s 3 -z 400 -w 751 ${DATASET_DIR}ref.fasta ${DATASET_DIR}query.fasta ${RAW_FILE} > ${SCORE_FILE}
45 |     ((iter++))
46 |     sleep ${IDLE}s
47 | done
48 | 
49 | echo "$PROCESS complete."
50 | echo "Creating output files..." #create the additional output files
51 | 
52 | python3 /agatha_ae/misc/avg_time.py $PROCESS $DATASET_NAME ${RAW_FILE} ${FINAL_FILE} $ITER
53 | 
54 | echo "Complete."
--------------------------------------------------------------------------------
/AGAThA/LICENSE:
--------------------------------------------------------------------------------
 1 |                                  Apache License
 2 |                            Version 2.0, January 2004
 3 |                         http://www.apache.org/licenses/
 4 | 
 5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 6 | 
 7 |    1. Definitions.
 8 | 
 9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /AGAThA/Makefile: -------------------------------------------------------------------------------- 1 | GPU_SM_ARCH=sm_86 2 | #GPU_SM_ARCH=sm_75 3 | MAX_QUERY_LEN=10000 4 | N_CODE=0x4E 5 | N_PENALTY=1 6 | 7 | GPU_COMPUTE_ARCH=$(subst sm,compute,$(GPU_SM_ARCH)) 8 | NVCC=/usr/local/cuda/bin/nvcc 9 | CC=g++ 10 | SRC_DIR=./src/ 11 | OBJ_DIR=./obj/ 12 | LIB_DIR=./lib/ 13 | INCLUDE_DIR=./include/ 14 | 15 | SOURCES= args_parser.cpp host_batch.cpp ctors.cpp interfaces.cpp res.cpp gasal_align.cu 16 | LOBJS=$(patsubst %,%o,$(SOURCES)) 17 | 18 | LOBJS_PATH=$(addprefix $(OBJ_DIR),$(LOBJS)) 19 | VPATH=src:obj:lib 20 | YELLOW=\033[1;33m 21 | NC=\033[0m # No Color 22 | 23 | ifeq ($(GPU_SM_ARCH),) 24 | error1: 25 | @echo "Must specify GPU architecture as sm_xx" 26 | endif 27 | ifeq ($(MAX_QUERY_LEN),) 28 | error2: 29 | @echo "Must specify maximum sequence length" 30 | endif 31 | 32 | ifeq ($(N_CODE),) 33 | error3: 34 | @echo "Must specify the code for 'N'" 35 | endif 36 | #ifneq ($(GPU_SM_ARCH),clean) 37 | 38 | 39 | 40 | 41 | ## If your computer ships gcc-5.3.1 (at least for CUDA 8.0), this is the regular line. You might need to add: --compiler-options -fPIC 42 | ## With Debian and clang, use: $(NVCC) -ccbin clang-3.8 --compiler-options -fpie 43 | 44 | ifeq ($(N_PENALTY),) 45 | %.cuo: %.cu 46 | $(NVCC) -c -g -O3 -std=c++11 -Xcompiler -Wall,-DMAX_QUERY_LEN=$(MAX_QUERY_LEN),-DN_CODE=$(N_CODE) -Xptxas -Werror --gpu-architecture=$(GPU_COMPUTE_ARCH) --gpu-code=$(GPU_SM_ARCH) -lineinfo --ptxas-options=-v --default-stream per-thread $< -o $(OBJ_DIR)$@ 47 | 48 | else 49 | %.cuo: %.cu 50 | $(NVCC) -c -g -O3 -std=c++11 -Xcompiler -Wall,-DMAX_QUERY_LEN=$(MAX_QUERY_LEN),-DN_CODE=$(N_CODE),-DN_PENALTY=$(N_PENALTY) -Xptxas -Werror --gpu-architecture=$(GPU_COMPUTE_ARCH) --gpu-code=$(GPU_SM_ARCH) -lineinfo --ptxas-options=-v --default-stream per-thread $< -o $(OBJ_DIR)$@ 51 | 52 | endif 53 | 54 | 55 | 56 | ## If your computer ships gcc-5.3.1 (at least for CUDA 8.0), this is the regular line. 
You might need to add: -fPIC 57 | ifeq ($(N_PENALTY),) 58 | %.cppo: %.cpp 59 | $(CC) -c -g -O3 -std=c++11 -Wall -DMAX_QUERY_LEN=$(MAX_QUERY_LEN) -DN_CODE=$(N_CODE) -Werror $< -o $(OBJ_DIR)$@ 60 | 61 | else 62 | %.cppo: %.cpp 63 | $(CC) -c -g -O3 -std=c++11 -Wall -DMAX_QUERY_LEN=$(MAX_QUERY_LEN) -DN_CODE=$(N_CODE) -DN_PENALTY=$(N_PENALTY) -Werror $< -o $(OBJ_DIR)$@ 64 | 65 | endif 66 | 67 | 68 | all: clean makedir libgasal.a 69 | 70 | makedir: 71 | @mkdir -p $(OBJ_DIR) 72 | @mkdir -p $(LIB_DIR) 73 | @mkdir -p $(INCLUDE_DIR) 74 | @cp $(SRC_DIR)/*.h $(INCLUDE_DIR) 75 | @sed -i "s/MAX_QUERY_LEN=[0-9]\{1,9\}/MAX_QUERY_LEN=$(MAX_QUERY_LEN)/" ./test_prog/Makefile 76 | 77 | ifeq ($(N_PENALTY),) 78 | libgasal.a: $(LOBJS) 79 | ar -csru $(LIB_DIR)$@ $(LOBJS_PATH) 80 | @echo "" 81 | @echo -e "${YELLOW}WARNING:${NC}\"N_PENALTY\" is not defined" 82 | else 83 | libgasal.a: $(LOBJS) 84 | ar -csru $(LIB_DIR)$@ $(LOBJS_PATH) 85 | endif 86 | 87 | clean: 88 | rm -f -r $(OBJ_DIR) $(LIB_DIR) $(INCLUDE_DIR) *~ *.exe *.cppo *.cuo *.txt *~ 89 | 90 | gasal_align.cuo: gasal.h gasal_kernels.h 91 | 92 | 93 | -------------------------------------------------------------------------------- /AGAThA/README.md: -------------------------------------------------------------------------------- 1 | # AGAThA [![DOI](https://zenodo.org/badge/725514536.svg)](https://zenodo.org/doi/10.5281/zenodo.10225634) 2 | 3 | AGAThA is built on top of [GASAL2](https://github.com/nahmedraja/GASAL2.git). 4 | 5 | TBA 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /AGAThA/build.sh: -------------------------------------------------------------------------------- 1 | make 2 | cd test_prog 3 | make 4 | cd .. -------------------------------------------------------------------------------- /AGAThA/configure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | cuda_path=$1 5 | RED='\033[0;31m' 6 | NC='\033[0m' # No Color 7 | 8 | if [ "$cuda_path" = "" ]; then 9 | echo -e "${RED}Must provide path to CUDA installation directory${NC}" 10 | echo -e "${RED}Configuration incomplete${NC}" 11 | echo -e "${RED}Exiting${NC}" 12 | exit 1 13 | fi 14 | 15 | cuda_nvcc_path=$cuda_path/bin/nvcc 16 | 17 | if [ -f $cuda_nvcc_path ]; then 18 | echo "NVCC found ($cuda_nvcc_path)" 19 | else 20 | echo -e "${RED}NVCC not found${NC}" 21 | echo -e "${RED}Configuration incomplete${NC}" 22 | echo -e "${RED}Exiting${NC}" 23 | exit 1 24 | fi 25 | 26 | 27 | cuda_lib_path="${cuda_path}/targets/x86_64-linux/lib" 28 | 29 | 30 | if [ -d $cuda_lib_path ]; then 31 | echo "CUDA runtime library found (${cuda_lib_path})" 32 | else 33 | echo -e "${RED}CUDA runtime library not found${NC}" 34 | echo -e "${RED}Configuration incomplete${NC}" 35 | echo -e "${RED}Exiting${NC}" 36 | exit 1 37 | fi 38 | 39 | cuda_runtime_file="${cuda_path}/targets/x86_64-linux/include/cuda_runtime.h" 40 | 41 | if [ -f $cuda_runtime_file ]; then 42 | echo "CUDA runtime header file found (${cuda_runtime_file})" 43 | else 44 | echo -e "${RED}CUDA runtime header file not found${NC}" 45 | echo -e "${RED}Configuration incomplete${NC}" 46 | echo -e "${RED}Exiting${NC}" 47 | exit 1 48 | fi 49 | 50 | 51 | echo "Configuring Makefile..." 52 | 53 | sed -i "s,NVCC=.*,NVCC=$cuda_nvcc_path,g" Makefile 54 | 55 | echo "Configuring gasal.h..." 56 | 57 | sed -i "s,.*cuda_runtime\.h\",\#include \"$cuda_runtime_file\",g" ./src/gasal.h 58 | 59 | echo "Configuring Makefile of test program..." 
60 | 
61 | sed -i "s,CUDA_LD_LIBRARY=.*,CUDA_LD_LIBRARY=$cuda_lib_path,g" ./test_prog/Makefile
62 | 
63 | #mkdir -p include
64 | 
65 | #cp ./src/gasal.h ./include
66 | 
67 | echo "Done"
68 | 
69 | 
70 | 
--------------------------------------------------------------------------------
/AGAThA/src/Timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef TIMER_H
 2 | #define TIMER_H
 3 | 
 4 | #include <sys/time.h>
 5 | #include <cstdio>
 6 | #include <string>
 7 | #include <iostream>
 8 | 
 9 | class Timer
10 | {
11 | private:
12 |     struct timeval startTime;
13 |     struct timeval stopTime;
14 |     double elapsedTime;
15 |     std::string name;
16 | 
17 | public:
18 |     Timer(std::string n) { name = n; elapsedTime = 0.0;}
19 |     Timer() { name = ""; elapsedTime = 0.0;}
20 |     void Clear() { elapsedTime = 0.0; }
21 |     void Start() { gettimeofday(&(startTime), NULL); }
22 |     void Restart()
23 |     {
24 |         elapsedTime = 0.0;
25 |         gettimeofday(&(startTime), NULL);
26 |     }
27 | 
28 |     void Pause()
29 |     {
30 |         gettimeofday(&(stopTime), NULL);
31 | 
32 |         elapsedTime += ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0; // sec to ms
33 |         elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms
34 |     }
35 | 
36 |     void Stop()
37 |     {
38 |         gettimeofday(&(stopTime), NULL);
39 | 
40 |         elapsedTime = ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0; // sec to ms
41 |         elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms
42 |     }
43 | 
44 |     void Print()
45 |     {
46 |         std::cout << name << " : " << elapsedTime << " msec" << std::endl;
47 |     }
48 | 
49 |     double GetTime() { return elapsedTime;}
50 | 
51 | };
52 | 
53 | 
54 | #endif
55 | 
--------------------------------------------------------------------------------
/AGAThA/src/args_parser.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <string>
  3 | 
  4 | #include "args_parser.h"
  5 | 
  6 | 
  7 | 
  8 | Parameters::Parameters(int argc_, char **argv_) {
  9 | 
 10 | 
 11 |     // default values
 12 |     sa = (2);
 13 |     sb = (4);
 14 |     gapo = (4);
 15 |     gape = (2);
 16 | 
 17 |     print_out = (0);
 18 |     n_threads = (1);
 19 |     // For AGAThA
 20 |     slice_width = (3);
 21 |     z_threshold = (400);
 22 |     band_width = (751);
 23 |     kernel_align_num = (8192);
 24 |     kernel_block_num = (256);
 25 |     kernel_thread_num = (256);
 26 | 
 27 |     isPacked = false;
 28 |     isReverseComplement = false;
 29 | 
 30 |     query_batch_fasta_filename = "";
 31 |     target_batch_fasta_filename = "";
 32 |     raw_filename = "";
 33 | 
 34 |     argc = argc_;
 35 |     argv = argv_;
 36 | 
 37 | }
 38 | 
 39 | Parameters::~Parameters() {
 40 |     query_batch_fasta.close();
 41 |     target_batch_fasta.close();
 42 |     raw_file.close();
 43 | }
 44 | 
 45 | void Parameters::print() {
 46 |     std::cerr << "sa=" << sa <<" , sb=" << sb <<" , gapo=" << gapo << " , gape="<< gape << std::endl;
 47 | }
 73 | void Parameters::help() {
 74 |     std::cerr << "Usage: ./test_prog.out [options] <query_batch.fasta> <target_batch.fasta> <raw_file> " << std::endl;
 75 |     std::cerr << "Options: -m INT match score ["<< sa <<"]" << std::endl;
 76 |     std::cerr << " -x INT mismatch penalty [" << sb << "]"<< std::endl;
 77 |     std::cerr << " -q INT gap open penalty [" << gapo << "]" << std::endl;
 78 |     std::cerr << " -r INT gap extension penalty ["<< gape <<"]" << std::endl;
 79 |     std::cerr << " -s (AGAThA) slice_width" << std::endl;
 80 |     std::cerr << " -z (AGAThA) z-drop threshold" << std::endl;
 81 |     std::cerr << " -w (AGAThA) band width" << std::endl;
 82 |     std::cerr << " -b (AGAThA) number of blocks called per kernel" << std::endl;
 83 |     std::cerr << " -t (AGAThA) number of threads in a block called per kernel" << std::endl;
 84 |     std::cerr << " -a (AGAThA) number of alignments computed per kernel" << std::endl;
 85 |     std::cerr << " -p 
print the alignment results and time" << std::endl; 86 | std::cerr << " -n INT Number of CPU threads ["<< n_threads<<"]" << std::endl; 87 | std::cerr << " --help, -h : displays this message." << std::endl; 88 | std::cerr << "Single-pack multi-Parameters (e.g. -sp) is not supported." << std::endl; 89 | std::cerr << " " << std::endl; 90 | } 91 | 92 | 93 | void Parameters::parse() { 94 | 95 | // before testing anything, check if calling for help. 96 | int c; 97 | 98 | std::string arg_next = ""; 99 | std::string arg_cur = ""; 100 | 101 | for (c = 1; c < argc; c++) 102 | { 103 | arg_cur = std::string((const char*) (*(argv + c) ) ); 104 | arg_next = ""; 105 | if (!arg_cur.compare("--help") || !arg_cur.compare("-h")) 106 | { 107 | help(); 108 | exit(0); 109 | } 110 | } 111 | 112 | if (argc < 4) 113 | { 114 | failure(NOT_ENOUGH_ARGS); 115 | } 116 | 117 | for (c = 1; c < argc - 3; c++) 118 | { 119 | arg_cur = std::string((const char*) (*(argv + c) ) ); 120 | if (arg_cur.at(0) == '-' && arg_cur.at(1) == '-' ) 121 | { 122 | if (!arg_cur.compare("--help")) 123 | { 124 | help(); 125 | exit(0); 126 | } 127 | 128 | } else if (arg_cur.at(0) == '-' ) 129 | { 130 | if (arg_cur.length() > 2) 131 | failure(WRONG_ARG); 132 | char param = arg_cur.at(1); 133 | switch(param) 134 | { 135 | case 'm': 136 | c++; 137 | arg_next = std::string((const char*) (*(argv + c) ) ); 138 | sa = std::stoi(arg_next); 139 | break; 140 | case 'x': 141 | c++; 142 | arg_next = std::string((const char*) (*(argv + c) ) ); 143 | sb = std::stoi(arg_next); 144 | break; 145 | case 'q': 146 | c++; 147 | arg_next = std::string((const char*) (*(argv + c) ) ); 148 | gapo = std::stoi(arg_next); 149 | break; 150 | case 'r': 151 | c++; 152 | arg_next = std::string((const char*) (*(argv + c) ) ); 153 | gape = std::stoi(arg_next); 154 | break; 155 | case 'p': 156 | print_out = 1; 157 | break; 158 | case 'n': 159 | c++; 160 | arg_next = std::string((const char*) (*(argv + c) ) ); 161 | n_threads = std::stoi(arg_next); 162 | break; 163 | case 's': 164 | c++; 165 | arg_next = std::string((const char*) (*(argv + c) ) ); 166 | slice_width = std::stoi(arg_next); 167 | break; 168 | case 'z': 169 | c++; 170 | arg_next = std::string((const char*) (*(argv + c) ) ); 171 | z_threshold = std::stoi(arg_next); 172 | break; 173 | case 'w': 174 | c++; 175 | arg_next = std::string((const char*) (*(argv + c) ) ); 176 | band_width = std::stoi(arg_next); 177 | break; 178 | case 'b': 179 | c++; 180 | arg_next = std::string((const char*) (*(argv + c) ) ); 181 | kernel_block_num = std::stoi(arg_next); 182 | break; 183 | case 't': 184 | c++; 185 | arg_next = std::string((const char*) (*(argv + c) ) ); 186 | kernel_thread_num = std::stoi(arg_next); 187 | break; 188 | case 'a': 189 | c++; 190 | arg_next = std::string((const char*) (*(argv + c) ) ); 191 | kernel_align_num = std::stoi(arg_next); 192 | break; 193 | 194 | } 195 | 196 | 197 | } else { 198 | failure(WRONG_ARG); 199 | } 200 | } 201 | 202 | 203 | // the last 2 Parameters are the 2 filenames. 204 | query_batch_fasta_filename = std::string( (const char*) (*(argv + c) ) ); 205 | c++; 206 | target_batch_fasta_filename = std::string( (const char*) (*(argv + c) ) ); 207 | 208 | if (print_out) { 209 | c++; 210 | raw_filename = std::string( (const char*) (*(argv + c) ) ); 211 | } 212 | 213 | // Parameters retrieved successfully, open files. 
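    // Example: with the invocation used by AGAThA.sh,
    //   ./manual -p -m 1 -x 4 -q 6 -r 2 -s 3 -z 400 -w 751 ref.fasta query.fasta raw.log
    // parse() ends up with sa=1, sb=4, gapo=6, gape=2, slice_width=3,
    // z_threshold=400, band_width=751, print_out=1, and the positional
    // arguments mapped in order: query_batch_fasta_filename="ref.fasta",
    // target_batch_fasta_filename="query.fasta", raw_filename="raw.log"
    // (the raw file argument is only read when -p is given).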
214 |     fileopen();
215 | }
216 | 
217 | void Parameters::fileopen() {
218 |     query_batch_fasta.open(query_batch_fasta_filename, std::ifstream::in);
219 |     if (!query_batch_fasta)
220 |         failure(WRONG_FILES);
221 | 
222 |     target_batch_fasta.open(target_batch_fasta_filename);
223 |     if (!target_batch_fasta)
224 |         failure(WRONG_FILES);
225 | 
226 |     if (print_out) {
227 |         raw_file.open(raw_filename, std::ios::app);
228 |     }
229 | }
230 | 
--------------------------------------------------------------------------------
/AGAThA/src/args_parser.h:
--------------------------------------------------------------------------------
 1 | #ifndef ARGS_PARSER_H
 2 | #define ARGS_PARSER_H
 3 | 
 4 | /*
 5 | #include 
 6 | 
 7 | 
 8 | #include "gasal.h"
 9 | */
10 | #include <string>
11 | #include <fstream>
12 | #include "gasal.h"
13 | #include <cstdint>
14 | 
15 | 
16 | enum fail_type {
17 |     NOT_ENOUGH_ARGS,
18 |     TOO_MANY_ARGS,
19 |     WRONG_ARG,
20 |     WRONG_FILES,
21 |     WRONG_ALGO
22 | };
23 | 
24 | class Parameters{
25 | 
26 | public:
27 |     Parameters(int argc, char** argv);
28 |     ~Parameters();
29 |     void print();
30 |     void failure(fail_type f);
31 |     void help();
32 |     void parse();
33 |     void fileopen();
34 | 
35 |     int32_t sa;
36 |     int32_t sb;
37 |     int32_t gapo;
38 |     int32_t gape;
39 | 
40 |     int print_out;
41 |     int n_threads;
42 | 
43 |     int slice_width;
44 |     int z_threshold;
45 |     int band_width;
46 | 
47 |     int32_t kernel_block_num;
48 |     int32_t kernel_thread_num;
49 |     int32_t kernel_align_num;
50 | 
51 |     bool isPacked;
52 |     bool isReverseComplement;
53 | 
54 |     std::string query_batch_fasta_filename;
55 |     std::string target_batch_fasta_filename;
56 |     std::string raw_filename;
57 | 
58 |     std::ifstream query_batch_fasta;
59 |     std::ifstream target_batch_fasta;
60 |     std::ofstream raw_file;
61 | 
62 | 
63 | protected:
64 | 
65 | private:
66 |     int argc;
67 |     char** argv;
68 | };
69 | 
70 | 
71 | #endif
72 | 
--------------------------------------------------------------------------------
/AGAThA/src/ctors.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "gasal.h"
 3 | 
 4 | #include "args_parser.h"
 5 | 
 6 | #include "host_batch.h"
 7 | 
 8 | #include "res.h"
 9 | 
10 | #include "ctors.h"
11 | 
12 | #include "interfaces.h"
13 | 
14 | #include <cstdlib>
15 | 
16 | 
17 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams) {
18 |     gasal_gpu_storage_v v;
19 |     v.a = (gasal_gpu_storage_t*)calloc(n_streams, sizeof(gasal_gpu_storage_t));
20 |     v.n = n_streams;
21 |     return v;
22 | 
23 | }
24 | 
25 | 
26 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec, int max_query_len, int max_target_len, int32_t maximum_sequence_length, Parameters *params) {
27 | 
28 |     cudaError_t err;
29 |     int max_n_alns = params->kernel_align_num;
30 |     int i;
31 |     int max_query_len_8 = max_query_len % 8 ? max_query_len + (8 - (max_query_len % 8)) : max_query_len;
32 |     int max_target_len_8 = max_target_len % 8 ? 
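/* Note: both lengths are rounded up to the next multiple of 8 because the
   packing step stores 8 bases per 32-bit word (4 bits per base), so every
   sequence slot handed to the GPU must be 8-base aligned. A minimal sketch of
   the same idiom used on these two lines:
     int round_up_8(int len) { return len % 8 ? len + (8 - len % 8) : len; }
*/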
max_target_len + (8 - (max_target_len % 8)) : max_target_len; 33 | 34 | int host_max_query_batch_bytes = max_n_alns * max_query_len_8; 35 | int gpu_max_query_batch_bytes = max_n_alns * max_query_len_8; 36 | int host_max_target_batch_bytes = max_n_alns * max_target_len_8; 37 | int gpu_max_target_batch_bytes = max_n_alns * max_target_len_8; 38 | int host_max_n_alns = max_n_alns; 39 | int gpu_max_n_alns = max_n_alns; 40 | 41 | 42 | 43 | for (i = 0; i < gpu_storage_vec->n; i++) { 44 | 45 | gpu_storage_vec->a[i].extensible_host_unpacked_query_batch = gasal_host_batch_new(host_max_query_batch_bytes, 0); 46 | gpu_storage_vec->a[i].extensible_host_unpacked_target_batch = gasal_host_batch_new(host_max_target_batch_bytes, 0); 47 | 48 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t))); 49 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t))); 50 | 51 | 52 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_op), host_max_n_alns * sizeof(uint8_t), cudaHostAllocDefault)); 53 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_op), host_max_n_alns * sizeof(uint8_t), cudaHostAllocDefault)); 54 | uint8_t *no_ops = NULL; 55 | no_ops = (uint8_t*) calloc(host_max_n_alns * sizeof(uint8_t), sizeof(uint8_t)); 56 | gasal_op_fill(&(gpu_storage_vec->a[i]), no_ops, host_max_n_alns, QUERY); 57 | gasal_op_fill(&(gpu_storage_vec->a[i]), no_ops, host_max_n_alns, TARGET); 58 | free(no_ops); 59 | 60 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_op), gpu_max_n_alns * sizeof(uint8_t))); 61 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_op), gpu_max_n_alns * sizeof(uint8_t))); 62 | 63 | 64 | 65 | if (params->isPacked) 66 | { 67 | gpu_storage_vec->a[i].packed_query_batch = (uint32_t *) gpu_storage_vec->a[i].unpacked_query_batch; 68 | gpu_storage_vec->a[i].packed_target_batch = (uint32_t *) gpu_storage_vec->a[i].unpacked_target_batch; 69 | 70 | } else { 71 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_query_batch), (gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 72 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_target_batch), (gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 73 | } 74 | 75 | gpu_storage_vec->a[i].host_seed_scores = NULL; 76 | gpu_storage_vec->a[i].seed_scores = NULL; 77 | 78 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_batch_lens), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 79 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_batch_lens), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 80 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_batch_offsets), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 81 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_batch_offsets), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 82 | 83 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 84 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 85 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 86 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 87 | 88 | // For AGAThA 89 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].global_buffer), 
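/* Note on the allocation size below: the scratch buffer holds three short2
   entries per sequence position for every 8-thread group across all blocks
   (maximum_sequence_length * (kernel_thread_num/8) * kernel_block_num * 3),
   followed by one short2 per alignment (kernel_align_num). agatha_sort fills
   that per-alignment tail, and agatha_kernel_launcher in gasal_align.cu sorts
   it on the host by the .x field -- the "Sort for Uneven Bucketing" step. */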
sizeof(short2)*(maximum_sequence_length*(params->kernel_thread_num/8)*(params->kernel_block_num)*3+(params->kernel_align_num)))); 90 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_buffer), sizeof(int32_t)*max_n_alns, cudaHostAllocDefault)); 91 | 92 | gpu_storage_vec->a[i].host_res = gasal_res_new_host(host_max_n_alns, params); 93 | gpu_storage_vec->a[i].device_cpy = gasal_res_new_device_cpy(max_n_alns, params); 94 | gpu_storage_vec->a[i].device_res = gasal_res_new_device(gpu_storage_vec->a[i].device_cpy); 95 | 96 | gpu_storage_vec->a[i].host_res_second = NULL; 97 | gpu_storage_vec->a[i].device_cpy_second = NULL; 98 | gpu_storage_vec->a[i].device_res_second = NULL; 99 | 100 | CHECKCUDAERROR(cudaStreamCreate(&(gpu_storage_vec->a[i].str))); 101 | gpu_storage_vec->a[i].is_free = 1; 102 | gpu_storage_vec->a[i].host_max_query_batch_bytes = host_max_query_batch_bytes; 103 | gpu_storage_vec->a[i].host_max_target_batch_bytes = host_max_target_batch_bytes; 104 | gpu_storage_vec->a[i].host_max_n_alns = host_max_n_alns; 105 | gpu_storage_vec->a[i].gpu_max_query_batch_bytes = gpu_max_query_batch_bytes; 106 | gpu_storage_vec->a[i].gpu_max_target_batch_bytes = gpu_max_target_batch_bytes; 107 | gpu_storage_vec->a[i].gpu_max_n_alns = gpu_max_n_alns; 108 | gpu_storage_vec->a[i].current_n_alns = 0; 109 | // For AGAThA 110 | gpu_storage_vec->a[i].slice_width = params->slice_width; 111 | gpu_storage_vec->a[i].maximum_sequence_length = maximum_sequence_length; 112 | } 113 | } 114 | 115 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params) { 116 | 117 | cudaError_t err; 118 | 119 | int i; 120 | for (i = 0; i < gpu_storage_vec->n; i ++) { 121 | 122 | gasal_host_batch_destroy(gpu_storage_vec->a[i].extensible_host_unpacked_query_batch); 123 | gasal_host_batch_destroy(gpu_storage_vec->a[i].extensible_host_unpacked_target_batch); 124 | 125 | gasal_res_destroy_host(gpu_storage_vec->a[i].host_res); 126 | gasal_res_destroy_device(gpu_storage_vec->a[i].device_res, gpu_storage_vec->a[i].device_cpy); 127 | 128 | if (gpu_storage_vec->a[i].seed_scores != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].seed_scores)); 129 | if (gpu_storage_vec->a[i].host_seed_scores != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_seed_scores)); 130 | 131 | 132 | if (gpu_storage_vec->a[i].query_op != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_op)); 133 | if (gpu_storage_vec->a[i].target_op != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_op)); 134 | if (gpu_storage_vec->a[i].host_query_op != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_op)); 135 | if (gpu_storage_vec->a[i].host_target_op != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_op)); 136 | 137 | if (gpu_storage_vec->a[i].host_query_batch_offsets != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_batch_offsets)); 138 | if (gpu_storage_vec->a[i].host_target_batch_offsets != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_batch_offsets)); 139 | if (gpu_storage_vec->a[i].host_query_batch_lens != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_batch_lens)); 140 | if (gpu_storage_vec->a[i].host_target_batch_lens != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_batch_lens)); 141 | //if (gpu_storage_vec->a[i].host_res->cigar != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_res->cigar)); 142 | 143 | // For AGAThA 144 | if (gpu_storage_vec->a[i].global_buffer != NULL) 
CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].global_buffer)); 145 | if (gpu_storage_vec->a[i].host_buffer != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_buffer)); 146 | 147 | if (gpu_storage_vec->a[i].unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].unpacked_query_batch)); 148 | if (gpu_storage_vec->a[i].unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].unpacked_target_batch)); 149 | if (!(params->isPacked)) 150 | { 151 | if (gpu_storage_vec->a[i].packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_query_batch)); 152 | if (gpu_storage_vec->a[i].packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_target_batch)); 153 | } 154 | 155 | 156 | if (gpu_storage_vec->a[i].query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_batch_offsets)); 157 | if (gpu_storage_vec->a[i].target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_batch_offsets)); 158 | if (gpu_storage_vec->a[i].query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_batch_lens)); 159 | if (gpu_storage_vec->a[i].target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_batch_lens)); 160 | if (gpu_storage_vec->a[i].packed_tb_matrices != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_tb_matrices)); 161 | 162 | if (gpu_storage_vec->a[i].str != NULL)CHECKCUDAERROR(cudaStreamDestroy(gpu_storage_vec->a[i].str)); 163 | } 164 | 165 | 166 | 167 | } 168 | 169 | 170 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec) { 171 | 172 | if(gpu_storage_vec->a != NULL) free(gpu_storage_vec->a); 173 | } 174 | 175 | 176 | 177 | 178 | // Deprecated 179 | void gasal_gpu_mem_alloc(gasal_gpu_storage_t *gpu_storage, int gpu_max_query_batch_bytes, int gpu_max_target_batch_bytes, int gpu_max_n_alns, Parameters *params) { 180 | 181 | cudaError_t err; 182 | // if (gpu_storage->gpu_max_query_batch_bytes % 8) { 183 | // fprintf(stderr, "[GASAL ERROR:] max_query_batch_bytes=%d is not a multiple of 8\n", gpu_storage->gpu_max_query_batch_bytes % 8); 184 | // exit(EXIT_FAILURE); 185 | // } 186 | // if (gpu_storage->gpu_max_target_batch_bytes % 8) { 187 | // fprintf(stderr, "[GASAL ERROR:] max_target_batch_bytes=%d is not a multiple of 8\n", gpu_storage->gpu_max_target_batch_bytes % 8); 188 | // exit(EXIT_FAILURE); 189 | // } 190 | 191 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t))); 192 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t))); 193 | 194 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 195 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 196 | 197 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 198 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 199 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 200 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 201 | 202 | gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy); 203 | 204 | gpu_storage->gpu_max_query_batch_bytes = gpu_max_query_batch_bytes; 205 | 
gpu_storage->gpu_max_target_batch_bytes = gpu_max_target_batch_bytes;
206 |     gpu_storage->gpu_max_n_alns = gpu_max_n_alns;
207 | 
208 | }
209 | 
210 | // Deprecated
211 | void gasal_gpu_mem_free(gasal_gpu_storage_t *gpu_storage, Parameters *params) {
212 | 
213 |     cudaError_t err;
214 | 
215 |     if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch));
216 |     if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch));
217 |     if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch));
218 |     if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch));
219 |     if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets));
220 |     if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets));
221 |     if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens));
222 |     if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens));
223 | 
224 |     gasal_res_destroy_device(gpu_storage->device_res,gpu_storage->device_cpy);
225 | 
226 | }
227 | 
--------------------------------------------------------------------------------
/AGAThA/src/ctors.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CTORS_H__
 2 | #define __CTORS_H__
 3 | 
 4 | 
 5 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams);
 6 | 
 7 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec, int max_query_len, int max_target_len, int32_t maximum_sequence_length, Parameters *params);
 8 | 
 9 | void gasal_gpu_mem_alloc(gasal_gpu_storage_t *gpu_storage, int gpu_max_query_batch_bytes, int gpu_max_target_batch_bytes, int gpu_max_n_alns, Parameters *params);
10 | 
11 | void gasal_gpu_mem_free(gasal_gpu_storage_t *gpu_storage, Parameters *params);
12 | 
13 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params);
14 | 
15 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec);
16 | 
17 | #endif
18 | 
--------------------------------------------------------------------------------
/AGAThA/src/gasal.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GASAL_H__
 2 | #define __GASAL_H__
 3 | 
 4 | 
 5 | #include <stdlib.h>
 6 | #include <stdint.h>
 7 | 
 8 | #include "/usr/local/cuda-11.7/targets/x86_64-linux/include/cuda_runtime.h"
 9 | 
10 | #ifndef HOST_MALLOC_SAFETY_FACTOR
11 | #define HOST_MALLOC_SAFETY_FACTOR 5
12 | #endif
13 | 
14 | #define CHECKCUDAERROR(error) \
15 |     do{\
16 |         err = error;\
17 |         if (cudaSuccess != err ) { \
18 |             fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err, __LINE__, __FILE__); \
19 |             exit(EXIT_FAILURE);\
20 |         }\
21 |     }while(0)\
22 | 
23 | 
24 | inline int CudaCheckKernelLaunch()
25 | {
26 |     cudaError err = cudaGetLastError();
27 |     if ( cudaSuccess != err )
28 |     {
29 |         return -1;
30 |     }
31 | 
32 |     return 0;
33 | }
34 | 
35 | 
36 | enum comp_start{
37 |     WITHOUT_START,
38 |     WITH_START,
39 |     WITH_TB
40 | };
41 | 
42 | // Generic enum for true/false. Using this instead of bool to generalize templates out of Int values for secondBest.
43 | // Can be used more generically, for example for WITH_/WITHOUT_START. 
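// Note: CHECKCUDAERROR (defined above) assigns into a local variable named
// `err`, so every function that uses it must declare one in scope first:
//
//   cudaError_t err;
//   CHECKCUDAERROR(cudaMalloc(&ptr, n_bytes)); // prints a message and exits on failure
//
// ctors.cpp and gasal_align.cu follow exactly this pattern.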
44 | enum Bool{ 45 | FALSE, 46 | TRUE 47 | }; 48 | 49 | enum data_source{ 50 | NONE, 51 | QUERY, 52 | TARGET, 53 | BOTH 54 | }; 55 | 56 | enum algo_type{ 57 | UNKNOWN, 58 | GLOBAL, 59 | SEMI_GLOBAL, 60 | LOCAL, 61 | MICROLOCAL, 62 | BANDED, 63 | KSW 64 | }; 65 | 66 | enum operation_on_seq{ 67 | FORWARD_NATURAL, 68 | REVERSE_NATURAL, 69 | FORWARD_COMPLEMENT, 70 | REVERSE_COMPLEMENT, 71 | }; 72 | 73 | // data structure of linked list to allow extension of memory on host side. 74 | struct host_batch{ 75 | uint8_t *data; 76 | uint32_t page_size; 77 | uint32_t data_size; 78 | uint32_t offset; 79 | int is_locked; 80 | struct host_batch* next; 81 | }; 82 | typedef struct host_batch host_batch_t; 83 | 84 | // Data structure to hold results. Can be instantiated for host or device memory (see res.cpp) 85 | struct gasal_res{ 86 | int32_t *aln_score; 87 | int32_t *query_batch_end; 88 | int32_t *target_batch_end; 89 | int32_t *query_batch_start; 90 | int32_t *target_batch_start; 91 | uint8_t *cigar; 92 | uint32_t *n_cigar_ops; 93 | }; 94 | typedef struct gasal_res gasal_res_t; 95 | 96 | //stream data 97 | typedef struct { 98 | uint8_t *unpacked_query_batch; 99 | uint8_t *unpacked_target_batch; 100 | uint32_t *packed_query_batch; 101 | uint32_t *packed_target_batch; 102 | uint32_t *query_batch_offsets; 103 | uint32_t *target_batch_offsets; 104 | uint32_t *query_batch_lens; 105 | uint32_t *target_batch_lens; 106 | 107 | uint32_t *host_seed_scores; 108 | uint32_t *seed_scores; 109 | 110 | host_batch_t *extensible_host_unpacked_query_batch; 111 | host_batch_t *extensible_host_unpacked_target_batch; 112 | 113 | uint8_t *host_query_op; 114 | uint8_t *host_target_op; 115 | uint8_t *query_op; 116 | uint8_t *target_op; 117 | 118 | uint32_t *host_query_batch_offsets; 119 | uint32_t *host_target_batch_offsets; 120 | uint32_t *host_query_batch_lens; 121 | uint32_t *host_target_batch_lens; 122 | 123 | gasal_res_t *host_res; // the results that can be read on host - THE STRUCT IS ON HOST SIDE, ITS CONTENT IS ON HOST SIDE. 124 | gasal_res_t *device_cpy; // a struct that contains the pointers to the device side - THE STRUCT IS ON HOST SIDE, but the CONTENT is malloc'd on and points to the DEVICE SIDE 125 | gasal_res_t *device_res; // the results that are written on device - THE STRUCT IS ON DEVICE SIDE, ITS CONTENT POINTS TO THE DEVICE SIDE. 126 | 127 | gasal_res_t *host_res_second; 128 | gasal_res_t *device_res_second; 129 | gasal_res_t *device_cpy_second; 130 | 131 | uint32_t gpu_max_query_batch_bytes; 132 | uint32_t gpu_max_target_batch_bytes; 133 | 134 | uint32_t host_max_query_batch_bytes; 135 | uint32_t host_max_target_batch_bytes; 136 | 137 | uint32_t gpu_max_n_alns; 138 | uint32_t host_max_n_alns; 139 | uint32_t current_n_alns; 140 | 141 | uint64_t packed_tb_matrix_size; 142 | uint4 *packed_tb_matrices; 143 | 144 | //for AGAThA 145 | int32_t slice_width; 146 | uint32_t maximum_sequence_length; 147 | short2 *global_buffer; 148 | short2 *host_buffer; 149 | 150 | 151 | cudaStream_t str; 152 | int is_free; 153 | int id; //this can be useful in cases where a gasal_gpu_storage only contains PARTS of an alignment (like a seed-extension...), to gather results. 
154 | 
155 | } gasal_gpu_storage_t;
156 | 
157 | //vector of streams
158 | typedef struct {
159 |     int n;
160 |     gasal_gpu_storage_t *a;
161 | }gasal_gpu_storage_v;
162 | 
163 | 
164 | //match/mismatch and gap penalties
165 | typedef struct{
166 |     int32_t match;
167 |     int32_t mismatch;
168 |     int32_t gap_open;
169 |     int32_t gap_extend;
170 |     int32_t slice_width;
171 |     int32_t z_threshold;
172 |     int32_t band_width;
173 | } gasal_subst_scores;
174 | 
175 | 
176 | #endif
177 | 
--------------------------------------------------------------------------------
/AGAThA/src/gasal_align.cu:
--------------------------------------------------------------------------------
  1 | #include "gasal.h"
  2 | #include "args_parser.h"
  3 | #include "res.h"
  4 | #include "gasal_align.h"
  5 | #include "gasal_kernels.h"
  6 | #include "host_batch.h"
  7 | #include <algorithm>
  8 | #include <cmath>
  9 | 
 10 | inline void agatha_kernel_launcher(int32_t kernel_block_num, int32_t kernel_thread_num, gasal_gpu_storage_t *gpu_storage, int32_t actual_n_alns)
 11 | {
 12 | 
 13 |     /*Sort for Uneven Bucketing*/
 14 |     agatha_sort<<<kernel_block_num, kernel_thread_num, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, actual_n_alns, gpu_storage->maximum_sequence_length, gpu_storage->global_buffer);
 15 |     cudaMemcpyAsync((void*)(gpu_storage->host_buffer), (const void*)(gpu_storage->global_buffer+kernel_block_num*(kernel_thread_num/8)*(gpu_storage->maximum_sequence_length)*3), actual_n_alns * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpu_storage->str);
 16 |     cudaStreamSynchronize(gpu_storage->str);
 17 |     std::sort(gpu_storage->host_buffer, gpu_storage->host_buffer+actual_n_alns, [](short2 a, short2 b){ return a.x < b.x; });
 18 |     cudaMemcpyAsync((void*)(gpu_storage->global_buffer+kernel_block_num*(kernel_thread_num/8)*(gpu_storage->maximum_sequence_length)*3), (const void*)(gpu_storage->host_buffer), actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str);
 19 | 
 20 |     agatha_kernel<<<kernel_block_num, kernel_thread_num, (kernel_thread_num/32)*((32*(8*(gpu_storage->slice_width+1)))+28)*sizeof(int32_t), gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, gpu_storage->device_res_second, gpu_storage->packed_tb_matrices, actual_n_alns, gpu_storage->maximum_sequence_length, gpu_storage->global_buffer);
 21 | 
 22 | 
 23 | }
 24 | 
 25 | 
 26 | //GASAL2 asynchronous (a.k.a non-blocking) alignment function
 27 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params) {
 28 | 
 29 |     int32_t kernel_block_num = params->kernel_block_num;
 30 |     int32_t kernel_thread_num = params->kernel_thread_num;
 31 | 
 32 |     cudaError_t err;
 33 |     if (actual_n_alns <= 0) {
 34 |         fprintf(stderr, "[GASAL ERROR:] actual_n_alns <= 0\n");
 35 |         exit(EXIT_FAILURE);
 36 |     }
 37 |     if (actual_query_batch_bytes <= 0) {
 38 |         fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes <= 0\n");
 39 |         exit(EXIT_FAILURE);
 40 |     }
 41 |     if (actual_target_batch_bytes <= 0) {
 42 |         fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes <= 0\n");
 43 |         exit(EXIT_FAILURE);
 44 |     }
 45 | 
 46 |     if (actual_query_batch_bytes % 8) {
 47 |         fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes=%d is not a multiple of 8\n", actual_query_batch_bytes);
 48 |         exit(EXIT_FAILURE);
 49 |     }
 50 |     if (actual_target_batch_bytes % 8) {
 51 | 
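    // Note on agatha_kernel_launcher above: both kernels use the CUDA
    // <<<grid, block, shared_mem, stream>>> launch form. agatha_sort needs no
    // dynamic shared memory, while agatha_kernel requests
    //   (kernel_thread_num/32) * ((32 * (8*(slice_width+1))) + 28) * sizeof(int32_t)
    // bytes, i.e. per warp: 8*(slice_width+1) int32 cells for each of the 32
    // lanes plus 28 extra int32 words. The matching cudaFuncSetAttribute
    // (MaxDynamicSharedMemorySize) call appears below, before the launch.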
fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes=%d is not a multiple of 8\n", actual_target_batch_bytes); 52 | exit(EXIT_FAILURE); 53 | } 54 | 55 | if (actual_query_batch_bytes > gpu_storage->host_max_query_batch_bytes) { 56 | fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes(%d) > host_max_query_batch_bytes(%d)\n", actual_query_batch_bytes, gpu_storage->host_max_query_batch_bytes); 57 | exit(EXIT_FAILURE); 58 | } 59 | 60 | if (actual_target_batch_bytes > gpu_storage->host_max_target_batch_bytes) { 61 | fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes(%d) > host_max_target_batch_bytes(%d)\n", actual_target_batch_bytes, gpu_storage->host_max_target_batch_bytes); 62 | exit(EXIT_FAILURE); 63 | } 64 | 65 | if (actual_n_alns > gpu_storage->host_max_n_alns) { 66 | fprintf(stderr, "[GASAL ERROR:] actual_n_alns(%d) > host_max_n_alns(%d)\n", actual_n_alns, gpu_storage->host_max_n_alns); 67 | exit(EXIT_FAILURE); 68 | } 69 | 70 | //--------------if pre-allocated memory is less, allocate more-------------------------- 71 | if (gpu_storage->gpu_max_query_batch_bytes < actual_query_batch_bytes) { 72 | 73 | int i = 2; 74 | while ( (gpu_storage->gpu_max_query_batch_bytes * i) < actual_query_batch_bytes) i++; 75 | 76 | //fprintf(stderr, "[GASAL WARNING:] actual_query_batch_bytes(%d) > Allocated GPU memory (gpu_max_query_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_query_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes*i, gpu_storage->gpu_max_query_batch_bytes*i); 77 | 78 | gpu_storage->gpu_max_query_batch_bytes = gpu_storage->gpu_max_query_batch_bytes * i; 79 | 80 | if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch)); 81 | if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch)); 82 | 83 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_storage->gpu_max_query_batch_bytes * sizeof(uint8_t))); 84 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_storage->gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 85 | } 86 | 87 | if (gpu_storage->gpu_max_target_batch_bytes < actual_target_batch_bytes) { 88 | 89 | int i = 2; 90 | while ( (gpu_storage->gpu_max_target_batch_bytes * i) < actual_target_batch_bytes) i++; 91 | 92 | //fprintf(stderr, "[GASAL WARNING:] actual_target_batch_bytes(%d) > Allocated GPU memory (gpu_max_target_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_target_batch_bytes=%d). 
Performance may be lost if this is repeated many times.\n", actual_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes*i, gpu_storage->gpu_max_target_batch_bytes*i); 93 | 94 | gpu_storage->gpu_max_target_batch_bytes = gpu_storage->gpu_max_target_batch_bytes * i; 95 | 96 | if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch)); 97 | if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch)); 98 | 99 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_storage->gpu_max_target_batch_bytes * sizeof(uint8_t))); 100 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_storage->gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 101 | 102 | 103 | } 104 | 105 | if (gpu_storage->gpu_max_n_alns < actual_n_alns) { 106 | 107 | int i = 2; 108 | while ( (gpu_storage->gpu_max_n_alns * i) < actual_n_alns) i++; 109 | 110 | //fprintf(stderr, "[GASAL WARNING:] actual_n_alns(%d) > gpu_max_n_alns(%d). Therefore, allocating memory for %d alignments on GPU (gpu_max_n_alns=%d). Performance may be lost if this is repeated many times.\n", actual_n_alns, gpu_storage->gpu_max_n_alns, gpu_storage->gpu_max_n_alns*i, gpu_storage->gpu_max_n_alns*i); 111 | 112 | gpu_storage->gpu_max_n_alns = gpu_storage->gpu_max_n_alns * i; 113 | 114 | if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets)); 115 | if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets)); 116 | if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens)); 117 | if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens)); 118 | 119 | if (gpu_storage->seed_scores != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->seed_scores)); 120 | 121 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 122 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 123 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 124 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 125 | 126 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->seed_scores), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 127 | 128 | gasal_res_destroy_device(gpu_storage->device_res, gpu_storage->device_cpy); 129 | gpu_storage->device_cpy = gasal_res_new_device_cpy(gpu_storage->gpu_max_n_alns, params); 130 | gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy); 131 | 132 | 133 | } 134 | //------------------------------------------ 135 | 136 | //------------------------launch copying of sequence batches from CPU to GPU--------------------------- 137 | 138 | // here you can track the evolution of your data structure processing with the printer: gasal_host_batch_printall(current); 139 | 140 | host_batch_t *current = gpu_storage->extensible_host_unpacked_query_batch; 141 | while (current != NULL) 142 | { 143 | //gasal_host_batch_printall(current); 144 | CHECKCUDAERROR(cudaMemcpyAsync( &(gpu_storage->unpacked_query_batch[current->offset]), 145 | current->data, 146 | current->data_size, 147 | cudaMemcpyHostToDevice, 148 | gpu_storage->str ) ); 149 | 150 | current = current->next; 151 | } 152 | 153 | current 
= gpu_storage->extensible_host_unpacked_target_batch;
154 | while (current != NULL)
155 | {
156 | CHECKCUDAERROR(cudaMemcpyAsync( &(gpu_storage->unpacked_target_batch[current->offset]),
157 | current->data,
158 | current->data_size,
159 | cudaMemcpyHostToDevice,
160 | gpu_storage->str ) );
161 |
162 | current = current->next;
163 | }
164 |
165 | //-----------------------------------------------------------------------------------------------------------
166 |
167 | int query_batch_tasks_per_thread = (int)ceil((double)actual_query_batch_bytes/(8*kernel_thread_num*kernel_block_num));
168 | int target_batch_tasks_per_thread = (int)ceil((double)actual_target_batch_bytes/(8*kernel_thread_num*kernel_block_num));
169 |
170 |
171 | //-------------------------------------------launch packing kernel
172 |
173 |
174 | if (!(params->isPacked))
175 | {
176 | gasal_pack_kernel<<<kernel_block_num, kernel_thread_num, 0, gpu_storage->str>>>((uint32_t*)(gpu_storage->unpacked_query_batch),
177 | (uint32_t*)(gpu_storage->unpacked_target_batch), gpu_storage->packed_query_batch, gpu_storage->packed_target_batch,
178 | query_batch_tasks_per_thread, target_batch_tasks_per_thread, actual_query_batch_bytes/4, actual_target_batch_bytes/4);
179 | cudaError_t pack_kernel_err = cudaGetLastError();
180 | if ( cudaSuccess != pack_kernel_err )
181 | {
182 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(pack_kernel_err), pack_kernel_err, __LINE__, __FILE__);
183 | exit(EXIT_FAILURE);
184 | }
185 | }
186 |
187 |
188 | // We could reverse-complement before packing, but we would get 2x more read-writes to memory.
189 |
190 | //----------------------launch copying of sequence offsets and lengths from CPU to GPU--------------------------------------
191 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_batch_lens, gpu_storage->host_query_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
192 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_batch_lens, gpu_storage->host_target_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
193 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_batch_offsets, gpu_storage->host_query_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
194 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_batch_offsets, gpu_storage->host_target_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
195 |
196 | //--------------------------------------------------------------------------------------------------------------------------
197 |
198 | //----------------------launch copying of sequence operations (reverse/complement) from CPU to GPU--------------------------
199 | if (params->isReverseComplement)
200 | {
201 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_op, gpu_storage->host_query_op, actual_n_alns * sizeof(uint8_t), cudaMemcpyHostToDevice, gpu_storage->str));
202 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_op, gpu_storage->host_target_op, actual_n_alns * sizeof(uint8_t), cudaMemcpyHostToDevice, gpu_storage->str));
203 | //--------------------------------------launch reverse-complement kernel------------------------------------------------------
204 | gasal_reversecomplement_kernel<<<kernel_block_num, kernel_thread_num, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,
205 | gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->query_op,
gpu_storage->target_op, actual_n_alns);
206 | cudaError_t reversecomplement_kernel_err = cudaGetLastError();
207 | if ( cudaSuccess != reversecomplement_kernel_err )
208 | {
209 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(reversecomplement_kernel_err), reversecomplement_kernel_err, __LINE__, __FILE__);
210 | exit(EXIT_FAILURE);
211 | }
212 |
213 | }
214 |
215 | //--------------------------------------launch alignment kernels--------------------------------------------------------------
216 |
217 | cudaFuncSetAttribute(agatha_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, (kernel_thread_num/32)*((32*(8*(gpu_storage->slice_width+1)))+28)*sizeof(int32_t));
218 |
219 | if (params->print_out) {
220 | float mill = 0;
221 | cudaEvent_t begin, end;
222 | cudaEventCreate(&begin);
223 | cudaEventCreate(&end);
224 | cudaEventRecord(begin);
225 |
226 | agatha_kernel_launcher(params->kernel_block_num, params->kernel_thread_num, gpu_storage, actual_n_alns);
227 |
228 | cudaDeviceSynchronize();
229 | cudaEventRecord(end);
230 | cudaEventSynchronize(end);
231 |
232 | cudaEventElapsedTime(&mill, begin, end);
233 | params->raw_file << mill << std::endl;
234 |
235 | cudaEventDestroy(begin);
236 | cudaEventDestroy(end);
237 | } else {
238 | agatha_kernel_launcher(params->kernel_block_num, params->kernel_thread_num, gpu_storage, actual_n_alns);
239 | }
240 |
241 |
242 |
243 |
244 | //-----------------------------------------------------------------------------------------------------------------------
245 | cudaError_t aln_kernel_err = cudaGetLastError();
246 | if ( cudaSuccess != aln_kernel_err )
247 | {
248 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err, __LINE__, __FILE__);
249 | exit(EXIT_FAILURE);
250 | }
251 |
252 | //------------------------launch the copying of alignment results from GPU to CPU--------------------------------------
253 | if (gpu_storage->host_res->aln_score != NULL && gpu_storage->device_cpy->aln_score != NULL)
254 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->aln_score, gpu_storage->device_cpy->aln_score, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
255 |
256 | if (gpu_storage->host_res->query_batch_start != NULL && gpu_storage->device_cpy->query_batch_start != NULL)
257 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->query_batch_start, gpu_storage->device_cpy->query_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
258 |
259 | if (gpu_storage->host_res->target_batch_start != NULL && gpu_storage->device_cpy->target_batch_start != NULL)
260 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->target_batch_start, gpu_storage->device_cpy->target_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
261 |
262 | if (gpu_storage->host_res->query_batch_end != NULL && gpu_storage->device_cpy->query_batch_end != NULL)
263 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->query_batch_end, gpu_storage->device_cpy->query_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
264 |
265 | if (gpu_storage->host_res->target_batch_end != NULL && gpu_storage->device_cpy->target_batch_end != NULL)
266 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->target_batch_end, gpu_storage->device_cpy->target_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost,
gpu_storage->str));
267 |
268 | //-----------------------------------------------------------------------------------------------------------------------
269 |
270 |
271 |
272 | gpu_storage->is_free = 0; //set the availability of current stream to false
273 | }
274 |
275 |
276 | int gasal_is_aln_async_done(gasal_gpu_storage_t *gpu_storage)
277 | {
278 | cudaError_t err;
279 | if(gpu_storage->is_free == 1) return -2;//if no work is launched in this stream, return -2
280 | err = cudaStreamQuery(gpu_storage->str);//check to see if the stream is finished
281 | if (err != cudaSuccess ) {
282 | if (err == cudaErrorNotReady) return -1;
283 | else{
284 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err, __LINE__, __FILE__);
285 | exit(EXIT_FAILURE);
286 | }
287 | }
288 | gasal_host_batch_reset(gpu_storage);
289 | gpu_storage->is_free = 1;
290 | gpu_storage->current_n_alns = 0;
291 | return 0;
292 | }
293 |
294 |
295 | void gasal_copy_subst_scores(gasal_subst_scores *subst){
296 |
297 | cudaError_t err;
298 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapO, &(subst->gap_open), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
299 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapExtend, &(subst->gap_extend), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
300 | int32_t gapoe = (subst->gap_open + subst->gap_extend);
301 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapOE, &(gapoe), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
302 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaMatchScore, &(subst->match), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
303 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaMismatchScore, &(subst->mismatch), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
304 | // For AGAThA
305 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaSliceWidth, &(subst->slice_width), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
306 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaZThreshold, &(subst->z_threshold), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
307 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaBandWidth, &(subst->band_width), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
308 | return;
309 | }
310 |
311 |
--------------------------------------------------------------------------------
/AGAThA/src/gasal_align.h:
--------------------------------------------------------------------------------
1 | #ifndef __GASAL_ALIGN_H__
2 | #define __GASAL_ALIGN_H__
3 |
4 | void gasal_copy_subst_scores(gasal_subst_scores *subst);
5 |
6 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params);
7 |
8 | void agatha_kernel_launcher(int32_t kernel_block_num, int32_t kernel_thread_num, gasal_gpu_storage_t *gpu_storage, int32_t actual_n_alns);
9 |
10 | int gasal_is_aln_async_done(gasal_gpu_storage_t *gpu_storage);
11 |
12 | #endif
13 |
--------------------------------------------------------------------------------
/AGAThA/src/gasal_header.h:
--------------------------------------------------------------------------------
1 | #ifndef __GASAL_HEADER_H__
2 | #define __GASAL_HEADER_H__
3 |
4 |
5 | #include "gasal.h" // include cstdlib, cstdint
6 | #include "args_parser.h" // include iostream, string, fstream
7 | #include "gasal_align.h"
8 | #include "host_batch.h" // include cstdio, cstring
9 | #include "ctors.h"
10 | #include "interfaces.h"
11 |
12 |
13 |
14 |
15 | #endif
16 |
--------------------------------------------------------------------------------
/AGAThA/src/gasal_kernels.h:
--------------------------------------------------------------------------------
1 | #ifndef __GASAL_KERNELS_H__
2 | #define __GASAL_KERNELS_H__
3 |
4 |
5 | // Template meta-programming type construction from int values.
6 | // This allows kernel code to be cut down at compilation time.
7 |
8 | template <int Val>
9 | struct Int2Type
10 | {
11 | typedef enum {val_ = Val} val__;
12 | };
13 |
14 | template <typename T1, typename T2>
15 | struct SameType
16 | {
17 | enum { result = 0 };
18 | };
19 |
20 | template <typename T>
21 | struct SameType<T, T>
22 | {
23 | enum { result = 1 };
24 | };
25 |
26 | #define SAMETYPE(a, b) (SameType<a, b>::result)
27 |
28 |
29 | __constant__ int32_t _cudaGapO; /*gap open penalty*/
30 | __constant__ int32_t _cudaGapOE; /*sum of gap open and extension penalties*/
31 | __constant__ int32_t _cudaGapExtend; /*gap extension penalty*/
32 | __constant__ int32_t _cudaMatchScore; /*score for a match*/
33 | __constant__ int32_t _cudaMismatchScore; /*penalty for a mismatch*/
34 | __constant__ int32_t _cudaSliceWidth; /*(AGAThA) slice width*/
35 | __constant__ int32_t _cudaZThreshold; /*(AGAThA) zdrop threshold*/
36 | __constant__ int32_t _cudaBandWidth; /*(AGAThA) band width*/
37 |
38 | #define MINUS_INF SHRT_MIN
39 | #define MINUS_INF2 SHRT_MIN/2
40 |
41 | #define N_VALUE (N_CODE & 0xF)
42 |
43 | #ifdef N_PENALTY
44 | #define DEV_GET_SUB_SCORE_LOCAL(score, rbase, gbase) \
45 | score = (rbase == gbase) ? _cudaMatchScore : -_cudaMismatchScore;\
46 | score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? -N_PENALTY : score;\
47 |
48 | #define DEV_GET_SUB_SCORE_GLOBAL(score, rbase, gbase) \
49 | score = (rbase == gbase) ? _cudaMatchScore : -_cudaMismatchScore;\
50 | score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? -N_PENALTY : score;\
51 |
52 | #else
53 | #define DEV_GET_SUB_SCORE_LOCAL(score, rbase, gbase) \
54 | score = (rbase == gbase) ? _cudaMatchScore : -_cudaMismatchScore;\
55 | score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? 0 : score;\
56 |
57 | #define DEV_GET_SUB_SCORE_GLOBAL(score, rbase, gbase) \
58 | score = (rbase == gbase) ? _cudaMatchScore : -_cudaMismatchScore;\
59 |
60 | #endif
61 |
62 | #define MAX(a,b) ((a)>(b)?(a):(b))
63 | #define MIN(a,b) ((a)<(b)?(a):(b))
64 |
65 |
66 | #define FIND_MAX(curr, gidx) \
67 | maxXY_y = (maxHH < curr) ? gidx : maxXY_y;\
68 | maxHH = (maxHH < curr) ? curr : maxHH;
69 |
70 |
71 | // Kernel files
72 |
73 | #include "kernels/pack_rc_seqs.h"
74 |
75 | #include "kernels/agatha_kernel.h"
76 |
77 | #endif
78 |
--------------------------------------------------------------------------------
/AGAThA/src/host_batch.cpp:
--------------------------------------------------------------------------------
1 | #include "gasal.h"
2 | #include "args_parser.h"
3 | #include "interfaces.h"
4 | #include "host_batch.h"
5 |
6 |
7 |
8 |
9 | // Functions for host batch handling.
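// In short: a host batch is a linked list of pinned pages (cudaHostAlloc).
// Filling starts in the first page; when a page cannot hold the next
// sequence, it is locked and a page of twice the size is appended.
// Sequences are padded with N_CODE so each entry spans a multiple of
// 8 bases, matching the 8-bases-per-uint32 packing used on the GPU.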
10 |
11 | host_batch_t *gasal_host_batch_new(uint32_t batch_bytes, uint32_t offset)
12 | {
13 | cudaError_t err;
14 | host_batch_t *res = (host_batch_t *)calloc(1, sizeof(host_batch_t));
15 | CHECKCUDAERROR(cudaHostAlloc(&(res->data), batch_bytes*sizeof(uint8_t), cudaHostAllocDefault));
16 | res->page_size = batch_bytes;
17 | res->data_size = 0;
18 | res->is_locked = 0;
19 | res->offset = offset;
20 | res->next = NULL;
21 | return res;
22 | }
23 |
24 | void gasal_host_batch_destroy(host_batch_t *res)
25 | {
26 | cudaError_t err;
27 | if (res==NULL)
28 | {
29 | fprintf(stderr, "[GASAL ERROR] Trying to free a NULL pointer\n");
30 | exit(1);
31 | }
32 | // recursively destroy the whole linked list
33 | if (res->next != NULL)
34 | gasal_host_batch_destroy(res->next);
35 | if (res->data != NULL)
36 | {
37 | CHECKCUDAERROR(cudaFreeHost(res->data));
38 | }
39 |
40 | free(res);
41 | }
42 |
43 | host_batch_t *gasal_host_batch_getlast(host_batch_t *arg)
44 | {
45 | return (arg->next == NULL ? arg : gasal_host_batch_getlast(arg->next) );
46 |
47 | }
48 |
49 | void gasal_host_batch_reset(gasal_gpu_storage_t *gpu_storage)
50 | {
51 | // reset all batch idx and data occupation
52 | host_batch_t *cur_page = NULL;
53 | for(int i = 0; i < 2; i++) {
54 |
55 | switch(i) {
56 | case 0:
57 | cur_page = (gpu_storage->extensible_host_unpacked_query_batch);
58 | break;
59 | case 1:
60 | cur_page = (gpu_storage->extensible_host_unpacked_target_batch);
61 | break;
62 | default:
63 | break;
64 | }
65 | while(cur_page != NULL)
66 | {
67 | cur_page->data_size = 0;
68 | cur_page->offset = 0;
69 | cur_page->is_locked = 0;
70 | cur_page = cur_page->next;
71 | }
72 | }
73 | //fprintf(stderr, "[GASAL INFO] Batch reset.\n");
74 |
75 | }
76 |
77 |
78 | // TODO: make a template... now that you started to go the C++/template way, just stick to it.
79 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC)
80 | {
81 | // since query and target are very symmetric here, we use pointers to route the data where it has to go,
82 | // while keeping the actual memory management 'source-agnostic'.
83 |
84 | host_batch_t *cur_page = NULL;
85 | uint32_t *p_batch_bytes = NULL;
86 |
87 | switch(SRC) {
88 | case QUERY:
89 | cur_page = (gpu_storage->extensible_host_unpacked_query_batch);
90 | p_batch_bytes = &(gpu_storage->host_max_query_batch_bytes);
91 | break;
92 | case TARGET:
93 | cur_page = (gpu_storage->extensible_host_unpacked_target_batch);
94 | p_batch_bytes = &(gpu_storage->host_max_target_batch_bytes);
95 | break;
96 | default:
97 | break;
98 | }
99 |
100 | int nbr_N = 0;
101 | while((size+nbr_N)%8)
102 | nbr_N++;
103 |
104 | while(cur_page->is_locked)
105 | cur_page = cur_page->next;
106 |
107 | if (cur_page->next == NULL && cur_page->page_size - cur_page->data_size < size + nbr_N)
108 | {
109 | /*
110 | fprintf(stderr,"[GASAL WARNING:] Trying to write %d bytes while only %d remain (%s) (block size %d, filled %d bytes).\n Allocating a new block of size %d, total size available reaches %d. Doing this repeatedly slows down the execution.\n",
111 | size + nbr_N,
112 | cur_page->page_size - cur_page->data_size,
113 | (SRC == QUERY ?
"query":"target"), 114 | cur_page->page_size, 115 | cur_page->data_size, 116 | cur_page->page_size * 2, 117 | *p_batch_bytes + cur_page->page_size * 2); 118 | */ 119 | host_batch_t *res = gasal_host_batch_new(cur_page->page_size * 2, cur_page->offset + cur_page->data_size); 120 | cur_page->next = res; 121 | cur_page->is_locked = 1; 122 | *p_batch_bytes = *p_batch_bytes + cur_page->page_size * 2; 123 | 124 | cur_page = cur_page->next; 125 | //fprintf(stderr, "CREATED: "); gasal_host_batch_print(cur_page); 126 | } 127 | 128 | if (cur_page->next != NULL && cur_page->page_size - cur_page->data_size < size + nbr_N) 129 | { 130 | // re-write offset for the next page to correspond to what has been filled on the current page. 131 | cur_page->next->offset = cur_page->offset + cur_page->data_size; 132 | cur_page->is_locked = 1; 133 | // then, jump to next page 134 | cur_page = cur_page->next; 135 | } 136 | 137 | 138 | if (cur_page->page_size - cur_page->data_size >= size + nbr_N) 139 | { 140 | // fprintf(stderr, "FILL: "); gasal_host_batch_print(cur_page); 141 | memcpy(&(cur_page->data[idx - cur_page->offset]), data, size); 142 | 143 | for(int i = 0; i < nbr_N; i++) 144 | { 145 | cur_page->data[idx + size - cur_page->offset + i] = N_CODE; 146 | } 147 | idx = idx + size + nbr_N; 148 | 149 | cur_page->data_size += size + nbr_N; 150 | //is_done = 1; 151 | } 152 | 153 | return idx; 154 | } 155 | 156 | 157 | uint32_t gasal_host_batch_addbase(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char base, data_source SRC ) 158 | { 159 | return gasal_host_batch_add(gpu_storage, idx, &base, 1, SRC ); 160 | } 161 | 162 | 163 | uint32_t gasal_host_batch_add(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char *data, uint32_t size, data_source SRC ) 164 | { 165 | 166 | // since query and target are very symmetric here, we use pointers to route the data where it has to, 167 | // while keeping the actual memory management 'source-agnostic'. 168 | host_batch_t *cur_page = NULL; 169 | uint32_t *p_batch_bytes = NULL; 170 | 171 | 172 | switch(SRC) { 173 | case QUERY: 174 | cur_page = (gpu_storage->extensible_host_unpacked_query_batch); 175 | p_batch_bytes = &(gpu_storage->host_max_query_batch_bytes); 176 | break; 177 | case TARGET: 178 | cur_page = (gpu_storage->extensible_host_unpacked_target_batch); 179 | p_batch_bytes = &(gpu_storage->host_max_target_batch_bytes); 180 | break; 181 | default: 182 | break; 183 | } 184 | 185 | int is_done = 0; 186 | 187 | while (!is_done) 188 | { 189 | if (*p_batch_bytes >= idx + size && (cur_page->next == NULL || (cur_page->next->offset >= idx + size)) ) 190 | { 191 | 192 | memcpy(&(cur_page->data[idx - cur_page->offset]), data, size); 193 | idx = idx + size; 194 | is_done = 1; 195 | 196 | } else if ((*p_batch_bytes >= idx + size) && (cur_page->next != NULL) && (cur_page->next->offset < idx + size)) { 197 | 198 | cur_page = cur_page->next; 199 | 200 | } else { 201 | /* 202 | fprintf(stderr,"[GASAL WARNING:] Trying to write %d bytes at position %d on host memory (%s) while only %d bytes are available. Therefore, allocating %d bytes more on CPU. Repeating this many times can provoke a degradation of performance.\n", 203 | size, 204 | idx, 205 | (SRC == QUERY ? "query":"target"), 206 | *p_batch_bytes, 207 | *p_batch_bytes * 2); 208 | */ 209 | 210 | *p_batch_bytes += *p_batch_bytes; 211 | 212 | // corner case: if we allocated less than a single sequence length to begin with... it shouldn't be allowed actually, but at least it's caught here. 
213 | while (*p_batch_bytes < size)
214 | *p_batch_bytes += *p_batch_bytes;
215 |
216 | host_batch_t *res = gasal_host_batch_new(*p_batch_bytes, idx);
217 |
218 | cur_page->next = res;
219 |
220 | cur_page = cur_page->next;
221 | }
222 | }
223 | //gasal_host_batch_printall(gasal_host_batch_getlast(cur_page));
224 | return idx;
225 | }
226 |
227 |
228 |
229 | // this printer displays the whole sequence. It is heavy and shouldn't be called when you have more than a couple of sequences.
230 | void gasal_host_batch_print(host_batch_t *res)
231 | {
232 | fprintf(stderr, "[GASAL PRINT] Page data: offset=%d, next_offset=%d, data size=%d, page size=%d\n",
233 | res->offset, (res->next != NULL? res->next->offset : -1), res->data_size, res->page_size);
234 | }
235 |
236 | // this printer allows to see the linked list easily.
237 | void gasal_host_batch_printall(host_batch_t *res)
238 | {
239 | fprintf(stderr, "[GASAL PRINT] Page data: offset=%d, next_offset=%d, data size=%d, page size=%d\n",
240 | res->offset, (res->next != NULL? res->next->offset : -1), res->data_size, res->page_size);
241 | if (res->next != NULL)
242 | {
243 | fprintf(stderr, "+--->");
244 | gasal_host_batch_printall(res->next);
245 | }
246 | }
--------------------------------------------------------------------------------
/AGAThA/src/host_batch.h:
--------------------------------------------------------------------------------
1 | #ifndef __HOST_BACTH_H__
2 | #define __HOST_BACTH_H__
3 |
4 | #include <stdio.h>
5 | #include <stdint.h>
6 | #include <string.h> // useful for memcpy, strlen
7 |
8 | // host data structure methods
9 | host_batch_t *gasal_host_batch_new(uint32_t batch_bytes, uint32_t offset); // constructor
10 | void gasal_host_batch_destroy(host_batch_t *res); // destructor
11 | host_batch_t *gasal_host_batch_getlast(host_batch_t *arg); // get last item of chain
12 | void gasal_host_batch_reset(gasal_gpu_storage_t *gpu_storage); // reset all pages of a storage
13 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC); // fill the data
14 | uint32_t gasal_host_batch_add(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char *data, uint32_t size, data_source SRC );
15 | uint32_t gasal_host_batch_addbase(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char base, data_source SRC );
16 | void gasal_host_batch_print(host_batch_t *res); // printer
17 | void gasal_host_batch_printall(host_batch_t *res); // printer for the whole linked list
18 |
19 |
20 | #endif
21 |
--------------------------------------------------------------------------------
/AGAThA/src/interfaces.cpp:
--------------------------------------------------------------------------------
1 | #include "gasal.h"
2 | #include "args_parser.h"
3 | #include "interfaces.h"
4 | #include "res.h"
5 |
6 |
7 | // Function for general resizing of pinned host buffers.
8 | template <typename T>
9 | T* cudaHostRealloc(void *source, int new_size, int old_size)
10 | {
11 | cudaError_t err;
12 | T* destination = NULL;
13 | if (new_size < old_size)
14 | {
15 | fprintf(stderr, "[GASAL ERROR] cudaHostRealloc: invalid sizes. New size < old size (%d < %d)", new_size, old_size);
16 | exit(EXIT_FAILURE);
17 | }
18 | CHECKCUDAERROR(cudaHostAlloc(&destination, new_size * sizeof(T), cudaHostAllocMapped));
19 | //fprintf(stderr, "\ndest=%p\tsrc=%p", destination, source);
20 | CHECKCUDAERROR(cudaMemcpy(destination, source, old_size * sizeof(T), cudaMemcpyHostToHost));
21 | CHECKCUDAERROR(cudaFreeHost(source));
22 | return destination;
23 | };
24 |
25 | // Realloc new fields when more alignments are added.
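// Note: cudaHostRealloc<T> needs its explicit template argument, since T
// cannot be deduced from the void* source. A representative call
// (illustrative names only):
//   host_lens = cudaHostRealloc<uint32_t>((void*)host_lens, new_max_alns, old_max_alns);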
26 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params)
27 | {
28 | /* // Don't reallocate the extensible batches. They're extensible.
29 | gpu_storage->extensible_host_unpacked_query_batch = gasal_host_batch_new(host_max_query_batch_bytes, 0);
30 | gpu_storage->extensible_host_unpacked_target_batch = gasal_host_batch_new(host_max_target_batch_bytes, 0);
31 | */
32 | /* // don't realloc gpu-sided batches as they will be taken care of before aligning.
33 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t)));
34 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t)));
35 | */
36 |
37 | fprintf(stderr, "[GASAL WARNING] Resizing gpu_storage from %d sequences to %d sequences... ", gpu_storage->host_max_n_alns, new_max_alns);
38 | // don't care about realloc'ing gpu-sided fields as they will be taken care of before aligning.
39 |
40 | gpu_storage->host_query_op = cudaHostRealloc<uint8_t>((void*) gpu_storage->host_query_op, new_max_alns, gpu_storage->host_max_n_alns);
41 | gpu_storage->host_target_op = cudaHostRealloc<uint8_t>((void*) gpu_storage->host_target_op, new_max_alns, gpu_storage->host_max_n_alns);
42 |
43 | gpu_storage->host_query_batch_lens = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_query_batch_lens, new_max_alns, gpu_storage->host_max_n_alns);
44 | gpu_storage->host_target_batch_lens = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_target_batch_lens, new_max_alns, gpu_storage->host_max_n_alns);
45 | //fprintf(stderr, "_lens done ");
46 |
47 | gpu_storage->host_query_batch_offsets = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_query_batch_offsets, new_max_alns, gpu_storage->host_max_n_alns);
48 | gpu_storage->host_target_batch_offsets = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_target_batch_offsets, new_max_alns, gpu_storage->host_max_n_alns);
49 | //fprintf(stderr, "_offsets done ");
50 |
51 | gasal_res_destroy_host(gpu_storage->host_res);
52 | gpu_storage->host_res = gasal_res_new_host(new_max_alns, params);
53 | gpu_storage->device_cpy = gasal_res_new_device_cpy(new_max_alns, params);
54 | gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy);
55 |
56 | gpu_storage->host_res_second = NULL;
57 | gpu_storage->device_cpy_second = NULL;
58 | gpu_storage->device_res_second = NULL;
59 |
60 |
61 | //fprintf(stderr, "_res done ");
62 |
63 | gpu_storage->host_max_n_alns = new_max_alns;
64 | //gpu_storage->gpu_max_n_alns = gpu_max_n_alns;
65 | fprintf(stderr, " done. This can harm performance.\n");
66 | }
67 |
68 | // operation (Reverse/complement) filler.
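// Each op byte encodes one sequence's orientation: 0 = forward/natural,
// bit 0 (0x01) = reverse, bit 1 (0x02) = complement, so 0x03 = reverse
// complement (the bits are decoded in kernels/pack_rc_seqs.h).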
69 | void gasal_op_fill(gasal_gpu_storage_t *gpu_storage_t, uint8_t *data, uint32_t nbr_seqs_in_stream, data_source SRC)
70 | {
71 | uint8_t *host_op = NULL;
72 | switch(SRC)
73 | {
74 | case QUERY:
75 | host_op = (gpu_storage_t->host_query_op);
76 | break;
77 | case TARGET:
78 | host_op = (gpu_storage_t->host_target_op);
79 | break;
80 | default:
81 | break;
82 | }
83 | memcpy(host_op, data, nbr_seqs_in_stream);
84 | }
85 |
86 | void gasal_set_device(int gpu_select, bool isPrintingProp)
87 | {
88 | /*
89 | Select GPU
90 | */
91 | if (isPrintingProp)
92 | {
93 | int num_devices, device;
94 | cudaGetDeviceCount(&num_devices);
95 | fprintf(stderr, "Found %d GPUs\n", num_devices);
96 | if (gpu_select > num_devices-1)
97 | {
98 | fprintf(stderr, "Error: can't select device %d when only %d devices are available (range from 0 to %d)\n", gpu_select, num_devices, num_devices-1);
99 | exit(EXIT_FAILURE);
100 | }
101 | if (num_devices > 0) {
102 | cudaDeviceProp properties;
103 | for (device = 0; device < num_devices; device++) {
104 | cudaGetDeviceProperties(&properties, device);
105 | fprintf(stderr, "\tGPU %d: %s\n", device, properties.name);
106 | }
107 | cudaGetDeviceProperties(&properties, gpu_select);
108 | fprintf(stderr, "Selected device %d : %s\n", gpu_select, properties.name);
109 | cudaSetDevice(gpu_select);
110 | }
111 | } else {
112 | // silently select device
113 | cudaSetDevice(gpu_select);
114 | }
115 |
116 | }
--------------------------------------------------------------------------------
/AGAThA/src/interfaces.h:
--------------------------------------------------------------------------------
1 | #ifndef __GASAL_INTERFACES_H__
2 | #define __GASAL_INTERFACES_H__
3 |
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <stdint.h>
7 |
8 | // Resizer for the whole gpu_storage in terms of number of sequences
9 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params);
10 |
11 | // operation filler method (fills the op fields of gasal_gpu_storage_t)
12 | void gasal_op_fill(gasal_gpu_storage_t *gpu_storage_t, uint8_t *data, uint32_t nbr_seqs_in_stream, data_source SRC);
13 |
14 | void gasal_set_device(int gpu_select = 0, bool isPrintingProp = true);
15 | #endif
16 |
--------------------------------------------------------------------------------
/AGAThA/src/kernels/agatha_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef __AGATHA_KERNEL__
2 | #define __AGATHA_KERNEL__
3 |
4 |
5 | // This old core provides the same result as the current LOCAL core, but lacks some optimizations. Left for historical / comparative purposes.
6 | // Deprecated code from GASAL2 (left as reference)
7 | #define CORE_LOCAL_DEPRECATED_COMPUTE() \
8 | uint32_t rbase = (packed_ref_literal >> l) & 15;/*get a base from target_batch sequence */ \
9 | DEV_GET_SUB_SCORE_LOCAL(temp_score, qbase, rbase);/* check equality of qbase and rbase */ \
10 | f[m] = max(h[m]- _cudaGapOE, f[m] - _cudaGapExtend);/* whether to introduce or extend a gap in query_batch sequence */ \
11 | h[m] = p[m] + temp_score; /*score if qbase is aligned to rbase*/ \
12 | h[m] = max(h[m], f[m]); \
13 | h[m] = max(h[m], 0); \
14 | e = max(h[m - 1] - _cudaGapOE, e - _cudaGapExtend);/*whether to introduce or extend a gap in target_batch sequence */\
15 | h[m] = max(h[m], e); \
16 | max_ref_idx = (max_score < h[m]) ? ref_idx + (m-1) : max_ref_idx; \
17 | max_score = (max_score < h[m]) ?
h[m] : max_score; \ 18 | p[m] = h[m-1]; 19 | 20 | #define CORE_COMPUTE() \ 21 | uint32_t rbase = (packed_ref_literal >> l) & 15;\ 22 | DEV_GET_SUB_SCORE_GLOBAL(temp_score, qbase, rbase) \ 23 | temp_score += p[m]; \ 24 | h[m] = max(temp_score, f[m]); \ 25 | h[m] = max(h[m], e); \ 26 | f[m] = max(temp_score- _cudaGapOE, f[m] - _cudaGapExtend); \ 27 | e = max(temp_score- _cudaGapOE, e - _cudaGapExtend); \ 28 | p[m] = h[m-1]; \ 29 | diag_idx = ((ref_idx + m-1+query_idx)&(total_shm-1))<<5;\ 30 | antidiag_max[real_warp_id+diag_idx] = max(antidiag_max[real_warp_id+diag_idx], (h[m]<<16) +ref_idx+ m-1);\ 31 | 32 | #define CORE_COMPUTE_BOUNDARY() \ 33 | if (query_idx + _cudaBandWidth < ref_idx + m-1 || query_idx - _cudaBandWidth > ref_idx + m-1) { \ 34 | p[m] = h[m-1]; \ 35 | } else { \ 36 | uint32_t rbase = (packed_ref_literal >> l) & 15;\ 37 | DEV_GET_SUB_SCORE_GLOBAL(temp_score, qbase, rbase) \ 38 | temp_score += p[m]; \ 39 | h[m] = max(temp_score, f[m]); \ 40 | h[m] = max(h[m], e); \ 41 | f[m] = max(temp_score- _cudaGapOE, f[m] - _cudaGapExtend); \ 42 | e = max(temp_score- _cudaGapOE, e - _cudaGapExtend); \ 43 | p[m] = h[m-1]; \ 44 | diag_idx = ((ref_idx + m-1+query_idx)&(total_shm-1))<<5;\ 45 | antidiag_max[real_warp_id+diag_idx] = max(antidiag_max[real_warp_id+diag_idx], (h[m]<<16) +ref_idx+ m-1);\ 46 | } 47 | 48 | 49 | __global__ void agatha_kernel(uint32_t *packed_query_batch, uint32_t *packed_ref_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, gasal_res_t *device_res, gasal_res_t *device_res_second, uint4 *packed_tb_matrices, int n_tasks, uint32_t max_query_len, short2 *global_buffer_top) 50 | { 51 | /*Initial kernel setup*/ 52 | 53 | // Initializing variables 54 | int32_t i, k, m, l, y, e; 55 | int32_t ub_idx, job_idx, ref_idx, query_idx; 56 | short2 HD; 57 | int32_t temp_score; 58 | int slice_start, slice_end, finished_blocks, chunk_start, chunk_end; 59 | int packed_ref_idx, packed_query_idx; 60 | int total_anti_diags; 61 | register uint32_t packed_ref_literal, packed_query_literal; 62 | bool active, terminated; 63 | int32_t packed_ref_batch_idx, packed_query_batch_idx, query_len, ref_len, packed_query_len, packed_ref_len; 64 | int diag_idx, temp, last_diag; 65 | 66 | // Initializing max score and its idx 67 | int32_t max_score = 0; 68 | int32_t max_ref_idx = 0; 69 | int32_t prev_max_score = 0; 70 | int32_t max_query_idx = 0; 71 | 72 | // Setting constant values 73 | const short2 initHD = make_short2(MINUS_INF2, MINUS_INF2); //used to initialize short2 74 | const int32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; //thread ID within the entire kernel 75 | const int packed_len = 8; //number of bps (literals) packed into a single int32 76 | const int const_warp_len = 8; //number of threads per subwarp (before subwarp rejoining occurs) 77 | const int real_warp_id = threadIdx.x % 32; //thread ID within a single (full 32-thread) warp 78 | const int warp_per_kernel = (gridDim.x * blockDim.x) / const_warp_len; // number of subwarps. assume number of threads % const_warp_len == 0 79 | const int job_per_warp = n_tasks % warp_per_kernel ? (n_tasks / warp_per_kernel + 1) : n_tasks / warp_per_kernel; //number of jobs (alignments/tasks) needed to be done by a single subwarp 80 | const int job_per_query = max_query_len % const_warp_len ? 
(max_query_len / const_warp_len + 1) : max_query_len / const_warp_len; //number of a literal's initial score to fill per thread 81 | const int job_start_idx = (tid / const_warp_len)*job_per_warp; // the boundary of jobs of a subwarp 82 | const int job_end_idx = (job_start_idx + job_per_warp) < n_tasks ? (job_start_idx + job_per_warp) : n_tasks; // the boundary of jobs of a subwarp 83 | const int total_shm = packed_len*(_cudaSliceWidth+1); // amount of shared memory a single thread uses 84 | 85 | // Arrays for saving intermediate values 86 | int32_t h[9]; 87 | int32_t f[9]; 88 | int32_t p[9]; 89 | 90 | // Global memory setup 91 | short2* global_buffer_left = (short2*)(global_buffer_top+max_query_len*(blockDim.x/8)*gridDim.x); 92 | int32_t* global_buffer_topleft= (int32_t*)(global_buffer_left+max_query_len*(blockDim.x/8)*gridDim.x); 93 | short2* global_ub_idx = (short2*)(global_buffer_top+max_query_len*(blockDim.x/8)*gridDim.x*3); 94 | 95 | // Shared memory setup 96 | extern __shared__ int32_t shared_maxHH[]; 97 | int32_t* antidiag_max = (int32_t*)(shared_maxHH+(threadIdx.x/32)*total_shm*32); 98 | int32_t* shared_job = shared_maxHH+(blockDim.x/32)*total_shm*32+(threadIdx.x/32)*28; 99 | 100 | /* Setup values that will change after Subwarp Rejoining */ 101 | int warp_len = const_warp_len; 102 | int warp_id = threadIdx.x % warp_len; // id of a thread in a subwarp 103 | int warp_num = tid / warp_len; 104 | // mask that is true for threads in the same subwarp 105 | unsigned same_threads = __match_any_sync(0xffffffff, warp_num); 106 | if (warp_id==0) shared_job[(warp_num&3)] = -1; 107 | 108 | /* Iterating over jobs/alignments */ 109 | for (job_idx = job_start_idx; job_idx < job_end_idx; job_idx++) { 110 | 111 | /*Uneven Bucketing*/ 112 | // the first subwarp fetches a long sequence's idx, while the remaining subwarps fetch short sequences' idx 113 | ub_idx = ((job_idx&3)==0)? global_ub_idx[n_tasks-(job_idx>>2)-1].y: global_ub_idx[job_idx-(job_idx>>2)-1].y; 114 | 115 | // get target and query sequence information 116 | packed_ref_batch_idx = target_batch_offsets[ub_idx] >> 3; //starting index of the target_batch sequence 117 | packed_query_batch_idx = query_batch_offsets[ub_idx] >> 3;//starting index of the query_batch sequence 118 | query_len = query_batch_lens[ub_idx]; // query sequence length 119 | ref_len = target_batch_lens[ub_idx]; // reference sequence length 120 | packed_query_len = (query_len >> 3) + (query_len & 7 ? 1 : 0);//number of 32-bit words holding query_batch sequence 121 | packed_ref_len = (ref_len >> 3) + (ref_len & 7 ? 1 : 0);//number of 32-bit words holding target_batch sequence 122 | 123 | /*Buffer Initialization*/ 124 | // fill global buffer with initial value 125 | // global_buffer_top: used to store intermediate scores H and E in the horizontal strip (scores from the top) 126 | for (i = 0; i < job_per_query; i++) { 127 | l = i*warp_len + warp_id; 128 | if ((l) < max_query_len) { 129 | k = -(_cudaGapOE + (_cudaGapExtend*(l))); 130 | global_buffer_top[warp_num*max_query_len + l] = l <= _cudaBandWidth? make_short2(k, k-_cudaGapOE):initHD; 131 | } 132 | } 133 | // global_buffer_left: used to store intermediate scores H and F in the vertical strip (scores from the left) 134 | for (i = 0; i < job_per_query; i++) { 135 | l = i*warp_len + warp_id; 136 | if ((l) < max_query_len) { 137 | k = -(_cudaGapOE + (_cudaGapExtend*(l))); 138 | global_buffer_left[warp_num*max_query_len + l] = l <= _cudaBandWidth? 
make_short2(k, k-_cudaGapOE):initHD; 139 | } 140 | } 141 | // global_buffer_topleft: used to store intermediate scores H in the diagonal strip (scores from the top-left) 142 | for (i = 0; i < job_per_query; i++) { 143 | l = i*warp_len + warp_id; 144 | if (l < max_query_len) { 145 | k = -(_cudaGapOE+(_cudaGapExtend*(l*packed_len-1))); 146 | global_buffer_topleft[warp_num*max_query_len + l] = l==0? 0: (l*packed_len-1) <= _cudaBandWidth? k: MINUS_INF2; 147 | } 148 | } 149 | 150 | // fill shared memory with initial value 151 | for (m = 0; m < total_shm; m++) { 152 | antidiag_max[real_warp_id + m*32] = INT_MIN; 153 | } 154 | 155 | __syncwarp(); 156 | 157 | // Initialize variables 158 | max_score = 0; 159 | prev_max_score = 0; 160 | max_ref_idx = 0; 161 | max_query_idx = 0; 162 | terminated = false; 163 | 164 | i = 0; //chunk 165 | total_anti_diags = packed_ref_len + packed_query_len-1; //chunk 166 | 167 | /*Subwarp Rejoining*/ 168 | //set shared memory that is used to maintain values for subwarp rejoining 169 | if (warp_id==0) shared_job[(warp_num&3)] = total_anti_diags; 170 | else if (warp_id==1) shared_job[4+(warp_num&3)] = packed_ref_batch_idx; 171 | else if (warp_id==2) shared_job[8+(warp_num&3)] = packed_query_batch_idx; 172 | else if (warp_id==3) shared_job[12+(warp_num&3)] = (ref_len<<16)+query_len; 173 | else if (warp_id==4) shared_job[16+(warp_num&3)] = ub_idx; 174 | 175 | same_threads = __match_any_sync(__activemask(), warp_num); 176 | 177 | __syncwarp(); 178 | 179 | /*Main Alignment Loop*/ 180 | while (i < total_anti_diags) { 181 | 182 | // set boundaries for current slice 183 | slice_start = max(0, (i-packed_query_len+1)); 184 | slice_start = max(slice_start, (i*packed_len + packed_len-1+1 - _cudaBandWidth)/2/packed_len); 185 | slice_end = min(packed_ref_len-1, i+_cudaSliceWidth-1); 186 | slice_end = min(slice_end, ((i+_cudaSliceWidth-1)*packed_len + packed_len-1 + _cudaBandWidth)/2/packed_len); 187 | finished_blocks = slice_start; 188 | 189 | if (slice_start > slice_end) { 190 | terminated = true; 191 | } 192 | 193 | while (!terminated && finished_blocks <= slice_end) { 194 | // while the entire chunk diag is not finished 195 | packed_ref_idx = finished_blocks + warp_id; 196 | packed_query_idx = i - packed_ref_idx; 197 | active = (packed_ref_idx <= slice_end); //whether the current thread has cells to fill or not 198 | 199 | if (active) { 200 | ref_idx = packed_ref_idx << 3; 201 | query_idx = packed_query_idx << 3; 202 | 203 | // load intermediate values from global buffers 204 | p[1] = global_buffer_topleft[warp_num*max_query_len + packed_ref_idx]; 205 | 206 | for (m = 1; m < 9; m++) { 207 | if ( (ref_idx + m-1) < ref_len) { 208 | HD = global_buffer_left[warp_num*max_query_len + ref_idx + m-1]; 209 | h[m] = HD.x; 210 | f[m] = HD.y; 211 | } else { 212 | // if index out of bound of the score table 213 | h[m] = MINUS_INF2; 214 | f[m] = MINUS_INF2; 215 | } 216 | 217 | } 218 | 219 | for (m=2;m<9;m++) { 220 | p[m] = h[m-1]; 221 | } 222 | 223 | // Set boundaries for the current chunk 224 | chunk_start = (max(0, (packed_ref_idx*packed_len - _cudaBandWidth)))/packed_len; 225 | chunk_end = min( packed_query_len-1, ( (packed_ref_idx*packed_len + packed_len -1 + _cudaBandWidth )) /packed_len ); 226 | packed_ref_literal = packed_ref_batch[packed_ref_batch_idx + packed_ref_idx]; 227 | } 228 | 229 | // Compute the current chunk 230 | for (y = 0; y < _cudaSliceWidth; y++) { 231 | if (active && chunk_start <= packed_query_idx && packed_query_idx <= chunk_end) { 232 | 233 | packed_query_literal = 
packed_query_batch[packed_query_batch_idx + packed_query_idx];
234 | query_idx = packed_query_idx << 3;
235 |
236 | for (k = 28; k >= 0 && query_idx < query_len; k -= 4) {
237 | uint32_t qbase = (packed_query_literal >> k) & 15; //get a base from query_batch sequence
238 | // load intermediate values from global buffers
239 | HD = global_buffer_top[warp_num*max_query_len + query_idx];
240 | h[0] = HD.x;
241 | e = HD.y;
242 |
243 | if (packed_query_idx == chunk_start || packed_query_idx == chunk_end) {
244 | #pragma unroll 8
245 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
246 | CORE_COMPUTE_BOUNDARY();
247 | }
248 | } else {
249 | #pragma unroll 8
250 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
251 | CORE_COMPUTE();
252 | }
253 | }
254 |
255 | // write intermediate values to global buffers
256 | HD.x = h[m-1];
257 | HD.y = e;
258 | global_buffer_top[warp_num*max_query_len + query_idx] = HD;
259 |
260 | query_idx++;
261 |
262 | }
263 |
264 | }
265 |
266 |
267 | packed_query_idx++;
268 |
269 | }
270 |
271 | // write intermediate values to global buffers
272 | if (active) {
273 | for (m = 1; m < 9; m++) {
274 | if ( ref_idx + m-1 < ref_len) {
275 | HD.x = h[m];
276 | HD.y = f[m];
277 | global_buffer_left[warp_num*max_query_len + ref_idx + m-1] = HD;
278 | }
279 | }
280 | global_buffer_topleft[warp_num*max_query_len + packed_ref_idx] = p[1];
281 | }
282 |
283 | finished_blocks+=warp_len;
284 | }
285 |
286 | __syncwarp();
287 |
288 | last_diag = (i+_cudaSliceWidth)<<3;
289 | prev_max_score = query_len+ref_len-1;
290 |
291 | /* Termination Condition & Score Update */
292 | if (!terminated) {
293 | for (diag_idx = i<<3; diag_idx < last_diag; diag_idx++) {
294 | if (diag_idx < prev_max_score) {
295 | m = diag_idx&(total_shm-1);
296 | temp = __reduce_max_sync(same_threads, antidiag_max[(m<<5)+real_warp_id]);
297 | if ((temp>>16) > max_score) {
298 | max_score = temp>>16;
299 | max_ref_idx = (temp&65535);
300 | max_query_idx = diag_idx-max_ref_idx;
301 | } else if ( (temp&65535) >= max_ref_idx && (diag_idx-(temp&65535)) >= max_query_idx) {
302 | int tl = (temp&65535) - max_ref_idx, ql = (diag_idx-(temp&65535)) - max_query_idx, l;
303 | l = tl > ql?
tl - ql : ql - tl;
304 | if (_cudaZThreshold >= 0 && max_score - (temp>>16) > _cudaZThreshold + l*_cudaGapExtend) {
305 | // Termination condition is met
306 | terminated = true;
307 | break;
308 | }
309 | }
310 | // reset shared memory buffer for next slice
311 | antidiag_max[(m<<5)+real_warp_id]=INT_MIN;
312 | }
313 | }
314 | }
315 |
316 | __syncwarp();
317 |
318 | // If job is finished
319 | if (terminated) {
320 | total_anti_diags = i; // set the total amount of diagonals as the current diagonal (to indicate that the job has finished)
321 | if (warp_id==0) shared_job[(warp_num&3)] = total_anti_diags; //update this to shared memory as well (this will be used in Subwarp Rejoining as an indicator that the subwarp's job is done)
322 | }
323 |
324 | // Update the max score and its index to shared memory (used in Subwarp Rejoining)
325 | if (warp_id==1) shared_job[20+(warp_num&3)] = max_score;
326 | else if (warp_id==2) shared_job[24+(warp_num&3)] = (max_ref_idx<<16) + max_query_idx;
327 |
328 | __syncwarp();
329 |
330 | i += _cudaSliceWidth;
331 |
332 | /*Job wrap-up*/
333 | // If the job is done (either due to (1) meeting the termination condition or (2) all the diagonals having been computed)
334 | if (i >= total_anti_diags) {
335 |
336 | // In the case of (2), check the termination condition & score update for the last diagonal block
337 | if (!terminated) {
338 | diag_idx = (i*packed_len)&(total_shm-1);
339 | for (k = i*packed_len, m = diag_idx; m < diag_idx+packed_len; m++, k++) {
340 | temp = __reduce_max_sync(same_threads, antidiag_max[(m<<5)+real_warp_id]);
341 | if ((temp>>16) > max_score) {
342 | max_score = temp>>16;
343 | max_ref_idx = (temp&65535);
344 | max_query_idx = k-max_ref_idx;
345 | } else if ( (temp&65535) >= max_ref_idx && (k-(temp&65535)) >= max_query_idx) {
346 | int tl = (temp&65535) - max_ref_idx, ql = (k-(temp&65535)) - max_query_idx, l;
347 | l = tl > ql? tl - ql : ql - tl;
348 | if (_cudaZThreshold >= 0 && max_score - (temp>>16) > _cudaZThreshold + l*_cudaGapExtend) {
349 | // Termination condition is met
350 | terminated = true;
351 | break;
352 | }
353 | }
354 | antidiag_max[(m<<5)+real_warp_id]=INT_MIN;
355 | }
356 | }
357 |
358 | // Spill the results to GPU memory to be later moved to the CPU
359 | if (warp_id==0) {
360 | device_res->aln_score[ub_idx] = max_score;//copy the max score to the output array in the GPU mem
361 | device_res->query_batch_end[ub_idx] = max_query_idx;//copy the end position on query_batch sequence to the output array in the GPU mem
362 | device_res->target_batch_end[ub_idx] = max_ref_idx;//copy the end position on target_batch sequence to the output array in the GPU mem
363 | }
364 |
365 | /*Subwarp Rejoining*/
366 | // The subwarp that has no job looks for new jobs by iterating over other subwarps' jobs
367 | for (m = 0; m < (32/const_warp_len); m++) {
368 | // if the selected job still has remaining diagonals
369 | if (shared_job[m] > i) { // possible because all subwarps sync after each diagonal block is finished
370 | // read the selected job's info
371 | total_anti_diags = shared_job[m];
372 | warp_num = ((warp_num>>2)<<2)+m;
373 | ub_idx = shared_job[16+m];
374 |
375 | packed_ref_batch_idx = shared_job[4+m];
376 | packed_query_batch_idx = shared_job[8+m];
377 | ref_len = shared_job[12+m];
378 | query_len = ref_len&65535;
379 | ref_len = ref_len>>16;
380 | packed_query_len = (query_len >> 3) + (query_len & 7 ? 1 : 0);
381 | packed_ref_len = (ref_len >> 3) + (ref_len & 7 ?
1 : 0); 382 | 383 | max_score = shared_job[20+m]; 384 | max_ref_idx = shared_job[24+m]; 385 | max_query_idx = max_ref_idx&65535; 386 | max_ref_idx = max_ref_idx>>16; 387 | 388 | // reset the flag 389 | terminated = false; 390 | 391 | // reset shared memory buffer 392 | for (m = 0; m < total_shm; m++) { 393 | antidiag_max[(m<<5)+real_warp_id]=INT_MIN; 394 | } 395 | 396 | break; 397 | } 398 | } 399 | 400 | } 401 | 402 | __syncwarp(); 403 | 404 | /*Subwarp Rejoining*/ 405 | //Set the mask, warp length and thread id within the warp 406 | same_threads = __match_any_sync(__activemask(), warp_num); 407 | warp_len = __popc(same_threads); 408 | warp_id = __popc((((0xffffffff) << (threadIdx.x % 32))&same_threads))-1; 409 | 410 | __syncwarp(); 411 | 412 | } 413 | 414 | __syncwarp(); 415 | /*Subwarp Rejoining*/ 416 | //Reset subwarp and job related values for the next iteration 417 | warp_len = const_warp_len; 418 | warp_num = tid / warp_len; 419 | warp_id = tid % const_warp_len; 420 | ub_idx = shared_job[16+(warp_num&3)]; 421 | 422 | __syncwarp(); 423 | 424 | 425 | 426 | } 427 | 428 | return; 429 | 430 | 431 | } 432 | 433 | 434 | __global__ void agatha_sort(uint32_t *packed_query_batch, uint32_t *packed_ref_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, int n_tasks, uint32_t max_query_len, short2 *global_buffer_top) 435 | { 436 | 437 | const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 438 | 439 | uint32_t query_len, ref_len, packed_query_len, packed_ref_len; 440 | 441 | short2* global_ub_idx = (short2*)(global_buffer_top+max_query_len*(blockDim.x/8)*gridDim.x*3); 442 | 443 | if (tid < n_tasks) { 444 | 445 | query_len = query_batch_lens[tid]; 446 | ref_len = target_batch_lens[tid]; 447 | packed_query_len = (query_len >> 3) + (query_len & 7 ? 1 : 0);//number of 32-bit words holding query_batch sequence 448 | packed_ref_len = (ref_len >> 3) + (ref_len & 7 ? 
1 : 0); 449 | 450 | global_ub_idx[tid] = make_short2((packed_ref_len + packed_query_len-1), tid); 451 | 452 | 453 | } 454 | 455 | return; 456 | 457 | 458 | } 459 | #endif 460 | -------------------------------------------------------------------------------- /AGAThA/src/kernels/pack_rc_seqs.h: -------------------------------------------------------------------------------- 1 | #ifndef __KERNEL_SEQPAK__ 2 | #define __KERNEL_SEQPAK__ 3 | 4 | 5 | #define A_PAK ('A'&0x0F) 6 | #define C_PAK ('C'&0x0F) 7 | #define G_PAK ('G'&0x0F) 8 | #define T_PAK ('T'&0x0F) 9 | //#define N_PAK ('N'&0x0F) 10 | 11 | 12 | 13 | __global__ void gasal_pack_kernel(uint32_t* unpacked_query_batch, uint32_t* unpacked_target_batch, uint32_t *packed_query_batch, uint32_t* packed_target_batch, int query_batch_tasks_per_thread, int target_batch_tasks_per_thread, uint32_t total_query_batch_regs, uint32_t total_target_batch_regs) \ 14 | { 15 | 16 | int32_t i; 17 | const int32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 18 | uint32_t n_threads = gridDim.x * blockDim.x; 19 | for (i = 0; i < query_batch_tasks_per_thread && (((i*n_threads)<<1) + (tid<<1) < total_query_batch_regs); ++i) { 20 | uint32_t *query_addr = &(unpacked_query_batch[(i*n_threads)<<1]); 21 | uint32_t reg1 = query_addr[(tid << 1)]; //load 4 bases of the query sequence from global memory 22 | uint32_t reg2 = query_addr[(tid << 1) + 1]; //load another 4 bases 23 | uint32_t packed_reg = 0; 24 | packed_reg |= (reg1 & 15) << 28; // --- 25 | packed_reg |= ((reg1 >> 8) & 15) << 24; // | 26 | packed_reg |= ((reg1 >> 16) & 15) << 20;// | 27 | packed_reg |= ((reg1 >> 24) & 15) << 16;// | 28 | packed_reg |= (reg2 & 15) << 12; // > pack sequence 29 | packed_reg |= ((reg2 >> 8) & 15) << 8; // | 30 | packed_reg |= ((reg2 >> 16) & 15) << 4; // | 31 | packed_reg |= ((reg2 >> 24) & 15); //---- 32 | uint32_t *packed_query_addr = &(packed_query_batch[i*n_threads]); 33 | packed_query_addr[tid] = packed_reg; //write 8 bases of packed query sequence to global memory 34 | } 35 | 36 | for (i = 0; i < target_batch_tasks_per_thread && (((i*n_threads)<<1) + (tid<<1)) < total_target_batch_regs; ++i) { 37 | uint32_t *target_addr = &(unpacked_target_batch[(i * n_threads)<<1]); 38 | uint32_t reg1 = target_addr[(tid << 1)]; //load 4 bases of the target sequence from global memory 39 | uint32_t reg2 = target_addr[(tid << 1) + 1]; //load another 4 bases 40 | uint32_t packed_reg = 0; 41 | packed_reg |= (reg1 & 15) << 28; // --- 42 | packed_reg |= ((reg1 >> 8) & 15) << 24; // | 43 | packed_reg |= ((reg1 >> 16) & 15) << 20;// | 44 | packed_reg |= ((reg1 >> 24) & 15) << 16;// | 45 | packed_reg |= (reg2 & 15) << 12; // > pack sequence 46 | packed_reg |= ((reg2 >> 8) & 15) << 8; // | 47 | packed_reg |= ((reg2 >> 16) & 15) << 4; // | 48 | packed_reg |= ((reg2 >> 24) & 15); //---- 49 | uint32_t *packed_target_addr = &(packed_target_batch[i * n_threads]); 50 | packed_target_addr[tid] = packed_reg; //write 8 bases of packed target sequence to global memory 51 | } 52 | 53 | } 54 | 55 | 56 | __global__ void gasal_reversecomplement_kernel(uint32_t *packed_query_batch,uint32_t *packed_target_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, uint8_t *query_op, uint8_t *target_op, uint32_t n_tasks) 57 | { 58 | 59 | const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 60 | 61 | if (tid >= n_tasks) return; 62 | if (query_op[tid] == 0 && target_op[tid] == 0) return; // if there's nothing to do (op=0, 
meaning the sequence is Forward Natural), just exit the kernel ASAP.
63 |
64 |
65 | uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3;//starting index of the target_batch sequence
66 | uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence
67 | uint32_t read_len = query_batch_lens[tid];
68 | uint32_t ref_len = target_batch_lens[tid];
69 | uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding sequence of query_batch
70 | uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ? 1 : 0);//number of 32-bit words holding sequence of target_batch
71 |
72 | uint32_t query_batch_regs_to_swap = (query_batch_regs >> 1) + (query_batch_regs & 1); // that's (query_batch_regs / 2) + 1 if it's odd, + 0 otherwise. Used for reverse (we start at both ends, and finish at the center of the sequence)
73 | uint32_t target_batch_regs_to_swap = (target_batch_regs >> 1) + (target_batch_regs & 1); // that's (target_batch_regs / 2) + 1 if it's odd, + 0 otherwise. Used for reverse (we start at both ends, and finish at the center of the sequence)
74 |
75 |
76 | // variables used dependent on target and query:
77 |
78 | uint8_t *op = NULL;
79 | uint32_t *packed_batch = NULL;
80 | uint32_t *batch_regs = NULL;
81 | uint32_t *batch_regs_to_swap = NULL;
82 | uint32_t *packed_batch_idx = NULL;
83 |
84 | // avoid useless code duplication thanks to pointers that route the data flow where it should be, twice.
85 | // The kernel is already generic. Later on this can be used to split the kernel into two using templates...
86 | #pragma unroll 2
87 | for (int p = QUERY; p <= TARGET; p++)
88 | {
89 | switch(p)
90 | {
91 | case QUERY:
92 | op = query_op;
93 | packed_batch = packed_query_batch;
94 | batch_regs = &query_batch_regs;
95 | batch_regs_to_swap = &query_batch_regs_to_swap;
96 | packed_batch_idx = &packed_query_batch_idx;
97 | break;
98 | case TARGET:
99 | op = target_op;
100 | packed_batch = packed_target_batch;
101 | batch_regs = &target_batch_regs;
102 | batch_regs_to_swap = &target_batch_regs_to_swap;
103 | packed_batch_idx = &packed_target_batch_idx;
104 | break;
105 | default:
106 | break;
107 | }
108 |
109 | if (*(op + tid) & 0x01) // reverse
110 | {
111 | // deal with N's : read the last word, find how many N's it holds, store that number as an offset, and pad the last word with that many N's
112 | uint8_t nbr_N = 0;
113 | for (int j = 0; j < 32; j = j + 4)
114 | {
115 | nbr_N += (((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1) & (0x0F << j)) >> j) == N_CODE);
116 | }
117 |
118 | //printf("KERNEL_DEBUG: nbr_N=%d\n", nbr_N);
119 |
120 |
121 | nbr_N = nbr_N << 2; // we operate on nibbles so we will need to do our shifts 4 bits by 4 bits, so 4*nbr_N
122 |
123 | for (uint32_t i = 0; i < *(batch_regs_to_swap); i++) // reverse all words. There's a catch with the last word (in the middle of the sequence), see final if.
124 | {
125 | /* This is the current operation flow:
126 | - Read the first 32-bits word on HEAD
127 | - Combine the reads of the 2 last 32-bits words on TAIL to create the 32-bits word WITHOUT N's
128 | - Swap them
129 | - Write them at the correct places. Remember we're building 32-bits words across two 32-bits words on tail.
130 | So we have to take care of which bits are to be written on tail, too.
131 |
132 | You progress through both heads and tails that way, until you reach the center of the sequence.
133 | When you reach it, you actually don't write one of the words to avoid overwrite.
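Worked example (hypothetical numbers): if the last word carries 3 padding
N's, nbr_N becomes 12 (bits), so rpac_2 below combines the low 12 bits of
the second-to-last word with the top 20 bits of the last word; to_queue_1
then re-appends the N's at the tail after the swap.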
134 | */
135 | uint32_t rpac_1 = *(packed_batch + *(packed_batch_idx) + i); //load 8 packed bases from head
136 | uint32_t rpac_2 = ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-2 - i)) << (32-nbr_N)) | ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) >> nbr_N);
137 |
138 |
139 | uint32_t reverse_rpac_1 = 0;
140 | uint32_t reverse_rpac_2 = 0;
141 |
142 |
143 | #pragma unroll 8
144 | for(int k = 28; k >= 0; k = k - 4) // reverse 32-bits word... is pragma-unrolled.
145 | {
146 | reverse_rpac_1 |= ((rpac_1 & (0x0F << k)) >> (k)) << (28-k);
147 | reverse_rpac_2 |= ((rpac_2 & (0x0F << k)) >> (k)) << (28-k);
148 | }
149 | // last swap operated manually, because of its irregular size (32 - 4*nbr_N bits, hence 8 - nbr_N nibbles)
150 |
151 |
152 | uint32_t to_queue_1 = (reverse_rpac_1 << nbr_N) | ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) & ((1<<nbr_N)-1));
153 | uint32_t to_queue_2 = (reverse_rpac_1 >> (32-nbr_N));
154 |
155 |
156 | //printf("KERNEL DEBUG: rpac_1 Word before reverse: %x, after: %x, split into %x + %x \n", rpac_1, reverse_rpac_1, to_queue_2, to_queue_1 );
157 | //printf("KERNEL DEBUG: rpac_2 Word before reverse: %x, after: %x\n", rpac_2, reverse_rpac_2 );
158 |
159 |
160 | *(packed_batch + *(packed_batch_idx) + i) = reverse_rpac_2;
161 | (*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) = to_queue_1;
162 | if (i!=*(batch_regs_to_swap)-1)
163 | (*(packed_batch + *(packed_batch_idx) + *(batch_regs)-2 - i)) = to_queue_2;
164 |
165 |
166 | } // end for
167 | } // end if(reverse)
168 |
169 | if (*(op+tid) & 0x02) // complement
170 | {
171 | for (uint32_t i = 0; i < *(batch_regs); i++) // complement all words
172 | {
173 | uint32_t rpac = *(packed_batch + *(packed_batch_idx) + i); //load 8 packed bases from head
174 | uint32_t nucleotide = 0;
175 |
176 | #pragma unroll 8
177 | for(int k = 28; k >= 0; k = k - 4) // complement 32-bits word... is pragma-unrolled.
178 | { 179 | nucleotide = (rpac & (0x0F << k)) >> (k); 180 | switch(nucleotide) 181 | { 182 | case A_PAK: 183 | nucleotide = T_PAK; 184 | break; 185 | case C_PAK: 186 | nucleotide = G_PAK; 187 | break; 188 | case T_PAK: 189 | nucleotide = A_PAK; 190 | break; 191 | case G_PAK: 192 | nucleotide = C_PAK; 193 | break; 194 | default: 195 | break; 196 | } 197 | rpac = (rpac & (0xFFFFFFFF - (0x0F << k))) | nucleotide << k; 198 | } 199 | 200 | //printf("KERNEL DEBUG: Word read : %x, after complement: %x\n", *(packed_batch + *(packed_batch_idx) + i), rpac); 201 | 202 | *(packed_batch + *(packed_batch_idx) + i) = rpac; 203 | 204 | } // end for 205 | } // end if(complement) 206 | 207 | 208 | 209 | } 210 | 211 | return; 212 | } 213 | #endif -------------------------------------------------------------------------------- /AGAThA/src/res.cpp: -------------------------------------------------------------------------------- 1 | #include "gasal.h" 2 | 3 | #include "args_parser.h" 4 | 5 | #include "res.h" 6 | 7 | 8 | gasal_res_t *gasal_res_new_host(uint32_t max_n_alns, Parameters *params) 9 | { 10 | cudaError_t err; 11 | gasal_res_t *res = NULL; 12 | 13 | 14 | res = (gasal_res_t *)malloc(sizeof(gasal_res_t)); 15 | 16 | if (res == NULL) // check the malloc result before touching res 17 | { 18 | fprintf(stderr, "Malloc error on res host\n"); 19 | exit(1); 20 | } 21 | 22 | CHECKCUDAERROR(cudaHostAlloc(&(res->aln_score), max_n_alns * sizeof(int32_t),cudaHostAllocDefault)); 23 | 24 | 25 | CHECKCUDAERROR(cudaHostAlloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault)); 26 | CHECKCUDAERROR(cudaHostAlloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault)); 27 | res->query_batch_start = NULL; 28 | res->target_batch_start = NULL; 29 | 30 | return res; 31 | } 32 | 33 | 34 | gasal_res_t *gasal_res_new_device(gasal_res_t *device_cpy) 35 | { 36 | cudaError_t err; 37 | 38 | 39 | 40 | // create class storage on device and copy top level class 41 | gasal_res_t *d_c; 42 | CHECKCUDAERROR(cudaMalloc((void **)&d_c, sizeof(gasal_res_t))); 43 | // CHECKCUDAERROR(cudaMemcpy(d_c, res, sizeof(gasal_res_t), cudaMemcpyHostToDevice)); 44 | 45 | 46 | 47 | // copy pointer to allocated device storage to device class 48 | CHECKCUDAERROR(cudaMemcpy(&(d_c->aln_score), &(device_cpy->aln_score), sizeof(int32_t*), cudaMemcpyHostToDevice)); 49 | CHECKCUDAERROR(cudaMemcpy(&(d_c->query_batch_start), &(device_cpy->query_batch_start), sizeof(int32_t*), cudaMemcpyHostToDevice)); 50 | CHECKCUDAERROR(cudaMemcpy(&(d_c->target_batch_start), &(device_cpy->target_batch_start), sizeof(int32_t*), cudaMemcpyHostToDevice)); 51 | CHECKCUDAERROR(cudaMemcpy(&(d_c->query_batch_end), &(device_cpy->query_batch_end), sizeof(int32_t*), cudaMemcpyHostToDevice)); 52 | CHECKCUDAERROR(cudaMemcpy(&(d_c->target_batch_end), &(device_cpy->target_batch_end), sizeof(int32_t*), cudaMemcpyHostToDevice)); 53 | 54 | 55 | 56 | 57 | 58 | return d_c; 59 | } 60 | 61 | 62 | 63 | 64 | gasal_res_t *gasal_res_new_device_cpy(uint32_t max_n_alns, Parameters *params) 65 | { 66 | cudaError_t err; 67 | gasal_res_t *res; 68 | 69 | res = (gasal_res_t *)malloc(sizeof(gasal_res_t)); 70 | 71 | CHECKCUDAERROR(cudaMalloc(&(res->aln_score), max_n_alns * sizeof(int32_t))); 72 | 73 | CHECKCUDAERROR(cudaMalloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t))); 74 | CHECKCUDAERROR(cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t))); 75 | 76 | res->query_batch_start = NULL; 77 | res->target_batch_start = NULL; 78 | 79 | 80 | 81 | return res; 82 | } 83 | 84 | // TODO : make 2 destroys for host and device
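/* Typical lifecycle of these objects (a sketch using only the functions defined in this file; max_n_alns and params are supplied by the caller): host_res = gasal_res_new_host(max_n_alns, params) allocates the pinned host-side result buffers; device_cpy = gasal_res_new_device_cpy(max_n_alns, params) allocates the device buffers behind a host-side struct; device_res = gasal_res_new_device(device_cpy) mirrors that struct on the device. After the alignments, tear down with gasal_res_destroy_device(device_res, device_cpy) and gasal_res_destroy_host(host_res). */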
85 | void gasal_res_destroy_host(gasal_res_t *res) 86 | { 87 | cudaError_t err; 88 | if (res == NULL) 89 | return; 90 | 91 | 92 | if (res->aln_score != NULL) CHECKCUDAERROR(cudaFreeHost(res->aln_score)); 93 | if (res->query_batch_start != NULL) CHECKCUDAERROR(cudaFreeHost(res->query_batch_start)); 94 | if (res->target_batch_start != NULL) CHECKCUDAERROR(cudaFreeHost(res->target_batch_start)); 95 | if (res->query_batch_end != NULL) CHECKCUDAERROR(cudaFreeHost(res->query_batch_end)); 96 | if (res->target_batch_end != NULL) CHECKCUDAERROR(cudaFreeHost(res->target_batch_end)); 97 | //if (res->n_cigar_ops != NULL) CHECKCUDAERROR(cudaFreeHost(res->n_cigar_ops)); 98 | 99 | free(res); 100 | } 101 | 102 | void gasal_res_destroy_device(gasal_res_t *device_res, gasal_res_t *device_cpy) 103 | { 104 | cudaError_t err; 105 | if (device_cpy == NULL || device_res == NULL) 106 | return; 107 | 108 | if (device_cpy->aln_score != NULL) CHECKCUDAERROR(cudaFree(device_cpy->aln_score)); 109 | if (device_cpy->query_batch_start != NULL) CHECKCUDAERROR(cudaFree(device_cpy->query_batch_start)); 110 | if (device_cpy->target_batch_start != NULL) CHECKCUDAERROR(cudaFree(device_cpy->target_batch_start)); 111 | if (device_cpy->query_batch_end != NULL) CHECKCUDAERROR(cudaFree(device_cpy->query_batch_end)); 112 | if (device_cpy->target_batch_end != NULL) CHECKCUDAERROR(cudaFree(device_cpy->target_batch_end)); 113 | //if (device_cpy->cigar != NULL) CHECKCUDAERROR(cudaFree(device_cpy->cigar)); 114 | 115 | 116 | CHECKCUDAERROR(cudaFree(device_res)); 117 | 118 | free(device_cpy); 119 | } 120 | -------------------------------------------------------------------------------- /AGAThA/src/res.h: -------------------------------------------------------------------------------- 1 | #ifndef __RES_H__ 2 | #define __RES_H__ 3 | 4 | gasal_res_t *gasal_res_new_host(uint32_t max_n_alns, Parameters *params); 5 | gasal_res_t *gasal_res_new_device(gasal_res_t *device_cpy); 6 | gasal_res_t *gasal_res_new_device_cpy(uint32_t max_n_alns, Parameters *params); 7 | 8 | void gasal_res_destroy_host(gasal_res_t *res); 9 | void gasal_res_destroy_device(gasal_res_t *device_res, gasal_res_t *device_cpy); 10 | 11 | 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /AGAThA/test_prog/Makefile: -------------------------------------------------------------------------------- 1 | #CUDA_LD_LIBRARY=/usr/local/cuda-10.2/targets/x86_64-linux/lib 2 | CUDA_LD_LIBRARY=/usr/local/cuda/lib64 3 | ANALYSIS_FILENAME=analysis 4 | # prefix1 can be optirun, in case you need to run it from an Optimus-enabled laptop. 5 | PREFIX1= 6 | # prefix2 can be nvprof. Preferably use the following: nvprof --profile-api-trace none -s -f -o /tmp/.nvprof/$(ANALYSIS_FILENAME).nvprof 7 | PREFIX2=nvprof --profile-api-trace none -s -f -o /tmp/.nvprof/$(ANALYSIS_FILENAME).nvprof 8 | #suffix1 and suffix2 can redirect output to a file.
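# For example, using standard make command-line overrides, "make run PREFIX2= SUFFIX1=" executes the short test without nvprof and prints the scores to stdout instead of golden.log.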
9 | SUFFIX1=> golden.log 10 | SUFFIX2=> out.log 11 | 12 | PRGM=manual 13 | 14 | OPTARGS1=-p -y local 15 | OPTARGS2=-p -y local 16 | 17 | 18 | FILES_HUMAN600=reads_600_human_10M.fasta ref_600_human_10M.fasta 19 | FILES_HUMAN300=reads_300_human_10M.fasta ref_300_human_10M.fasta 20 | FILES_HUMAN150=reads_150_human_10M.fasta ref_150_human_10M.fasta 21 | FILES_20K=query_batch.fasta target_batch.fasta 22 | FILES_262K=reads_150.fasta ref_150.fasta 23 | FILES_SHORT=short_query_batch.fasta short_target_batch.fasta 24 | 25 | .cpp.o: 26 | g++ -std=c++11 -g -c -O3 -Wall -Werror -fopenmp -I ../include -o test_prog.o test_prog.cpp -lcudart 27 | 28 | all: clean manual #test_prog.out 29 | 30 | 31 | manual: test_prog.o ../obj/args_parser.cppo ../obj/host_batch.cppo ../obj/ctors.cppo ../obj/interfaces.cppo ../obj/res.cppo ../obj/gasal_align.cuo 32 | g++ test_prog.o ../obj/args_parser.cppo ../obj/host_batch.cppo ../obj/ctors.cppo ../obj/interfaces.cppo ../obj/res.cppo ../obj/gasal_align.cuo -L../lib -o manual -g -fopenmp -std=c++11 -Wall -Wno-sign-compare -O3 -L/usr/local/cuda/lib64 -lcudart -Iinclude -I/usr/local/cuda/include -lgasal 33 | 34 | 35 | test_prog.out: test_prog.o 36 | g++ -std=c++11 -O3 test_prog.cpp -o test_prog.out -L$(CUDA_LD_LIBRARY) -L../lib -fopenmp -lcudart -lgasal 37 | #g++ -std=c++11 -O3 -c test_prog.cpp -o test_prog.out -L$(CUDA_LD_LIBRARY) -L../lib -fopenmp -lcuda -lcudart -lgasal test_prog.o 38 | 39 | clean: 40 | rm -f -r *~ *.exe *.o *.out manual 41 | 42 | test_prog.o: Timer.h 43 | 44 | 45 | human150: all 46 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN150) $(SUFFIX1) 47 | 48 | human150-2: all 49 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN150) $(SUFFIX2) 50 | 51 | human300: all 52 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN300) $(SUFFIX1) 53 | 54 | human300-2: all 55 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN300) $(SUFFIX2) 56 | 57 | human600: all 58 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN600) $(SUFFIX1) 59 | 60 | human600-2: all 61 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN600) $(SUFFIX2) 62 | 63 | 64 | run: all 65 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_SHORT) $(SUFFIX1) 66 | 67 | run2: all 68 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_SHORT) $(SUFFIX2) 69 | 70 | 71 | fullrun: all 72 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_20K) $(SUFFIX1) 73 | 74 | fullrun2: all 75 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_20K) $(SUFFIX2) 76 | 77 | 78 | 262k: all 79 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_262K) $(SUFFIX1) 80 | 81 | 262k2: all 82 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_262K) $(SUFFIX2) 83 | 84 | 85 | 86 | 87 | 88 | cuda-memcheck: all 89 | cuda-memcheck ./$(PRGM) $(OPTARGS1) $(FILES_20K) $(SUFFIX1) 90 | 91 | cuda-gdb: all 92 | cuda-gdb --args ./test_prog.out -p -y local query_batch.fasta target_batch.fasta 93 | 94 | valgrind: all 95 | valgrind ./test_prog.out -p -y local short_query_batch.fasta short_target_batch.fasta 96 | 97 | gdb: all 98 | gdb --args ./test_prog.out -p -y local short_query_batch.fasta short_target_batch.fasta 99 | -------------------------------------------------------------------------------- /AGAThA/test_prog/README.md: -------------------------------------------------------------------------------- 1 | A test program to run the AGAThA kernel. 2 | TBA. 
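3 | 4 | In the meantime, a minimal manual run looks like this (a sketch based on the targets in the Makefile; the two FASTA files are placeholders for your own query/target pair): 5 | ``` 6 | make manual 7 | ./manual -p -y local query_batch.fasta target_batch.fasta 8 | ```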
-------------------------------------------------------------------------------- /AGAThA/test_prog/Timer.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMER_H 2 | #define TIMER_H 3 | 4 | #include <iostream> 5 | #include <string> 6 | #include <sys/time.h> 7 | #include <cstdlib> 8 | 9 | class Timer 10 | { 11 | private: 12 | struct timeval startTime; 13 | struct timeval stopTime; 14 | double elapsedTime; 15 | std::string name; 16 | 17 | public: 18 | Timer(std::string n) { name = n; elapsedTime = 0.0;} 19 | Timer() { name = ""; elapsedTime = 0.0;} 20 | void Clear() { elapsedTime = 0.0; } 21 | void Start() { gettimeofday(&(startTime), NULL); } 22 | void Restart() 23 | { 24 | elapsedTime = 0.0; 25 | gettimeofday(&(startTime), NULL); 26 | } 27 | 28 | void Pause() 29 | { 30 | gettimeofday(&(stopTime), NULL); 31 | 32 | elapsedTime += ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0; // sec to ms 33 | elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms 34 | } 35 | 36 | void Stop() 37 | { 38 | gettimeofday(&(stopTime), NULL); 39 | 40 | elapsedTime = ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0; // sec to ms 41 | elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms 42 | } 43 | 44 | void Print() 45 | { 46 | std::cout << name << " : " << elapsedTime << " msec" << std::endl; 47 | } 48 | 49 | double GetTime() { return elapsedTime;} 50 | 51 | }; 52 | 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /AGAThA/test_prog/test_prog.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include "../include/gasal_header.h" 4 | #include <vector> 5 | #include <cstring> 6 | #include <cmath> 7 | #include <omp.h> 8 | #include "Timer.h" 9 | 10 | 11 | #define NB_STREAMS 2 12 | 13 | 14 | //#define DEBUG 15 | 16 | #define MAX(a,b) ((a)>(b) ? (a) : (b)) 17 | 18 | //#define GPU_SELECT 0 19 | 20 | 21 | int main(int argc, char **argv) { 22 | Timer local_time; 23 | Timer malloc_time; 24 | Timer free_time; 25 | cudaDeviceSynchronize(); 26 | Timer total_time; 27 | total_time.Start(); 28 | Timer load_time; 29 | load_time.Start(); 30 | 31 | //gasal_set_device(GPU_SELECT); 32 | 33 | Parameters *args; 34 | args = new Parameters(argc, argv); 35 | args->parse(); 36 | //args->print(); 37 | 38 | int print_out = args->print_out; 39 | int n_threads = args->n_threads; 40 | 41 | //--------------copy substitution scores to GPU-------------------- 42 | gasal_subst_scores sub_scores; 43 | 44 | sub_scores.match = args->sa; 45 | sub_scores.mismatch = args->sb; 46 | sub_scores.gap_open = args->gapo; 47 | sub_scores.gap_extend = args->gape; 48 | sub_scores.slice_width = args->slice_width; 49 | sub_scores.z_threshold = args->z_threshold; 50 | sub_scores.band_width = args->band_width; 51 | 52 | gasal_copy_subst_scores(&sub_scores); 53 | 54 | //------------------------------------------------------------------- 55 | 56 | 57 | std::vector<std::string> query_seqs; 58 | std::vector<std::string> target_seqs; 59 | std::vector<std::string> query_headers; 60 | std::vector<std::string> target_headers; 61 | std::string query_batch_line, target_batch_line; 62 | 63 | int total_seqs = 0; 64 | uint32_t maximum_sequence_length = 0; 65 | uint32_t target_seqs_len = 0; 66 | uint32_t query_seqs_len = 0; 67 | //std::cerr << "Loading files...." << std::endl; 68 | 69 | /* 70 | Reads the FASTA files and fills the corresponding buffers. 71 | FASTA files contain sequences that are usually on separate lines.
72 | The file reader detects a '>' then concatenates all the following lines into one sequence, until the next '>' or EOF. 73 | See more about the FASTA format : https://en.wikipedia.org/wiki/FASTA_format 74 | */ 75 | 76 | int seq_begin=0; 77 | 78 | std::vector<uint8_t> query_mod; 79 | std::vector<uint8_t> target_mod; 80 | std::vector<uint32_t> query_id; 81 | std::vector<uint32_t> target_id; 82 | 83 | char line_starts[5] = "></+"; 84 | 85 | /* 86 | * The first character of a header line selects the modifier to apply to the sequence: 87 | * - '>' translates to 0b00 (0) = Forward, natural 88 | * - '<' translates to 0b01 (1) = Reverse, natural 89 | * - '/' translates to 0b10 (2) = Forward, complemented 90 | * - '+' translates to 0b11 (3) = Reverse, complemented 91 | * No protection is done, so any other value will only have its two lowest bits interpreted as above. 92 | */ 93 | 94 | while (getline(args->query_batch_fasta, query_batch_line) && getline(args->target_batch_fasta, target_batch_line)) { 95 | 96 | //load sequences from the files 97 | char *q = NULL; 98 | char *t = NULL; 99 | q = strchr(line_starts, (int) (query_batch_line[0])); 100 | t = strchr(line_starts, (int) (target_batch_line[0])); 101 | 102 | /* 103 | t and q point to the first occurrence of the line's first character in the line_starts array. 104 | Subtracting line_starts from these pointers gives the index of the matched character, 105 | and therefore the modifier that is required. 106 | */ 107 | 108 | if (q != NULL && t != NULL) { 109 | total_seqs++; 110 | 111 | query_mod.push_back((uint8_t) (q-line_starts)); 112 | query_id.push_back(total_seqs); 113 | 114 | target_mod.push_back((uint8_t)(t-line_starts)); 115 | target_id.push_back(total_seqs); 116 | 117 | query_headers.push_back(query_batch_line.substr(1)); 118 | target_headers.push_back(target_batch_line.substr(1)); 119 | 120 | if (seq_begin == 2) { 121 | // a sequence was already being read. Now it's done, so we should find its length. 122 | target_seqs_len += (target_seqs.back()).length(); 123 | query_seqs_len += (query_seqs.back()).length(); 124 | maximum_sequence_length = MAX((target_seqs.back()).length(), maximum_sequence_length); 125 | maximum_sequence_length = MAX((query_seqs.back()).length(), maximum_sequence_length); 126 | } 127 | seq_begin = 1; 128 | 129 | } else if (seq_begin == 1) { 130 | query_seqs.push_back(query_batch_line); 131 | target_seqs.push_back(target_batch_line); 132 | seq_begin=2; 133 | } else if (seq_begin == 2) { 134 | query_seqs.back() += query_batch_line; 135 | target_seqs.back() += target_batch_line; 136 | } else { // should never happen but always put an else, for safety... 137 | seq_begin = 0; 138 | std::cerr << "The query_batch and target_batch files should be FASTA files with the same number of sequences" << std::endl; 139 | exit(EXIT_FAILURE); 140 | } 141 | } 142 | 143 | 144 | 145 | // Update the lengths one more time, to account for the last read sequence: 146 | target_seqs_len += (target_seqs.back()).length(); 147 | query_seqs_len += (query_seqs.back()).length(); 148 | maximum_sequence_length = MAX((target_seqs.back()).length(), maximum_sequence_length); 149 | maximum_sequence_length = MAX((query_seqs.back()).length(), maximum_sequence_length); 150 | int maximum_sequence_length_query = MAX((query_seqs.back()).length(), 0); 151 | 152 | #ifdef DEBUG 153 | std::cerr << "[TEST_PROG DEBUG]: "; 154 | std::cerr << "Size of read batches are: query=" << query_seqs_len << ", target=" << target_seqs_len << ". 
maximum_sequence_length=" << maximum_sequence_length << std::endl; 155 | #endif 156 | load_time.Stop(); 157 | 158 | Timer distr_time; 159 | distr_time.Start(); 160 | 161 | // transforming the _mod vectors into C arrays (to be passed to GASAL, which deals with C types) 162 | uint8_t *target_seq_mod = (uint8_t*) malloc(total_seqs * sizeof(uint8_t) ); 163 | uint8_t *query_seq_mod = (uint8_t*) malloc(total_seqs * sizeof(uint8_t) ); 164 | uint32_t *target_seq_id = (uint32_t*) malloc(total_seqs * sizeof(uint32_t) ); 165 | uint32_t *query_seq_id = (uint32_t*) malloc(total_seqs * sizeof(uint32_t) ); 166 | 167 | for (int i = 0; i < total_seqs; i++) 168 | { 169 | query_seq_mod[i] = query_mod.at(i); 170 | query_seq_id[i] = query_id.at(i); 171 | } 172 | 173 | #ifdef DEBUG 174 | std::cerr << "[TEST_PROG DEBUG]: query, mod@id="; 175 | for (int i = 0; i < total_seqs; i++) 176 | { 177 | if ((query_seq_mod[i]) > 0) 178 | std::cerr << +(query_seq_mod[i]) << "@" << query_seq_id[i] << "| "; 179 | } 180 | 181 | std::cerr << std::endl; 182 | #endif 183 | 184 | for (int i = 0; i < total_seqs; i++) 185 | { 186 | target_seq_mod[i] = target_mod.at(i); 187 | target_seq_id[i] = target_id.at(i); 188 | } 189 | 190 | int *thread_seqs_idx = (int*)malloc(n_threads*sizeof(int)); 191 | int *thread_n_seqs = (int*)malloc(n_threads*sizeof(int)); 192 | int *thread_n_batchs = (int*)malloc(n_threads*sizeof(int)); 193 | double *thread_misc_time = (double*)calloc(n_threads, sizeof(double)); 194 | 195 | int thread_batch_size = (int)ceil((double)total_seqs/n_threads); 196 | int n_seqs_alloc = 0; 197 | for (int i = 0; i < n_threads; i++){//distribute the sequences among the threads equally 198 | thread_seqs_idx[i] = n_seqs_alloc; 199 | if (n_seqs_alloc + thread_batch_size < total_seqs) thread_n_seqs[i] = thread_batch_size; 200 | else thread_n_seqs[i] = total_seqs - n_seqs_alloc; 201 | thread_n_batchs[i] = (int)ceil((double)thread_n_seqs[i]/(args->kernel_align_num)); 202 | n_seqs_alloc += thread_n_seqs[i]; 203 | } 204 | distr_time.Stop(); 205 | 206 | //std::cerr << "Processing..." << std::endl; 207 | 208 | Timer process_time; 209 | process_time.Start(); 210 | omp_set_num_threads(n_threads); 211 | gasal_gpu_storage_v *gpu_storage_vecs = (gasal_gpu_storage_v*)calloc(n_threads, sizeof(gasal_gpu_storage_v)); 212 | for (int z = 0; z < n_threads; z++) { 213 | gpu_storage_vecs[z] = gasal_init_gpu_storage_v(NB_STREAMS);// creating NB_STREAMS streams per thread 214 | 215 | /* 216 | About memory sizes: 217 | The required memory is the total size of the batch + its padding, divided by the number of streams. 218 | The worst case would be that every sequence has to be padded with 7 'N', since they must have a length multiple of 8. 219 | Even though the memory can be dynamically expanded both for Host and Device, it is advised to start with a memory large enough so that these expansions rarely occur (for better performance). 220 | Modifying the factor '1' in front of each size lets you see how GASAL2 expands the memory when needed. 221 | */ 222 | /* 223 | // For example, this is exactly the memory needed to fit all sequences in a single GPU batch.
224 | gasal_init_streams(&(gpu_storage_vecs[z]), 225 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 226 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 227 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 228 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 229 | ceil((double)target_seqs.size() / (double)(NB_STREAMS)), // maximum number of alignments is bigger on target than on query side. 230 | ceil((double)target_seqs.size() / (double)(NB_STREAMS)), 231 | args); 232 | */ 233 | //initializing the streams by allocating the required CPU and GPU memory 234 | // note: the calculation of the detailed sizes to allocate could be done on the library side (to hide it from the user's perspective) 235 | gasal_init_streams(&(gpu_storage_vecs[z]), (maximum_sequence_length_query + 7) , //TODO: remove maximum_sequence_length_query 236 | (maximum_sequence_length + 7) , 237 | maximum_sequence_length, 238 | args); 239 | } 240 | #ifdef DEBUG 241 | std::cerr << "[TEST_PROG DEBUG]: "; 242 | std::cerr << "size of host_unpack_query is " << (query_seqs_len +7*total_seqs) / (NB_STREAMS) << std::endl ; 243 | #endif 244 | 245 | #pragma omp parallel 246 | { 247 | int n_seqs = thread_n_seqs[omp_get_thread_num()];//number of sequences allocated to this thread 248 | int curr_idx = thread_seqs_idx[omp_get_thread_num()];//index of the first sequence allocated to this thread 249 | int seqs_done = 0; 250 | int n_batchs_done = 0; 251 | 252 | struct gpu_batch{ //a struct to hold data structures of a stream 253 | gasal_gpu_storage_t *gpu_storage; //the struct that holds the GASAL2 data structures 254 | int n_seqs_batch;//number of sequences in the batch (<= (target_seqs.size() / NB_STREAMS)) 255 | int batch_start;//starting index of batch 256 | }; 257 | 258 | #ifdef DEBUG 259 | std::cerr << "[TEST_PROG DEBUG]: "; 260 | std::cerr << "Number of gpu_batch in gpu_batch_arr : " << gpu_storage_vecs[omp_get_thread_num()].n << std::endl; 261 | std::cerr << "[TEST_PROG DEBUG]: "; 262 | std::cerr << "Number of gpu_storage_vecs in a gpu_batch : " << omp_get_thread_num()+1 << std::endl; 263 | #endif 264 | 265 | gpu_batch gpu_batch_arr[gpu_storage_vecs[omp_get_thread_num()].n]; 266 | 267 | for(int z = 0; z < gpu_storage_vecs[omp_get_thread_num()].n; z++) { 268 | gpu_batch_arr[z].gpu_storage = &(gpu_storage_vecs[omp_get_thread_num()].a[z]); 269 | 270 | } 271 | 272 | if (n_seqs > 0) { 273 | while (n_batchs_done < thread_n_batchs[omp_get_thread_num()]) { // Loop on streams 274 | int gpu_batch_arr_idx = 0; 275 | //------------checking the availability of a "free" stream----------------- 276 | while(gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n && (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->is_free != 1) { 277 | gpu_batch_arr_idx++; 278 | } 279 | 280 | if (seqs_done < n_seqs && gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n) { 281 | uint32_t query_batch_idx = 0; 282 | uint32_t target_batch_idx = 0; 283 | int j = 0; 284 | //-----------Create a batch of sequences to be aligned on the GPU.
The batch contains (target_seqs.size() / NB_STREAMS) sequences----------------------- 285 | 286 | 287 | for (int i = curr_idx; seqs_done < n_seqs && j < (args->kernel_align_num); i++, j++, seqs_done++) 288 | { 289 | 290 | gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns++ ; 291 | 292 | if(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns > gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->host_max_n_alns) 293 | { 294 | gasal_host_alns_resize(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->host_max_n_alns * 2, args); 295 | } 296 | 297 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_offsets[j] = query_batch_idx; 298 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_target_batch_offsets[j] = target_batch_idx; 299 | 300 | /* 301 | All the filling is moved to the library side, to take care of the memory size and expansions (when needed). 302 | The function gasal_host_batch_fill takes care of how to fill, how much to pad with 'N', and how to deal with memory. 303 | It's the same function for query and target, and you only need to set the final flag to either QUERY or TARGET; this avoids code duplication. 304 | The way the host memory is filled changes the current _idx (it's increased by the size, and by the padding). That's why it's returned by the function. 305 | */ 306 | 307 | query_batch_idx = gasal_host_batch_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, 308 | query_batch_idx, 309 | query_seqs[i].c_str(), 310 | query_seqs[i].size(), 311 | QUERY); 312 | 313 | target_batch_idx = gasal_host_batch_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, 314 | target_batch_idx, 315 | target_seqs[i].c_str(), 316 | target_seqs[i].size(), 317 | TARGET); 318 | 319 | 320 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_lens[j] = query_seqs[i].size(); 321 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_target_batch_lens[j] = target_seqs[i].size(); 322 | 323 | } 324 | 325 | #ifdef DEBUG 326 | std::cerr << "[TEST_PROG DEBUG]: "; 327 | std::cerr << "Stream " << gpu_batch_arr_idx << ": j = " << j << ", seqs_done = " << seqs_done <<", query_batch_idx=" << query_batch_idx << " , target_batch_idx=" << target_batch_idx << std::endl; 328 | #endif 329 | 330 | // Here, we fill the operations arrays for the current batch to be processed by the stream 331 | gasal_op_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, query_seq_mod + seqs_done - j, j, QUERY); 332 | gasal_op_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, target_seq_mod + seqs_done - j, j, TARGET); 333 | 334 | 335 | gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch = j; 336 | uint32_t query_batch_bytes = query_batch_idx; 337 | uint32_t target_batch_bytes = target_batch_idx; 338 | gpu_batch_arr[gpu_batch_arr_idx].batch_start = curr_idx; 339 | curr_idx += (args->kernel_align_num); 340 | 341 | //---------------------------------------------------------------------------------------------------- 342 | //-----------------calling the GASAL2 non-blocking alignment function--------------------------------- 343 | local_time.Start(); 344 | gasal_aln_async(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, query_batch_bytes, target_batch_bytes, gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch, args); 345 | local_time.Stop(); 346 | gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns = 0; 347 | //--------------------------------------------------------------------------------- 348 | 349 | } 350 | 351 | 352 | //-------------------------------print alignment
results---------------------------------------- 353 | 354 | gpu_batch_arr_idx = 0; 355 | while (gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n) {//loop through all the streams and print the results 356 | //of the finished streams. 357 | if (gasal_is_aln_async_done(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage) == 0) { 358 | int j = 0; 359 | if(print_out) { 360 | #pragma omp critical 361 | for (int i = gpu_batch_arr[gpu_batch_arr_idx].batch_start; j < gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch; i++, j++) { 362 | 363 | std::cout << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->aln_score[j] ; 364 | 365 | std::cout << "\tquery_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->query_batch_end[j]; 366 | std::cout << "\ttarget_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->target_batch_end[j] ; 367 | 368 | std::cout << std::endl; 369 | } 370 | } 371 | n_batchs_done++; 372 | } 373 | gpu_batch_arr_idx++; 374 | } 375 | } 376 | } 377 | 378 | 379 | } 380 | for (int z = 0; z < n_threads; z++) { 381 | gasal_destroy_streams(&(gpu_storage_vecs[z]), args); 382 | gasal_destroy_gpu_storage_v(&(gpu_storage_vecs[z])); 383 | } 384 | free(gpu_storage_vecs); 385 | process_time.Stop(); 386 | /* 387 | string algorithm = al_type; 388 | string start_type[2] = {"without_start", "with_start"}; 389 | al_type += "_"; 390 | al_type += start_type[start_pos==WITH_START]; 391 | */ 392 | double av_misc_time = 0.0; 393 | for (int i = 0; i < n_threads; ++i){ 394 | av_misc_time += (thread_misc_time[i]/n_threads); 395 | } 396 | //std::cerr << std::endl << "Done" << std::endl; 397 | //fprintf(stderr, "Total execution time (in milliseconds): %.3f\n", total_time.GetTime()); 398 | delete args; // closes the files 399 | //free(args); // closes the files 400 | total_time.Stop(); 401 | /* 402 | fprintf(stderr, "load time (in milliseconds): %.3f\n", load_time.GetTime()); 403 | fprintf(stderr, "distribution time (in milliseconds): %.3f\n", distr_time.GetTime()); 404 | fprintf(stderr, "process time (with malloc) (in milliseconds): %.3f\n", process_time.GetTime()); 405 | fprintf(stderr, "malloc time (in milliseconds): %.3f\n", malloc_time.GetTime()); 406 | fprintf(stderr, "free time (in milliseconds): %.3f\n", free_time.GetTime()); 407 | fprintf(stderr, "local kernel time (in milliseconds): %.3f\n", local_time.GetTime()); 408 | fprintf(stderr, "total time (in milliseconds): %.3f\n", total_time.GetTime()); 409 | */ 410 | } 411 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [PPoPP24] AGAThA: Fast and Efficient GPU Acceleration of Guided Sequence Alignment for Long Read Mapping [![DOI](https://zenodo.org/badge/725514536.svg)](https://zenodo.org/doi/10.5281/zenodo.10225634) 2 | 3 | ## !!! Important Notice !!! 4 | **This repository is currently undergoing a major update**. 5 | 6 | It is strongly recommended to **revisit this repository after a new release is made**. 7 | 8 | ## Getting Started 9 | 10 | ### 1. Environment Setup with Docker 11 | ```properties 12 | cd docker 13 | bash build.sh 14 | bash launch.sh 15 | ``` 16 | 17 | ### 2. Datasets & Building AGAThA 18 | A sample dataset can be found in `dataset/`. 19 | AGAThA can be built by running the following code: 20 | 21 | ```properties 22 | cd AGAThA 23 | bash build.sh 24 | cd .. 
25 | ``` 26 | 27 | ## AGAThA Details 28 | 29 | AGAThA was built on top of [GASAL2](https://github.com/nahmedraja/GASAL2). 30 | 31 | ### 1. AGAThA Options 32 | The following options can be passed to AGAThA through ```AGAThA.sh```: 33 | ``` 34 | -m the match score 35 | -x the mismatch penalty 36 | -q the gap open penalty 37 | -r the gap extension penalty 38 | -z the termination threshold 39 | -w the band width in the score table 40 | ``` 41 | ### 2. AGAThA Input 42 | AGAThA requires two datasets as input: 43 | * a fasta file with reference sequences labeled with sequence indices 44 | * a fasta file with query sequences labeled with sequence indices 45 | Both files should follow the format below: 46 | ``` 47 | >>> 1 48 | ATGCN... 49 | >>> 2 50 | TCGGA... 51 | ``` 52 | Fasta files can be downloaded from various sources such as [GenBank](https://www.ncbi.nlm.nih.gov/genbank/) or from projects such as [Genome in a Bottle](https://www.nist.gov/programs-projects/genome-bottle). 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.7.1-devel-ubuntu22.04 2 | 3 | RUN apt-get update \ 4 | && apt-get install --assume-yes --no-install-recommends --quiet \ 5 | python3 \ 6 | python3-pip \ 7 | libz-dev \ 8 | wget 9 | 10 | RUN /bin/bash -c "source root/.bashrc" 11 | 12 | WORKDIR /agatha_ae -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | docker build -t agatha_ae . -------------------------------------------------------------------------------- /docker/launch.sh: -------------------------------------------------------------------------------- 1 | #docker run -it --rm --gpus all -v $PWD/../:/agatha_ae agatha_ae:latest /bin/bash 2 | docker run -it --rm --gpus all -v $PWD/../:/agatha_ae agatha_ae:latest /bin/bash -------------------------------------------------------------------------------- /misc/avg_time.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | 5 | # Input: process name | dataset id | raw_file | output_file | number of iterations 6 | 7 | process = sys.argv[1] 8 | dataset_id = sys.argv[2] 9 | raw_file = sys.argv[3] 10 | output_file = sys.argv[4] 11 | iter = int(sys.argv[5]) 12 | 13 | # Get average execution time 14 | if os.path.exists(raw_file): 15 | raw = open(raw_file, "r").read().splitlines() 16 | 17 | total_time = 0.0 18 | 19 | if len(raw) != 0: 20 | for r in raw: 21 | total_time += float(r) 22 | avg_time = total_time/float(iter) 23 | else: 24 | avg_time = "NaN" 25 | 26 | else: 27 | avg_time = "NaN" 28 | 29 | 30 | # Store result to json file 31 | 32 | if os.path.exists(output_file): 33 | with open(output_file, "r") as json_file: 34 | output = json.load(json_file) 35 | else: 36 | output = {} 37 | 38 | if process not in output: 39 | output[process] = {} 40 | 41 | output[process][dataset_id] = avg_time 42 | 43 | with open(output_file, "w") as json_file: 44 | json.dump(output, json_file) 45 | --------------------------------------------------------------------------------