├── .gitignore
├── AGAThA.sh
├── AGAThA
│   ├── LICENSE
│   ├── Makefile
│   ├── README.md
│   ├── build.sh
│   ├── configure.sh
│   ├── src
│   │   ├── Timer.h
│   │   ├── args_parser.cpp
│   │   ├── args_parser.h
│   │   ├── ctors.cpp
│   │   ├── ctors.h
│   │   ├── gasal.h
│   │   ├── gasal_align.cu
│   │   ├── gasal_align.h
│   │   ├── gasal_header.h
│   │   ├── gasal_kernels.h
│   │   ├── host_batch.cpp
│   │   ├── host_batch.h
│   │   ├── interfaces.cpp
│   │   ├── interfaces.h
│   │   ├── kernels
│   │   │   ├── agatha_kernel.h
│   │   │   └── pack_rc_seqs.h
│   │   ├── res.cpp
│   │   └── res.h
│   └── test_prog
│       ├── Makefile
│       ├── README.md
│       ├── Timer.h
│       └── test_prog.cpp
├── README.md
├── dataset
│   ├── query.fasta
│   └── ref.fasta
├── docker
│   ├── Dockerfile
│   ├── build.sh
│   └── launch.sh
└── misc
    └── avg_time.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | */lib/*
 2 | */include/*
 3 | */obj/*
 4 | .vscode/*
 5 | *.log
 6 | output/
 7 | 
 8 | *.o
 9 | *.cuo
10 | *.cppo
11 | *.out
12 | *.txt
13 | 
14 | *.json
15 | 
16 | #Temporary addition
17 | manual
--------------------------------------------------------------------------------
/AGAThA.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | MAIN_DIR="/agatha_ae/"                #the main directory
 3 | PROG_DIR=$MAIN_DIR"AGAThA/test_prog/" #the directory where the test program is
 4 | OUTPUT_DIR=$MAIN_DIR"output/"         #the directory for RAW_FILE, FINAL_FILE and SCORE_FILE
 5 | DATASET_DIR=$MAIN_DIR"dataset/"       #the directory where the input dataset is located
 6 | FINAL_DIR=$PWD
 7 | 
 8 | RAW_FILE=$OUTPUT_DIR"raw.log"     #stores the kernel execution times of all iterations
 9 | FINAL_FILE=$OUTPUT_DIR"time.json" #stores the average total kernel execution time of a single iteration
10 | SCORE_FILE=$OUTPUT_DIR"score.log" #stores the scores after alignment
11 | 
12 | ITER=1  #number of iterations of each program
13 | IDLE=5  #seconds to sleep between iterations
14 | DATASET_NAME="test" #the name of the current dataset (shown in FINAL_FILE)
15 | PROCESS="AGAThA"    #the process name (shown in FINAL_FILE)
16 | 
17 | while getopts "i:" opt
18 | do
19 |     case "$opt" in
20 |     i ) ITER="$OPTARG" ;;
21 |     esac
22 | done
23 | 
24 | mkdir -p "$OUTPUT_DIR" #create the output directory
25 | 
26 | echo ">>> Running $PROCESS for $ITER iterations."
27 | 
28 | if [ -f "$RAW_FILE" ]; then #remove the output files before running the program
29 |     rm "$RAW_FILE"
30 | fi
31 | 
32 | if [ -f "$SCORE_FILE" ]; then
33 |     rm "$SCORE_FILE"
34 | fi
35 | 
36 | if [ -f "$FINAL_FILE" ]; then
37 |     rm "$FINAL_FILE"
38 | fi
39 | 
40 | iter=0 #start the main program
41 | while [ "$iter" -lt "$ITER" ]
42 | do
43 |     echo ">> Iteration $(($iter+1))"
44 |     ${PROG_DIR}manual -p -m 1 -x 4 -q 6 -r 2 -s 3 -z 400 -w 751 ${DATASET_DIR}ref.fasta ${DATASET_DIR}query.fasta ${RAW_FILE} > ${SCORE_FILE}
45 |     ((iter++))
46 |     sleep ${IDLE}s
47 | done
48 | 
49 | echo "$PROCESS complete."
50 | echo "Creating output files..." #create the additional output files
51 | 
52 | python3 /agatha_ae/misc/avg_time.py $PROCESS $DATASET_NAME ${RAW_FILE} ${FINAL_FILE} $ITER
53 | 
54 | echo "Complete."
--------------------------------------------------------------------------------
/AGAThA/LICENSE:
--------------------------------------------------------------------------------
 1 |                                  Apache License
 2 |                            Version 2.0, January 2004
 3 |                         http://www.apache.org/licenses/
 4 | 
 5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 6 | 
 7 |    1. Definitions.
 8 | 
 9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /AGAThA/Makefile: -------------------------------------------------------------------------------- 1 | GPU_SM_ARCH=sm_86 2 | #GPU_SM_ARCH=sm_75 3 | MAX_QUERY_LEN=10000 4 | N_CODE=0x4E 5 | N_PENALTY=1 6 | 7 | GPU_COMPUTE_ARCH=$(subst sm,compute,$(GPU_SM_ARCH)) 8 | NVCC=/usr/local/cuda/bin/nvcc 9 | CC=g++ 10 | SRC_DIR=./src/ 11 | OBJ_DIR=./obj/ 12 | LIB_DIR=./lib/ 13 | INCLUDE_DIR=./include/ 14 | 15 | SOURCES= args_parser.cpp host_batch.cpp ctors.cpp interfaces.cpp res.cpp gasal_align.cu 16 | LOBJS=$(patsubst %,%o,$(SOURCES)) 17 | 18 | LOBJS_PATH=$(addprefix $(OBJ_DIR),$(LOBJS)) 19 | VPATH=src:obj:lib 20 | YELLOW=\033[1;33m 21 | NC=\033[0m # No Color 22 | 23 | ifeq ($(GPU_SM_ARCH),) 24 | error1: 25 | @echo "Must specify GPU architecture as sm_xx" 26 | endif 27 | ifeq ($(MAX_QUERY_LEN),) 28 | error2: 29 | @echo "Must specify maximum sequence length" 30 | endif 31 | 32 | ifeq ($(N_CODE),) 33 | error3: 34 | @echo "Must specify the code for 'N'" 35 | endif 36 | #ifneq ($(GPU_SM_ARCH),clean) 37 | 38 | 39 | 40 | 41 | ## If your computer ships gcc-5.3.1 (at least for CUDA 8.0), this is the regular line. You might need to add: --compiler-options -fPIC 42 | ## With Debian and clang, use: $(NVCC) -ccbin clang-3.8 --compiler-options -fpie 43 | 44 | ifeq ($(N_PENALTY),) 45 | %.cuo: %.cu 46 | $(NVCC) -c -g -O3 -std=c++11 -Xcompiler -Wall,-DMAX_QUERY_LEN=$(MAX_QUERY_LEN),-DN_CODE=$(N_CODE) -Xptxas -Werror --gpu-architecture=$(GPU_COMPUTE_ARCH) --gpu-code=$(GPU_SM_ARCH) -lineinfo --ptxas-options=-v --default-stream per-thread $< -o $(OBJ_DIR)$@ 47 | 48 | else 49 | %.cuo: %.cu 50 | $(NVCC) -c -g -O3 -std=c++11 -Xcompiler -Wall,-DMAX_QUERY_LEN=$(MAX_QUERY_LEN),-DN_CODE=$(N_CODE),-DN_PENALTY=$(N_PENALTY) -Xptxas -Werror --gpu-architecture=$(GPU_COMPUTE_ARCH) --gpu-code=$(GPU_SM_ARCH) -lineinfo --ptxas-options=-v --default-stream per-thread $< -o $(OBJ_DIR)$@ 51 | 52 | endif 53 | 54 | 55 | 56 | ## If your computer ships gcc-5.3.1 (at least for CUDA 8.0), this is the regular line. 
You might need to add: -fPIC 57 | ifeq ($(N_PENALTY),) 58 | %.cppo: %.cpp 59 | $(CC) -c -g -O3 -std=c++11 -Wall -DMAX_QUERY_LEN=$(MAX_QUERY_LEN) -DN_CODE=$(N_CODE) -Werror $< -o $(OBJ_DIR)$@ 60 | 61 | else 62 | %.cppo: %.cpp 63 | $(CC) -c -g -O3 -std=c++11 -Wall -DMAX_QUERY_LEN=$(MAX_QUERY_LEN) -DN_CODE=$(N_CODE) -DN_PENALTY=$(N_PENALTY) -Werror $< -o $(OBJ_DIR)$@ 64 | 65 | endif 66 | 67 | 68 | all: clean makedir libgasal.a 69 | 70 | makedir: 71 | @mkdir -p $(OBJ_DIR) 72 | @mkdir -p $(LIB_DIR) 73 | @mkdir -p $(INCLUDE_DIR) 74 | @cp $(SRC_DIR)/*.h $(INCLUDE_DIR) 75 | @sed -i "s/MAX_QUERY_LEN=[0-9]\{1,9\}/MAX_QUERY_LEN=$(MAX_QUERY_LEN)/" ./test_prog/Makefile 76 | 77 | ifeq ($(N_PENALTY),) 78 | libgasal.a: $(LOBJS) 79 | ar -csru $(LIB_DIR)$@ $(LOBJS_PATH) 80 | @echo "" 81 | @echo -e "${YELLOW}WARNING:${NC}\"N_PENALTY\" is not defined" 82 | else 83 | libgasal.a: $(LOBJS) 84 | ar -csru $(LIB_DIR)$@ $(LOBJS_PATH) 85 | endif 86 | 87 | clean: 88 | rm -f -r $(OBJ_DIR) $(LIB_DIR) $(INCLUDE_DIR) *~ *.exe *.cppo *.cuo *.txt *~ 89 | 90 | gasal_align.cuo: gasal.h gasal_kernels.h 91 | 92 | 93 | -------------------------------------------------------------------------------- /AGAThA/README.md: -------------------------------------------------------------------------------- 1 | # AGAThA [![DOI](https://zenodo.org/badge/725514536.svg)](https://zenodo.org/doi/10.5281/zenodo.10225634) 2 | 3 | AGAThA is built on top of [GASAL2](https://github.com/nahmedraja/GASAL2.git). 4 | 5 | TBA 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /AGAThA/build.sh: -------------------------------------------------------------------------------- 1 | make 2 | cd test_prog 3 | make 4 | cd .. -------------------------------------------------------------------------------- /AGAThA/configure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | cuda_path=$1 5 | RED='\033[0;31m' 6 | NC='\033[0m' # No Color 7 | 8 | if [ "$cuda_path" = "" ]; then 9 | echo -e "${RED}Must provide path to CUDA installation directory${NC}" 10 | echo -e "${RED}Configuration incomplete${NC}" 11 | echo -e "${RED}Exiting${NC}" 12 | exit 1 13 | fi 14 | 15 | cuda_nvcc_path=$cuda_path/bin/nvcc 16 | 17 | if [ -f $cuda_nvcc_path ]; then 18 | echo "NVCC found ($cuda_nvcc_path)" 19 | else 20 | echo -e "${RED}NVCC not found${NC}" 21 | echo -e "${RED}Configuration incomplete${NC}" 22 | echo -e "${RED}Exiting${NC}" 23 | exit 1 24 | fi 25 | 26 | 27 | cuda_lib_path="${cuda_path}/targets/x86_64-linux/lib" 28 | 29 | 30 | if [ -d $cuda_lib_path ]; then 31 | echo "CUDA runtime library found (${cuda_lib_path})" 32 | else 33 | echo -e "${RED}CUDA runtime library not found${NC}" 34 | echo -e "${RED}Configuration incomplete${NC}" 35 | echo -e "${RED}Exiting${NC}" 36 | exit 1 37 | fi 38 | 39 | cuda_runtime_file="${cuda_path}/targets/x86_64-linux/include/cuda_runtime.h" 40 | 41 | if [ -f $cuda_runtime_file ]; then 42 | echo "CUDA runtime header file found (${cuda_runtime_file})" 43 | else 44 | echo -e "${RED}CUDA runtime header file not found${NC}" 45 | echo -e "${RED}Configuration incomplete${NC}" 46 | echo -e "${RED}Exiting${NC}" 47 | exit 1 48 | fi 49 | 50 | 51 | echo "Configuring Makefile..." 52 | 53 | sed -i "s,NVCC=.*,NVCC=$cuda_nvcc_path,g" Makefile 54 | 55 | echo "Configuring gasal.h..." 56 | 57 | sed -i "s,.*cuda_runtime\.h\",\#include \"$cuda_runtime_file\",g" ./src/gasal.h 58 | 59 | echo "Configuring Makefile of test program..." 
60 | 
61 | sed -i "s,CUDA_LD_LIBRARY=.*,CUDA_LD_LIBRARY=$cuda_lib_path,g" ./test_prog/Makefile
62 | 
63 | #mkdir -p include
64 | 
65 | #cp ./src/gasal.h ./include
66 | 
67 | echo "Done"
68 | 
69 | 
70 | 
--------------------------------------------------------------------------------
/AGAThA/src/Timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef TIMER_H
 2 | #define TIMER_H
 3 | 
 4 | #include <sys/time.h>
 5 | #include <cstdio>
 6 | #include <string>
 7 | #include <iostream>
 8 | 
 9 | class Timer
10 | {
11 | private:
12 |     struct timeval startTime;
13 |     struct timeval stopTime;
14 |     double elapsedTime;
15 |     std::string name;
16 | 
17 | public:
18 |     Timer(std::string n) { name = n; elapsedTime = 0.0;}
19 |     Timer() { name = ""; elapsedTime = 0.0;}
20 |     void Clear() { elapsedTime = 0.0; }
21 |     void Start() { gettimeofday(&(startTime), NULL); }
22 |     void Restart()
23 |     {
24 |         elapsedTime = 0.0;
25 |         gettimeofday(&(startTime), NULL);
26 |     }
27 | 
28 |     void Pause()
29 |     {
30 |         gettimeofday(&(stopTime), NULL);
31 | 
32 |         elapsedTime += ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0; // sec to ms
33 |         elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms
34 |     }
35 | 
36 |     void Stop()
37 |     {
38 |         gettimeofday(&(stopTime), NULL);
39 | 
40 |         elapsedTime = ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0; // sec to ms
41 |         elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms
42 |     }
43 | 
44 |     void Print()
45 |     {
46 |         std::cout << name << " : " << elapsedTime << " msec" << std::endl;
47 |     }
48 | 
49 |     double GetTime() { return elapsedTime;}
50 | 
51 | };
52 | 
53 | 
54 | #endif
55 | 
--------------------------------------------------------------------------------
/AGAThA/src/args_parser.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <string>
  3 | 
  4 | #include "args_parser.h"
  5 | 
  6 | 
  7 | 
  8 | Parameters::Parameters(int argc_, char **argv_) {
  9 | 
 10 | 
 11 |     // default values
 12 |     sa = (2);
 13 |     sb = (4);
 14 |     gapo = (4);
 15 |     gape = (2);
 16 | 
 17 |     print_out = (0);
 18 |     n_threads = (1);
 19 |     // For AGAThA
 20 |     slice_width = (3);
 21 |     z_threshold = (400);
 22 |     band_width = (751);
 23 |     kernel_align_num = (8192);
 24 |     kernel_block_num = (256);
 25 |     kernel_thread_num = (256);
 26 | 
 27 |     isPacked = false;
 28 |     isReverseComplement = false;
 29 | 
 30 |     query_batch_fasta_filename = "";
 31 |     target_batch_fasta_filename = "";
 32 |     raw_filename = "";
 33 | 
 34 |     argc = argc_;
 35 |     argv = argv_;
 36 | 
 37 | }
 38 | 
 39 | Parameters::~Parameters() {
 40 |     query_batch_fasta.close();
 41 |     target_batch_fasta.close();
 42 |     raw_file.close();
 43 | }
 44 | 
 45 | void Parameters::print() {
 46 |     std::cerr << "sa=" << sa <<" , sb=" << sb <<" , gapo=" << gapo << " , gape="<< gape << std::endl;
 47 | }
 73 | void Parameters::help() {
 74 |     std::cerr << "Usage: ./test_prog.out [options] <query_batch.fasta> <target_batch.fasta> <raw_file> " << std::endl;
 75 |     std::cerr << "Options: -m INT match score ["<< sa <<"]" << std::endl;
 76 |     std::cerr << " -x INT mismatch penalty [" << sb << "]"<< std::endl;
 77 |     std::cerr << " -q INT gap open penalty [" << gapo << "]" << std::endl;
 78 |     std::cerr << " -r INT gap extension penalty ["<< gape <<"]" << std::endl;
 79 |     std::cerr << " -s (AGAThA) slice_width" << std::endl;
 80 |     std::cerr << " -z (AGAThA) z-drop threshold" << std::endl;
 81 |     std::cerr << " -w (AGAThA) band width" << std::endl;
 82 |     std::cerr << " -b (AGAThA) number of blocks called per kernel" << std::endl;
 83 |     std::cerr << " -t (AGAThA) number of threads in a block called per kernel" << std::endl;
 84 |     std::cerr << " -a (AGAThA) number of alignments computed per kernel" << std::endl;
 85 |     std::cerr << " -p 
print the alignment results and time" << std::endl; 86 | std::cerr << " -n INT Number of CPU threads ["<< n_threads<<"]" << std::endl; 87 | std::cerr << " --help, -h : displays this message." << std::endl; 88 | std::cerr << "Single-pack multi-Parameters (e.g. -sp) is not supported." << std::endl; 89 | std::cerr << " " << std::endl; 90 | } 91 | 92 | 93 | void Parameters::parse() { 94 | 95 | // before testing anything, check if calling for help. 96 | int c; 97 | 98 | std::string arg_next = ""; 99 | std::string arg_cur = ""; 100 | 101 | for (c = 1; c < argc; c++) 102 | { 103 | arg_cur = std::string((const char*) (*(argv + c) ) ); 104 | arg_next = ""; 105 | if (!arg_cur.compare("--help") || !arg_cur.compare("-h")) 106 | { 107 | help(); 108 | exit(0); 109 | } 110 | } 111 | 112 | if (argc < 4) 113 | { 114 | failure(NOT_ENOUGH_ARGS); 115 | } 116 | 117 | for (c = 1; c < argc - 3; c++) 118 | { 119 | arg_cur = std::string((const char*) (*(argv + c) ) ); 120 | if (arg_cur.at(0) == '-' && arg_cur.at(1) == '-' ) 121 | { 122 | if (!arg_cur.compare("--help")) 123 | { 124 | help(); 125 | exit(0); 126 | } 127 | 128 | } else if (arg_cur.at(0) == '-' ) 129 | { 130 | if (arg_cur.length() > 2) 131 | failure(WRONG_ARG); 132 | char param = arg_cur.at(1); 133 | switch(param) 134 | { 135 | case 'm': 136 | c++; 137 | arg_next = std::string((const char*) (*(argv + c) ) ); 138 | sa = std::stoi(arg_next); 139 | break; 140 | case 'x': 141 | c++; 142 | arg_next = std::string((const char*) (*(argv + c) ) ); 143 | sb = std::stoi(arg_next); 144 | break; 145 | case 'q': 146 | c++; 147 | arg_next = std::string((const char*) (*(argv + c) ) ); 148 | gapo = std::stoi(arg_next); 149 | break; 150 | case 'r': 151 | c++; 152 | arg_next = std::string((const char*) (*(argv + c) ) ); 153 | gape = std::stoi(arg_next); 154 | break; 155 | case 'p': 156 | print_out = 1; 157 | break; 158 | case 'n': 159 | c++; 160 | arg_next = std::string((const char*) (*(argv + c) ) ); 161 | n_threads = std::stoi(arg_next); 162 | break; 163 | case 's': 164 | c++; 165 | arg_next = std::string((const char*) (*(argv + c) ) ); 166 | slice_width = std::stoi(arg_next); 167 | break; 168 | case 'z': 169 | c++; 170 | arg_next = std::string((const char*) (*(argv + c) ) ); 171 | z_threshold = std::stoi(arg_next); 172 | break; 173 | case 'w': 174 | c++; 175 | arg_next = std::string((const char*) (*(argv + c) ) ); 176 | band_width = std::stoi(arg_next); 177 | break; 178 | case 'b': 179 | c++; 180 | arg_next = std::string((const char*) (*(argv + c) ) ); 181 | kernel_block_num = std::stoi(arg_next); 182 | break; 183 | case 't': 184 | c++; 185 | arg_next = std::string((const char*) (*(argv + c) ) ); 186 | kernel_thread_num = std::stoi(arg_next); 187 | break; 188 | case 'a': 189 | c++; 190 | arg_next = std::string((const char*) (*(argv + c) ) ); 191 | kernel_align_num = std::stoi(arg_next); 192 | break; 193 | 194 | } 195 | 196 | 197 | } else { 198 | failure(WRONG_ARG); 199 | } 200 | } 201 | 202 | 203 | // the last 2 Parameters are the 2 filenames. 204 | query_batch_fasta_filename = std::string( (const char*) (*(argv + c) ) ); 205 | c++; 206 | target_batch_fasta_filename = std::string( (const char*) (*(argv + c) ) ); 207 | 208 | if (print_out) { 209 | c++; 210 | raw_filename = std::string( (const char*) (*(argv + c) ) ); 211 | } 212 | 213 | // Parameters retrieved successfully, open files. 
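    // Example: with the invocation used by AGAThA.sh,
    //   ./manual -p -m 1 -x 4 -q 6 -r 2 -s 3 -z 400 -w 751 ref.fasta query.fasta raw.log
    // parse() ends up with sa=1, sb=4, gapo=6, gape=2, slice_width=3,
    // z_threshold=400, band_width=751, print_out=1, and the positional
    // arguments mapped in order: query_batch_fasta_filename="ref.fasta",
    // target_batch_fasta_filename="query.fasta", raw_filename="raw.log"
    // (the raw file argument is only read when -p is given).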
214 |     fileopen();
215 | }
216 | 
217 | void Parameters::fileopen() {
218 |     query_batch_fasta.open(query_batch_fasta_filename, std::ifstream::in);
219 |     if (!query_batch_fasta)
220 |         failure(WRONG_FILES);
221 | 
222 |     target_batch_fasta.open(target_batch_fasta_filename);
223 |     if (!target_batch_fasta)
224 |         failure(WRONG_FILES);
225 | 
226 |     if (print_out) {
227 |         raw_file.open(raw_filename, std::ios::app);
228 |     }
229 | }
230 | 
--------------------------------------------------------------------------------
/AGAThA/src/args_parser.h:
--------------------------------------------------------------------------------
 1 | #ifndef ARGS_PARSER_H
 2 | #define ARGS_PARSER_H
 3 | 
 4 | /*
 5 | #include 
 6 | 
 7 | 
 8 | #include "gasal.h"
 9 | */
10 | #include <string>
11 | #include <fstream>
12 | #include "gasal.h"
13 | #include <cstdint>
14 | 
15 | 
16 | enum fail_type {
17 |     NOT_ENOUGH_ARGS,
18 |     TOO_MANY_ARGS,
19 |     WRONG_ARG,
20 |     WRONG_FILES,
21 |     WRONG_ALGO
22 | };
23 | 
24 | class Parameters{
25 | 
26 | public:
27 |     Parameters(int argc, char** argv);
28 |     ~Parameters();
29 |     void print();
30 |     void failure(fail_type f);
31 |     void help();
32 |     void parse();
33 |     void fileopen();
34 | 
35 |     int32_t sa;
36 |     int32_t sb;
37 |     int32_t gapo;
38 |     int32_t gape;
39 | 
40 |     int print_out;
41 |     int n_threads;
42 | 
43 |     int slice_width;
44 |     int z_threshold;
45 |     int band_width;
46 | 
47 |     int32_t kernel_block_num;
48 |     int32_t kernel_thread_num;
49 |     int32_t kernel_align_num;
50 | 
51 |     bool isPacked;
52 |     bool isReverseComplement;
53 | 
54 |     std::string query_batch_fasta_filename;
55 |     std::string target_batch_fasta_filename;
56 |     std::string raw_filename;
57 | 
58 |     std::ifstream query_batch_fasta;
59 |     std::ifstream target_batch_fasta;
60 |     std::ofstream raw_file;
61 | 
62 | 
63 | protected:
64 | 
65 | private:
66 |     int argc;
67 |     char** argv;
68 | };
69 | 
70 | 
71 | #endif
72 | 
--------------------------------------------------------------------------------
/AGAThA/src/ctors.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "gasal.h"
 3 | 
 4 | #include "args_parser.h"
 5 | 
 6 | #include "host_batch.h"
 7 | 
 8 | #include "res.h"
 9 | 
10 | #include "ctors.h"
11 | 
12 | #include "interfaces.h"
13 | 
14 | #include <cstdlib>
15 | 
16 | 
17 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams) {
18 |     gasal_gpu_storage_v v;
19 |     v.a = (gasal_gpu_storage_t*)calloc(n_streams, sizeof(gasal_gpu_storage_t));
20 |     v.n = n_streams;
21 |     return v;
22 | 
23 | }
24 | 
25 | 
26 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec, int max_query_len, int max_target_len, int32_t maximum_sequence_length, Parameters *params) {
27 | 
28 |     cudaError_t err;
29 |     int max_n_alns = params->kernel_align_num;
30 |     int i;
31 |     int max_query_len_8 = max_query_len % 8 ? max_query_len + (8 - (max_query_len % 8)) : max_query_len;
32 |     int max_target_len_8 = max_target_len % 8 ? 
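/* Note: both lengths are rounded up to the next multiple of 8 because the
   packing step stores 8 bases per 32-bit word (4 bits per base), so every
   sequence slot handed to the GPU must be 8-base aligned. A minimal sketch of
   the same idiom used on these two lines:
     int round_up_8(int len) { return len % 8 ? len + (8 - len % 8) : len; }
*/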
max_target_len + (8 - (max_target_len % 8)) : max_target_len; 33 | 34 | int host_max_query_batch_bytes = max_n_alns * max_query_len_8; 35 | int gpu_max_query_batch_bytes = max_n_alns * max_query_len_8; 36 | int host_max_target_batch_bytes = max_n_alns * max_target_len_8; 37 | int gpu_max_target_batch_bytes = max_n_alns * max_target_len_8; 38 | int host_max_n_alns = max_n_alns; 39 | int gpu_max_n_alns = max_n_alns; 40 | 41 | 42 | 43 | for (i = 0; i < gpu_storage_vec->n; i++) { 44 | 45 | gpu_storage_vec->a[i].extensible_host_unpacked_query_batch = gasal_host_batch_new(host_max_query_batch_bytes, 0); 46 | gpu_storage_vec->a[i].extensible_host_unpacked_target_batch = gasal_host_batch_new(host_max_target_batch_bytes, 0); 47 | 48 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t))); 49 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t))); 50 | 51 | 52 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_op), host_max_n_alns * sizeof(uint8_t), cudaHostAllocDefault)); 53 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_op), host_max_n_alns * sizeof(uint8_t), cudaHostAllocDefault)); 54 | uint8_t *no_ops = NULL; 55 | no_ops = (uint8_t*) calloc(host_max_n_alns * sizeof(uint8_t), sizeof(uint8_t)); 56 | gasal_op_fill(&(gpu_storage_vec->a[i]), no_ops, host_max_n_alns, QUERY); 57 | gasal_op_fill(&(gpu_storage_vec->a[i]), no_ops, host_max_n_alns, TARGET); 58 | free(no_ops); 59 | 60 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_op), gpu_max_n_alns * sizeof(uint8_t))); 61 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_op), gpu_max_n_alns * sizeof(uint8_t))); 62 | 63 | 64 | 65 | if (params->isPacked) 66 | { 67 | gpu_storage_vec->a[i].packed_query_batch = (uint32_t *) gpu_storage_vec->a[i].unpacked_query_batch; 68 | gpu_storage_vec->a[i].packed_target_batch = (uint32_t *) gpu_storage_vec->a[i].unpacked_target_batch; 69 | 70 | } else { 71 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_query_batch), (gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 72 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_target_batch), (gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 73 | } 74 | 75 | gpu_storage_vec->a[i].host_seed_scores = NULL; 76 | gpu_storage_vec->a[i].seed_scores = NULL; 77 | 78 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_batch_lens), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 79 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_batch_lens), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 80 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_batch_offsets), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 81 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_batch_offsets), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 82 | 83 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 84 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 85 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 86 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 87 | 88 | // For AGAThA 89 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].global_buffer), 
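/* Note on the allocation size below: the scratch buffer holds three short2
   entries per sequence position for every 8-thread group across all blocks
   (maximum_sequence_length * (kernel_thread_num/8) * kernel_block_num * 3),
   followed by one short2 per alignment (kernel_align_num). agatha_sort fills
   that per-alignment tail, and agatha_kernel_launcher in gasal_align.cu sorts
   it on the host by the .x field -- the "Sort for Uneven Bucketing" step. */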
sizeof(short2)*(maximum_sequence_length*(params->kernel_thread_num/8)*(params->kernel_block_num)*3+(params->kernel_align_num)))); 90 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_buffer), sizeof(int32_t)*max_n_alns, cudaHostAllocDefault)); 91 | 92 | gpu_storage_vec->a[i].host_res = gasal_res_new_host(host_max_n_alns, params); 93 | gpu_storage_vec->a[i].device_cpy = gasal_res_new_device_cpy(max_n_alns, params); 94 | gpu_storage_vec->a[i].device_res = gasal_res_new_device(gpu_storage_vec->a[i].device_cpy); 95 | 96 | gpu_storage_vec->a[i].host_res_second = NULL; 97 | gpu_storage_vec->a[i].device_cpy_second = NULL; 98 | gpu_storage_vec->a[i].device_res_second = NULL; 99 | 100 | CHECKCUDAERROR(cudaStreamCreate(&(gpu_storage_vec->a[i].str))); 101 | gpu_storage_vec->a[i].is_free = 1; 102 | gpu_storage_vec->a[i].host_max_query_batch_bytes = host_max_query_batch_bytes; 103 | gpu_storage_vec->a[i].host_max_target_batch_bytes = host_max_target_batch_bytes; 104 | gpu_storage_vec->a[i].host_max_n_alns = host_max_n_alns; 105 | gpu_storage_vec->a[i].gpu_max_query_batch_bytes = gpu_max_query_batch_bytes; 106 | gpu_storage_vec->a[i].gpu_max_target_batch_bytes = gpu_max_target_batch_bytes; 107 | gpu_storage_vec->a[i].gpu_max_n_alns = gpu_max_n_alns; 108 | gpu_storage_vec->a[i].current_n_alns = 0; 109 | // For AGAThA 110 | gpu_storage_vec->a[i].slice_width = params->slice_width; 111 | gpu_storage_vec->a[i].maximum_sequence_length = maximum_sequence_length; 112 | } 113 | } 114 | 115 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params) { 116 | 117 | cudaError_t err; 118 | 119 | int i; 120 | for (i = 0; i < gpu_storage_vec->n; i ++) { 121 | 122 | gasal_host_batch_destroy(gpu_storage_vec->a[i].extensible_host_unpacked_query_batch); 123 | gasal_host_batch_destroy(gpu_storage_vec->a[i].extensible_host_unpacked_target_batch); 124 | 125 | gasal_res_destroy_host(gpu_storage_vec->a[i].host_res); 126 | gasal_res_destroy_device(gpu_storage_vec->a[i].device_res, gpu_storage_vec->a[i].device_cpy); 127 | 128 | if (gpu_storage_vec->a[i].seed_scores != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].seed_scores)); 129 | if (gpu_storage_vec->a[i].host_seed_scores != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_seed_scores)); 130 | 131 | 132 | if (gpu_storage_vec->a[i].query_op != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_op)); 133 | if (gpu_storage_vec->a[i].target_op != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_op)); 134 | if (gpu_storage_vec->a[i].host_query_op != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_op)); 135 | if (gpu_storage_vec->a[i].host_target_op != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_op)); 136 | 137 | if (gpu_storage_vec->a[i].host_query_batch_offsets != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_batch_offsets)); 138 | if (gpu_storage_vec->a[i].host_target_batch_offsets != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_batch_offsets)); 139 | if (gpu_storage_vec->a[i].host_query_batch_lens != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_batch_lens)); 140 | if (gpu_storage_vec->a[i].host_target_batch_lens != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_batch_lens)); 141 | //if (gpu_storage_vec->a[i].host_res->cigar != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_res->cigar)); 142 | 143 | // For AGAThA 144 | if (gpu_storage_vec->a[i].global_buffer != NULL) 
CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].global_buffer)); 145 | if (gpu_storage_vec->a[i].host_buffer != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_buffer)); 146 | 147 | if (gpu_storage_vec->a[i].unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].unpacked_query_batch)); 148 | if (gpu_storage_vec->a[i].unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].unpacked_target_batch)); 149 | if (!(params->isPacked)) 150 | { 151 | if (gpu_storage_vec->a[i].packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_query_batch)); 152 | if (gpu_storage_vec->a[i].packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_target_batch)); 153 | } 154 | 155 | 156 | if (gpu_storage_vec->a[i].query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_batch_offsets)); 157 | if (gpu_storage_vec->a[i].target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_batch_offsets)); 158 | if (gpu_storage_vec->a[i].query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_batch_lens)); 159 | if (gpu_storage_vec->a[i].target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_batch_lens)); 160 | if (gpu_storage_vec->a[i].packed_tb_matrices != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_tb_matrices)); 161 | 162 | if (gpu_storage_vec->a[i].str != NULL)CHECKCUDAERROR(cudaStreamDestroy(gpu_storage_vec->a[i].str)); 163 | } 164 | 165 | 166 | 167 | } 168 | 169 | 170 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec) { 171 | 172 | if(gpu_storage_vec->a != NULL) free(gpu_storage_vec->a); 173 | } 174 | 175 | 176 | 177 | 178 | // Deprecated 179 | void gasal_gpu_mem_alloc(gasal_gpu_storage_t *gpu_storage, int gpu_max_query_batch_bytes, int gpu_max_target_batch_bytes, int gpu_max_n_alns, Parameters *params) { 180 | 181 | cudaError_t err; 182 | // if (gpu_storage->gpu_max_query_batch_bytes % 8) { 183 | // fprintf(stderr, "[GASAL ERROR:] max_query_batch_bytes=%d is not a multiple of 8\n", gpu_storage->gpu_max_query_batch_bytes % 8); 184 | // exit(EXIT_FAILURE); 185 | // } 186 | // if (gpu_storage->gpu_max_target_batch_bytes % 8) { 187 | // fprintf(stderr, "[GASAL ERROR:] max_target_batch_bytes=%d is not a multiple of 8\n", gpu_storage->gpu_max_target_batch_bytes % 8); 188 | // exit(EXIT_FAILURE); 189 | // } 190 | 191 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t))); 192 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t))); 193 | 194 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 195 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 196 | 197 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 198 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 199 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 200 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 201 | 202 | gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy); 203 | 204 | gpu_storage->gpu_max_query_batch_bytes = gpu_max_query_batch_bytes; 205 | 
gpu_storage->gpu_max_target_batch_bytes = gpu_max_target_batch_bytes;
206 |     gpu_storage->gpu_max_n_alns = gpu_max_n_alns;
207 | 
208 | }
209 | 
210 | // Deprecated
211 | void gasal_gpu_mem_free(gasal_gpu_storage_t *gpu_storage, Parameters *params) {
212 | 
213 |     cudaError_t err;
214 | 
215 |     if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch));
216 |     if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch));
217 |     if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch));
218 |     if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch));
219 |     if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets));
220 |     if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets));
221 |     if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens));
222 |     if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens));
223 | 
224 |     gasal_res_destroy_device(gpu_storage->device_res,gpu_storage->device_cpy);
225 | 
226 | }
227 | 
--------------------------------------------------------------------------------
/AGAThA/src/ctors.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CTORS_H__
 2 | #define __CTORS_H__
 3 | 
 4 | 
 5 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams);
 6 | 
 7 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec, int max_query_len, int max_target_len, int32_t maximum_sequence_length, Parameters *params);
 8 | 
 9 | void gasal_gpu_mem_alloc(gasal_gpu_storage_t *gpu_storage, int gpu_max_query_batch_bytes, int gpu_max_target_batch_bytes, int gpu_max_n_alns, Parameters *params);
10 | 
11 | void gasal_gpu_mem_free(gasal_gpu_storage_t *gpu_storage, Parameters *params);
12 | 
13 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params);
14 | 
15 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec);
16 | 
17 | #endif
18 | 
--------------------------------------------------------------------------------
/AGAThA/src/gasal.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GASAL_H__
 2 | #define __GASAL_H__
 3 | 
 4 | 
 5 | #include <stdlib.h>
 6 | #include <stdint.h>
 7 | 
 8 | #include "/usr/local/cuda-11.7/targets/x86_64-linux/include/cuda_runtime.h"
 9 | 
10 | #ifndef HOST_MALLOC_SAFETY_FACTOR
11 | #define HOST_MALLOC_SAFETY_FACTOR 5
12 | #endif
13 | 
14 | #define CHECKCUDAERROR(error) \
15 |     do{\
16 |         err = error;\
17 |         if (cudaSuccess != err ) { \
18 |             fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err, __LINE__, __FILE__); \
19 |             exit(EXIT_FAILURE);\
20 |         }\
21 |     }while(0)\
22 | 
23 | 
24 | inline int CudaCheckKernelLaunch()
25 | {
26 |     cudaError err = cudaGetLastError();
27 |     if ( cudaSuccess != err )
28 |     {
29 |         return -1;
30 |     }
31 | 
32 |     return 0;
33 | }
34 | 
35 | 
36 | enum comp_start{
37 |     WITHOUT_START,
38 |     WITH_START,
39 |     WITH_TB
40 | };
41 | 
42 | // Generic enum for true/false. Using this instead of bool to generalize templates out of Int values for secondBest.
43 | // Can be used more generically, for example for WITH_/WITHOUT_START. 
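// Note: CHECKCUDAERROR (defined above) assigns into a local variable named
// `err`, so every function that uses it must declare one in scope first:
//
//   cudaError_t err;
//   CHECKCUDAERROR(cudaMalloc(&ptr, n_bytes)); // prints a message and exits on failure
//
// ctors.cpp and gasal_align.cu follow exactly this pattern.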
44 | enum Bool{ 45 | FALSE, 46 | TRUE 47 | }; 48 | 49 | enum data_source{ 50 | NONE, 51 | QUERY, 52 | TARGET, 53 | BOTH 54 | }; 55 | 56 | enum algo_type{ 57 | UNKNOWN, 58 | GLOBAL, 59 | SEMI_GLOBAL, 60 | LOCAL, 61 | MICROLOCAL, 62 | BANDED, 63 | KSW 64 | }; 65 | 66 | enum operation_on_seq{ 67 | FORWARD_NATURAL, 68 | REVERSE_NATURAL, 69 | FORWARD_COMPLEMENT, 70 | REVERSE_COMPLEMENT, 71 | }; 72 | 73 | // data structure of linked list to allow extension of memory on host side. 74 | struct host_batch{ 75 | uint8_t *data; 76 | uint32_t page_size; 77 | uint32_t data_size; 78 | uint32_t offset; 79 | int is_locked; 80 | struct host_batch* next; 81 | }; 82 | typedef struct host_batch host_batch_t; 83 | 84 | // Data structure to hold results. Can be instantiated for host or device memory (see res.cpp) 85 | struct gasal_res{ 86 | int32_t *aln_score; 87 | int32_t *query_batch_end; 88 | int32_t *target_batch_end; 89 | int32_t *query_batch_start; 90 | int32_t *target_batch_start; 91 | uint8_t *cigar; 92 | uint32_t *n_cigar_ops; 93 | }; 94 | typedef struct gasal_res gasal_res_t; 95 | 96 | //stream data 97 | typedef struct { 98 | uint8_t *unpacked_query_batch; 99 | uint8_t *unpacked_target_batch; 100 | uint32_t *packed_query_batch; 101 | uint32_t *packed_target_batch; 102 | uint32_t *query_batch_offsets; 103 | uint32_t *target_batch_offsets; 104 | uint32_t *query_batch_lens; 105 | uint32_t *target_batch_lens; 106 | 107 | uint32_t *host_seed_scores; 108 | uint32_t *seed_scores; 109 | 110 | host_batch_t *extensible_host_unpacked_query_batch; 111 | host_batch_t *extensible_host_unpacked_target_batch; 112 | 113 | uint8_t *host_query_op; 114 | uint8_t *host_target_op; 115 | uint8_t *query_op; 116 | uint8_t *target_op; 117 | 118 | uint32_t *host_query_batch_offsets; 119 | uint32_t *host_target_batch_offsets; 120 | uint32_t *host_query_batch_lens; 121 | uint32_t *host_target_batch_lens; 122 | 123 | gasal_res_t *host_res; // the results that can be read on host - THE STRUCT IS ON HOST SIDE, ITS CONTENT IS ON HOST SIDE. 124 | gasal_res_t *device_cpy; // a struct that contains the pointers to the device side - THE STRUCT IS ON HOST SIDE, but the CONTENT is malloc'd on and points to the DEVICE SIDE 125 | gasal_res_t *device_res; // the results that are written on device - THE STRUCT IS ON DEVICE SIDE, ITS CONTENT POINTS TO THE DEVICE SIDE. 126 | 127 | gasal_res_t *host_res_second; 128 | gasal_res_t *device_res_second; 129 | gasal_res_t *device_cpy_second; 130 | 131 | uint32_t gpu_max_query_batch_bytes; 132 | uint32_t gpu_max_target_batch_bytes; 133 | 134 | uint32_t host_max_query_batch_bytes; 135 | uint32_t host_max_target_batch_bytes; 136 | 137 | uint32_t gpu_max_n_alns; 138 | uint32_t host_max_n_alns; 139 | uint32_t current_n_alns; 140 | 141 | uint64_t packed_tb_matrix_size; 142 | uint4 *packed_tb_matrices; 143 | 144 | //for AGAThA 145 | int32_t slice_width; 146 | uint32_t maximum_sequence_length; 147 | short2 *global_buffer; 148 | short2 *host_buffer; 149 | 150 | 151 | cudaStream_t str; 152 | int is_free; 153 | int id; //this can be useful in cases where a gasal_gpu_storage only contains PARTS of an alignment (like a seed-extension...), to gather results. 
154 | 
155 | } gasal_gpu_storage_t;
156 | 
157 | //vector of streams
158 | typedef struct {
159 |     int n;
160 |     gasal_gpu_storage_t *a;
161 | }gasal_gpu_storage_v;
162 | 
163 | 
164 | //match/mismatch and gap penalties
165 | typedef struct{
166 |     int32_t match;
167 |     int32_t mismatch;
168 |     int32_t gap_open;
169 |     int32_t gap_extend;
170 |     int32_t slice_width;
171 |     int32_t z_threshold;
172 |     int32_t band_width;
173 | } gasal_subst_scores;
174 | 
175 | 
176 | #endif
177 | 
--------------------------------------------------------------------------------
/AGAThA/src/gasal_align.cu:
--------------------------------------------------------------------------------
  1 | #include "gasal.h"
  2 | #include "args_parser.h"
  3 | #include "res.h"
  4 | #include "gasal_align.h"
  5 | #include "gasal_kernels.h"
  6 | #include "host_batch.h"
  7 | #include <algorithm>
  8 | #include <cmath>
  9 | 
 10 | inline void agatha_kernel_launcher(int32_t kernel_block_num, int32_t kernel_thread_num, gasal_gpu_storage_t *gpu_storage, int32_t actual_n_alns)
 11 | {
 12 | 
 13 |     /*Sort for Uneven Bucketing*/
 14 |     agatha_sort<<<kernel_block_num, kernel_thread_num, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, actual_n_alns, gpu_storage->maximum_sequence_length, gpu_storage->global_buffer);
 15 |     cudaMemcpyAsync((void*)(gpu_storage->host_buffer), (const void*)(gpu_storage->global_buffer+kernel_block_num*(kernel_thread_num/8)*(gpu_storage->maximum_sequence_length)*3), actual_n_alns * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpu_storage->str);
 16 |     cudaStreamSynchronize(gpu_storage->str);
 17 |     std::sort(gpu_storage->host_buffer, gpu_storage->host_buffer+actual_n_alns, [](short2 a, short2 b){ return a.x < b.x; });
 18 |     cudaMemcpyAsync((void*)(gpu_storage->global_buffer+kernel_block_num*(kernel_thread_num/8)*(gpu_storage->maximum_sequence_length)*3), (const void*)(gpu_storage->host_buffer), actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str);
 19 | 
 20 |     agatha_kernel<<<kernel_block_num, kernel_thread_num, (kernel_thread_num/32)*((32*(8*(gpu_storage->slice_width+1)))+28)*sizeof(int32_t), gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, gpu_storage->device_res_second, gpu_storage->packed_tb_matrices, actual_n_alns, gpu_storage->maximum_sequence_length, gpu_storage->global_buffer);
 21 | 
 22 | 
 23 | }
 24 | 
 25 | 
 26 | //GASAL2 asynchronous (a.k.a non-blocking) alignment function
 27 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params) {
 28 | 
 29 |     int32_t kernel_block_num = params->kernel_block_num;
 30 |     int32_t kernel_thread_num = params->kernel_thread_num;
 31 | 
 32 |     cudaError_t err;
 33 |     if (actual_n_alns <= 0) {
 34 |         fprintf(stderr, "[GASAL ERROR:] actual_n_alns <= 0\n");
 35 |         exit(EXIT_FAILURE);
 36 |     }
 37 |     if (actual_query_batch_bytes <= 0) {
 38 |         fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes <= 0\n");
 39 |         exit(EXIT_FAILURE);
 40 |     }
 41 |     if (actual_target_batch_bytes <= 0) {
 42 |         fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes <= 0\n");
 43 |         exit(EXIT_FAILURE);
 44 |     }
 45 | 
 46 |     if (actual_query_batch_bytes % 8) {
 47 |         fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes=%d is not a multiple of 8\n", actual_query_batch_bytes);
 48 |         exit(EXIT_FAILURE);
 49 |     }
 50 |     if (actual_target_batch_bytes % 8) {
 51 | 
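    // Note on agatha_kernel_launcher above: both kernels use the CUDA
    // <<<grid, block, shared_mem, stream>>> launch form. agatha_sort needs no
    // dynamic shared memory, while agatha_kernel requests
    //   (kernel_thread_num/32) * ((32 * (8*(slice_width+1))) + 28) * sizeof(int32_t)
    // bytes, i.e. per warp: 8*(slice_width+1) int32 cells for each of the 32
    // lanes plus 28 extra int32 words. The matching cudaFuncSetAttribute
    // (MaxDynamicSharedMemorySize) call appears below, before the launch.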
fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes=%d is not a multiple of 8\n", actual_target_batch_bytes); 52 | exit(EXIT_FAILURE); 53 | } 54 | 55 | if (actual_query_batch_bytes > gpu_storage->host_max_query_batch_bytes) { 56 | fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes(%d) > host_max_query_batch_bytes(%d)\n", actual_query_batch_bytes, gpu_storage->host_max_query_batch_bytes); 57 | exit(EXIT_FAILURE); 58 | } 59 | 60 | if (actual_target_batch_bytes > gpu_storage->host_max_target_batch_bytes) { 61 | fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes(%d) > host_max_target_batch_bytes(%d)\n", actual_target_batch_bytes, gpu_storage->host_max_target_batch_bytes); 62 | exit(EXIT_FAILURE); 63 | } 64 | 65 | if (actual_n_alns > gpu_storage->host_max_n_alns) { 66 | fprintf(stderr, "[GASAL ERROR:] actual_n_alns(%d) > host_max_n_alns(%d)\n", actual_n_alns, gpu_storage->host_max_n_alns); 67 | exit(EXIT_FAILURE); 68 | } 69 | 70 | //--------------if pre-allocated memory is less, allocate more-------------------------- 71 | if (gpu_storage->gpu_max_query_batch_bytes < actual_query_batch_bytes) { 72 | 73 | int i = 2; 74 | while ( (gpu_storage->gpu_max_query_batch_bytes * i) < actual_query_batch_bytes) i++; 75 | 76 | //fprintf(stderr, "[GASAL WARNING:] actual_query_batch_bytes(%d) > Allocated GPU memory (gpu_max_query_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_query_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes*i, gpu_storage->gpu_max_query_batch_bytes*i); 77 | 78 | gpu_storage->gpu_max_query_batch_bytes = gpu_storage->gpu_max_query_batch_bytes * i; 79 | 80 | if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch)); 81 | if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch)); 82 | 83 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_storage->gpu_max_query_batch_bytes * sizeof(uint8_t))); 84 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_storage->gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 85 | } 86 | 87 | if (gpu_storage->gpu_max_target_batch_bytes < actual_target_batch_bytes) { 88 | 89 | int i = 2; 90 | while ( (gpu_storage->gpu_max_target_batch_bytes * i) < actual_target_batch_bytes) i++; 91 | 92 | //fprintf(stderr, "[GASAL WARNING:] actual_target_batch_bytes(%d) > Allocated GPU memory (gpu_max_target_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_target_batch_bytes=%d). 
Performance may be lost if this is repeated many times.\n", actual_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes*i, gpu_storage->gpu_max_target_batch_bytes*i); 93 | 94 | gpu_storage->gpu_max_target_batch_bytes = gpu_storage->gpu_max_target_batch_bytes * i; 95 | 96 | if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch)); 97 | if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch)); 98 | 99 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_storage->gpu_max_target_batch_bytes * sizeof(uint8_t))); 100 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_storage->gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 101 | 102 | 103 | } 104 | 105 | if (gpu_storage->gpu_max_n_alns < actual_n_alns) { 106 | 107 | int i = 2; 108 | while ( (gpu_storage->gpu_max_n_alns * i) < actual_n_alns) i++; 109 | 110 | //fprintf(stderr, "[GASAL WARNING:] actual_n_alns(%d) > gpu_max_n_alns(%d). Therefore, allocating memory for %d alignments on GPU (gpu_max_n_alns=%d). Performance may be lost if this is repeated many times.\n", actual_n_alns, gpu_storage->gpu_max_n_alns, gpu_storage->gpu_max_n_alns*i, gpu_storage->gpu_max_n_alns*i); 111 | 112 | gpu_storage->gpu_max_n_alns = gpu_storage->gpu_max_n_alns * i; 113 | 114 | if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets)); 115 | if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets)); 116 | if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens)); 117 | if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens)); 118 | 119 | if (gpu_storage->seed_scores != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->seed_scores)); 120 | 121 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 122 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 123 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 124 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 125 | 126 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->seed_scores), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 127 | 128 | gasal_res_destroy_device(gpu_storage->device_res, gpu_storage->device_cpy); 129 | gpu_storage->device_cpy = gasal_res_new_device_cpy(gpu_storage->gpu_max_n_alns, params); 130 | gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy); 131 | 132 | 133 | } 134 | //------------------------------------------ 135 | 136 | //------------------------launch copying of sequence batches from CPU to GPU--------------------------- 137 | 138 | // here you can track the evolution of your data structure processing with the printer: gasal_host_batch_printall(current); 139 | 140 | host_batch_t *current = gpu_storage->extensible_host_unpacked_query_batch; 141 | while (current != NULL) 142 | { 143 | //gasal_host_batch_printall(current); 144 | CHECKCUDAERROR(cudaMemcpyAsync( &(gpu_storage->unpacked_query_batch[current->offset]), 145 | current->data, 146 | current->data_size, 147 | cudaMemcpyHostToDevice, 148 | gpu_storage->str ) ); 149 | 150 | current = current->next; 151 | } 152 | 153 | current 
= gpu_storage->extensible_host_unpacked_target_batch;
154 | while (current != NULL)
155 | {
156 | CHECKCUDAERROR(cudaMemcpyAsync( &(gpu_storage->unpacked_target_batch[current->offset]),
157 | current->data,
158 | current->data_size,
159 | cudaMemcpyHostToDevice,
160 | gpu_storage->str ) );
161 |
162 | current = current->next;
163 | }
164 |
165 | //-----------------------------------------------------------------------------------------------------------
166 |
167 | int query_batch_tasks_per_thread = (int)ceil((double)actual_query_batch_bytes/(8*kernel_thread_num*kernel_block_num));
168 | int target_batch_tasks_per_thread = (int)ceil((double)actual_target_batch_bytes/(8*kernel_thread_num*kernel_block_num));
169 |
170 |
171 | //-------------------------------------------launch packing kernel
172 |
173 |
174 | if (!(params->isPacked))
175 | {
176 | gasal_pack_kernel<<<kernel_block_num, kernel_thread_num, 0, gpu_storage->str>>>((uint32_t*)(gpu_storage->unpacked_query_batch),
177 | (uint32_t*)(gpu_storage->unpacked_target_batch), gpu_storage->packed_query_batch, gpu_storage->packed_target_batch,
178 | query_batch_tasks_per_thread, target_batch_tasks_per_thread, actual_query_batch_bytes/4, actual_target_batch_bytes/4);
179 | cudaError_t pack_kernel_err = cudaGetLastError();
180 | if ( cudaSuccess != pack_kernel_err )
181 | {
182 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(pack_kernel_err), pack_kernel_err, __LINE__, __FILE__);
183 | exit(EXIT_FAILURE);
184 | }
185 | }
186 |
187 |
188 | // We could reverse-complement before packing, but we would get 2x more read-writes to memory.
189 |
190 | //----------------------launch copying of sequence offsets and lengths from CPU to GPU--------------------------------------
191 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_batch_lens, gpu_storage->host_query_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
192 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_batch_lens, gpu_storage->host_target_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
193 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_batch_offsets, gpu_storage->host_query_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
194 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_batch_offsets, gpu_storage->host_target_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
195 |
196 | //--------------------------------------------------------------------------------------------------------------------------
197 |
198 | //----------------------launch copying of sequence operations (reverse/complement) from CPU to GPU--------------------------
199 | if (params->isReverseComplement)
200 | {
201 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_op, gpu_storage->host_query_op, actual_n_alns * sizeof(uint8_t), cudaMemcpyHostToDevice, gpu_storage->str));
202 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_op, gpu_storage->host_target_op, actual_n_alns * sizeof(uint8_t), cudaMemcpyHostToDevice, gpu_storage->str));
203 | //--------------------------------------launch reverse-complement kernel------------------------------------------------------
204 | gasal_reversecomplement_kernel<<<kernel_block_num, kernel_thread_num, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,
205 | gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->query_op,
gpu_storage->target_op, actual_n_alns);
206 | cudaError_t reversecomplement_kernel_err = cudaGetLastError();
207 | if ( cudaSuccess != reversecomplement_kernel_err )
208 | {
209 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(reversecomplement_kernel_err), reversecomplement_kernel_err, __LINE__, __FILE__);
210 | exit(EXIT_FAILURE);
211 | }
212 |
213 | }
214 |
215 | //--------------------------------------launch alignment kernels--------------------------------------------------------------
216 |
217 | cudaFuncSetAttribute(agatha_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, (kernel_thread_num/32)*((32*(8*(gpu_storage->slice_width+1)))+28)*sizeof(int32_t));
218 |
219 | if (params->print_out) {
220 | float mill = 0;
221 | cudaEvent_t begin, end;
222 | cudaEventCreate(&begin);
223 | cudaEventCreate(&end);
224 | cudaEventRecord(begin);
225 |
226 | agatha_kernel_launcher(params->kernel_block_num, params->kernel_thread_num, gpu_storage, actual_n_alns);
227 |
228 | cudaDeviceSynchronize();
229 | cudaEventRecord(end);
230 | cudaEventSynchronize(end);
231 |
232 | cudaEventElapsedTime(&mill, begin, end);
233 | params->raw_file << mill << std::endl;
234 |
235 | cudaEventDestroy(begin);
236 | cudaEventDestroy(end);
237 | } else {
238 | agatha_kernel_launcher(params->kernel_block_num, params->kernel_thread_num, gpu_storage, actual_n_alns);
239 | }
240 |
241 |
242 |
243 |
244 | //-----------------------------------------------------------------------------------------------------------------------
245 | cudaError_t aln_kernel_err = cudaGetLastError();
246 | if ( cudaSuccess != aln_kernel_err )
247 | {
248 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err, __LINE__, __FILE__);
249 | exit(EXIT_FAILURE);
250 | }
251 |
252 | //------------------------launch the copying of alignment results from GPU to CPU--------------------------------------
253 | if (gpu_storage->host_res->aln_score != NULL && gpu_storage->device_cpy->aln_score != NULL)
254 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->aln_score, gpu_storage->device_cpy->aln_score, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
255 |
256 | if (gpu_storage->host_res->query_batch_start != NULL && gpu_storage->device_cpy->query_batch_start != NULL)
257 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->query_batch_start, gpu_storage->device_cpy->query_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
258 |
259 | if (gpu_storage->host_res->target_batch_start != NULL && gpu_storage->device_cpy->target_batch_start != NULL)
260 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->target_batch_start, gpu_storage->device_cpy->target_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
261 |
262 | if (gpu_storage->host_res->query_batch_end != NULL && gpu_storage->device_cpy->query_batch_end != NULL)
263 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->query_batch_end, gpu_storage->device_cpy->query_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
264 |
265 | if (gpu_storage->host_res->target_batch_end != NULL && gpu_storage->device_cpy->target_batch_end != NULL)
266 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->target_batch_end, gpu_storage->device_cpy->target_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost,
gpu_storage->str));
267 |
268 | //-----------------------------------------------------------------------------------------------------------------------
269 |
270 |
271 |
272 | gpu_storage->is_free = 0; //set the availability of current stream to false
273 | }
274 |
275 |
276 | int gasal_is_aln_async_done(gasal_gpu_storage_t *gpu_storage)
277 | {
278 | cudaError_t err;
279 | if(gpu_storage->is_free == 1) return -2;//if no work is launched in this stream, return -2
280 | err = cudaStreamQuery(gpu_storage->str);//check to see if the stream is finished
281 | if (err != cudaSuccess ) {
282 | if (err == cudaErrorNotReady) return -1;
283 | else{
284 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err, __LINE__, __FILE__);
285 | exit(EXIT_FAILURE);
286 | }
287 | }
288 | gasal_host_batch_reset(gpu_storage);
289 | gpu_storage->is_free = 1;
290 | gpu_storage->current_n_alns = 0;
291 | return 0;
292 | }
293 |
294 |
295 | void gasal_copy_subst_scores(gasal_subst_scores *subst){
296 |
297 | cudaError_t err;
298 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapO, &(subst->gap_open), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
299 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapExtend, &(subst->gap_extend), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
300 | int32_t gapoe = (subst->gap_open + subst->gap_extend);
301 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapOE, &(gapoe), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
302 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaMatchScore, &(subst->match), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
303 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaMismatchScore, &(subst->mismatch), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
304 | // For AGAThA
305 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaSliceWidth, &(subst->slice_width), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
306 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaZThreshold, &(subst->z_threshold), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
307 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaBandWidth, &(subst->band_width), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
308 | return;
309 | }
310 |
311 |
--------------------------------------------------------------------------------
/AGAThA/src/gasal_align.h:
--------------------------------------------------------------------------------
1 | #ifndef __GASAL_ALIGN_H__
2 | #define __GASAL_ALIGN_H__
3 |
4 | void gasal_copy_subst_scores(gasal_subst_scores *subst);
5 |
6 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params);
7 |
8 | void agatha_kernel_launcher(int32_t kernel_block_num, int32_t kernel_thread_num, gasal_gpu_storage_t *gpu_storage, int32_t actual_n_alns);
9 |
10 | int gasal_is_aln_async_done(gasal_gpu_storage_t *gpu_storage);
11 |
12 | #endif
13 |
--------------------------------------------------------------------------------
/AGAThA/src/gasal_header.h:
--------------------------------------------------------------------------------
1 | #ifndef __GASAL_HEADER_H__
2 | #define __GASAL_HEADER_H__
3 |
4 |
5 | #include "gasal.h" // include cstdlib, cstdint
6 | #include "args_parser.h" // include iostream, string, fstream
7 | #include "gasal_align.h"
8 | #include "host_batch.h" // include cstdio, cstring
9 | #include "ctors.h"
10 | #include "interfaces.h"
11 |
12 |
13 |
14 |
15 | #endif
16 |
--------------------------------------------------------------------------------
/AGAThA/src/gasal_kernels.h:
--------------------------------------------------------------------------------
1 | #ifndef __GASAL_KERNELS_H__
2 | #define __GASAL_KERNELS_H__
3 |
4 |
5 | // Template meta-programming type construction from int values.
6 | // This allows kernel code to be cut down at compilation time.
7 |
8 | template <int Val>
9 | struct Int2Type
10 | {
11 | typedef enum {val_ = Val} val__;
12 | };
13 |
14 | template <typename T1, typename T2>
15 | struct SameType
16 | {
17 | enum { result = 0 };
18 | };
19 |
20 | template <typename T>
21 | struct SameType<T, T>
22 | {
23 | enum { result = 1 };
24 | };
25 |
26 | #define SAMETYPE(a, b) (SameType<a, b>::result)
27 |
28 |
29 | __constant__ int32_t _cudaGapO; /*gap open penalty*/
30 | __constant__ int32_t _cudaGapOE; /*sum of gap open and extension penalties*/
31 | __constant__ int32_t _cudaGapExtend; /*gap extension penalty*/
32 | __constant__ int32_t _cudaMatchScore; /*score for a match*/
33 | __constant__ int32_t _cudaMismatchScore; /*penalty for a mismatch*/
34 | __constant__ int32_t _cudaSliceWidth; /*(AGAThA) slice width*/
35 | __constant__ int32_t _cudaZThreshold; /*(AGAThA) zdrop threshold*/
36 | __constant__ int32_t _cudaBandWidth; /*(AGAThA) band width*/
37 |
38 | #define MINUS_INF SHRT_MIN
39 | #define MINUS_INF2 SHRT_MIN/2
40 |
41 | #define N_VALUE (N_CODE & 0xF)
42 |
43 | #ifdef N_PENALTY
44 | #define DEV_GET_SUB_SCORE_LOCAL(score, rbase, gbase) \
45 | score = (rbase == gbase) ? _cudaMatchScore : -_cudaMismatchScore;\
46 | score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? -N_PENALTY : score;\
47 |
48 | #define DEV_GET_SUB_SCORE_GLOBAL(score, rbase, gbase) \
49 | score = (rbase == gbase) ? _cudaMatchScore : -_cudaMismatchScore;\
50 | score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? -N_PENALTY : score;\
51 |
52 | #else
53 | #define DEV_GET_SUB_SCORE_LOCAL(score, rbase, gbase) \
54 | score = (rbase == gbase) ? _cudaMatchScore : -_cudaMismatchScore;\
55 | score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? 0 : score;\
56 |
57 | #define DEV_GET_SUB_SCORE_GLOBAL(score, rbase, gbase) \
58 | score = (rbase == gbase) ? _cudaMatchScore : -_cudaMismatchScore;\
59 |
60 | #endif
61 |
62 | #define MAX(a,b) ((a)>(b)?(a):(b))
63 | #define MIN(a,b) ((a)<(b)?(a):(b))
64 |
65 |
66 | #define FIND_MAX(curr, gidx) \
67 | maxXY_y = (maxHH < curr) ? gidx : maxXY_y;\
68 | maxHH = (maxHH < curr) ? curr : maxHH;
69 |
70 |
71 | // Kernel files
72 |
73 | #include "kernels/pack_rc_seqs.h"
74 |
75 | #include "kernels/agatha_kernel.h"
76 |
77 | #endif
78 |
--------------------------------------------------------------------------------
/AGAThA/src/host_batch.cpp:
--------------------------------------------------------------------------------
1 | #include "gasal.h"
2 | #include "args_parser.h"
3 | #include "interfaces.h"
4 | #include "host_batch.h"
5 |
6 |
7 |
8 |
9 | // Functions for host batch handling.
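// In short: a host batch is a linked list of pinned pages (cudaHostAlloc).
// Filling starts in the first page; when a page cannot hold the next
// sequence, it is locked and a page of twice the size is appended.
// Sequences are padded with N_CODE so each entry spans a multiple of
// 8 bases, matching the 8-bases-per-uint32 packing used on the GPU.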
10 |
11 | host_batch_t *gasal_host_batch_new(uint32_t batch_bytes, uint32_t offset)
12 | {
13 | cudaError_t err;
14 | host_batch_t *res = (host_batch_t *)calloc(1, sizeof(host_batch_t));
15 | CHECKCUDAERROR(cudaHostAlloc(&(res->data), batch_bytes*sizeof(uint8_t), cudaHostAllocDefault));
16 | res->page_size = batch_bytes;
17 | res->data_size = 0;
18 | res->is_locked = 0;
19 | res->offset = offset;
20 | res->next = NULL;
21 | return res;
22 | }
23 |
24 | void gasal_host_batch_destroy(host_batch_t *res)
25 | {
26 | cudaError_t err;
27 | if (res==NULL)
28 | {
29 | fprintf(stderr, "[GASAL ERROR] Trying to free a NULL pointer\n");
30 | exit(1);
31 | }
32 | // recursively destroy the whole linked list
33 | if (res->next != NULL)
34 | gasal_host_batch_destroy(res->next);
35 | if (res->data != NULL)
36 | {
37 | CHECKCUDAERROR(cudaFreeHost(res->data));
38 | }
39 |
40 | free(res);
41 | }
42 |
43 | host_batch_t *gasal_host_batch_getlast(host_batch_t *arg)
44 | {
45 | return (arg->next == NULL ? arg : gasal_host_batch_getlast(arg->next) );
46 |
47 | }
48 |
49 | void gasal_host_batch_reset(gasal_gpu_storage_t *gpu_storage)
50 | {
51 | // reset all batch idx and data occupation
52 | host_batch_t *cur_page = NULL;
53 | for(int i = 0; i < 2; i++) {
54 |
55 | switch(i) {
56 | case 0:
57 | cur_page = (gpu_storage->extensible_host_unpacked_query_batch);
58 | break;
59 | case 1:
60 | cur_page = (gpu_storage->extensible_host_unpacked_target_batch);
61 | break;
62 | default:
63 | break;
64 | }
65 | while(cur_page != NULL)
66 | {
67 | cur_page->data_size = 0;
68 | cur_page->offset = 0;
69 | cur_page->is_locked = 0;
70 | cur_page = cur_page->next;
71 | }
72 | }
73 | //fprintf(stderr, "[GASAL INFO] Batch reset.\n");
74 |
75 | }
76 |
77 |
78 | // TODO: make a template... now that you started to go the C++/template way, just stick to it.
79 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC)
80 | {
81 | // since query and target are very symmetric here, we use pointers to route the data where it has to go,
82 | // while keeping the actual memory management 'source-agnostic'.
83 |
84 | host_batch_t *cur_page = NULL;
85 | uint32_t *p_batch_bytes = NULL;
86 |
87 | switch(SRC) {
88 | case QUERY:
89 | cur_page = (gpu_storage->extensible_host_unpacked_query_batch);
90 | p_batch_bytes = &(gpu_storage->host_max_query_batch_bytes);
91 | break;
92 | case TARGET:
93 | cur_page = (gpu_storage->extensible_host_unpacked_target_batch);
94 | p_batch_bytes = &(gpu_storage->host_max_target_batch_bytes);
95 | break;
96 | default:
97 | break;
98 | }
99 |
100 | int nbr_N = 0;
101 | while((size+nbr_N)%8)
102 | nbr_N++;
103 |
104 | while(cur_page->is_locked)
105 | cur_page = cur_page->next;
106 |
107 | if (cur_page->next == NULL && cur_page->page_size - cur_page->data_size < size + nbr_N)
108 | {
109 | /*
110 | fprintf(stderr,"[GASAL WARNING:] Trying to write %d bytes while only %d remain (%s) (block size %d, filled %d bytes).\n Allocating a new block of size %d, total size available reaches %d. Doing this repeatedly slows down the execution.\n",
111 | size + nbr_N,
112 | cur_page->page_size - cur_page->data_size,
113 | (SRC == QUERY ?
"query":"target"), 114 | cur_page->page_size, 115 | cur_page->data_size, 116 | cur_page->page_size * 2, 117 | *p_batch_bytes + cur_page->page_size * 2); 118 | */ 119 | host_batch_t *res = gasal_host_batch_new(cur_page->page_size * 2, cur_page->offset + cur_page->data_size); 120 | cur_page->next = res; 121 | cur_page->is_locked = 1; 122 | *p_batch_bytes = *p_batch_bytes + cur_page->page_size * 2; 123 | 124 | cur_page = cur_page->next; 125 | //fprintf(stderr, "CREATED: "); gasal_host_batch_print(cur_page); 126 | } 127 | 128 | if (cur_page->next != NULL && cur_page->page_size - cur_page->data_size < size + nbr_N) 129 | { 130 | // re-write offset for the next page to correspond to what has been filled on the current page. 131 | cur_page->next->offset = cur_page->offset + cur_page->data_size; 132 | cur_page->is_locked = 1; 133 | // then, jump to next page 134 | cur_page = cur_page->next; 135 | } 136 | 137 | 138 | if (cur_page->page_size - cur_page->data_size >= size + nbr_N) 139 | { 140 | // fprintf(stderr, "FILL: "); gasal_host_batch_print(cur_page); 141 | memcpy(&(cur_page->data[idx - cur_page->offset]), data, size); 142 | 143 | for(int i = 0; i < nbr_N; i++) 144 | { 145 | cur_page->data[idx + size - cur_page->offset + i] = N_CODE; 146 | } 147 | idx = idx + size + nbr_N; 148 | 149 | cur_page->data_size += size + nbr_N; 150 | //is_done = 1; 151 | } 152 | 153 | return idx; 154 | } 155 | 156 | 157 | uint32_t gasal_host_batch_addbase(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char base, data_source SRC ) 158 | { 159 | return gasal_host_batch_add(gpu_storage, idx, &base, 1, SRC ); 160 | } 161 | 162 | 163 | uint32_t gasal_host_batch_add(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char *data, uint32_t size, data_source SRC ) 164 | { 165 | 166 | // since query and target are very symmetric here, we use pointers to route the data where it has to, 167 | // while keeping the actual memory management 'source-agnostic'. 168 | host_batch_t *cur_page = NULL; 169 | uint32_t *p_batch_bytes = NULL; 170 | 171 | 172 | switch(SRC) { 173 | case QUERY: 174 | cur_page = (gpu_storage->extensible_host_unpacked_query_batch); 175 | p_batch_bytes = &(gpu_storage->host_max_query_batch_bytes); 176 | break; 177 | case TARGET: 178 | cur_page = (gpu_storage->extensible_host_unpacked_target_batch); 179 | p_batch_bytes = &(gpu_storage->host_max_target_batch_bytes); 180 | break; 181 | default: 182 | break; 183 | } 184 | 185 | int is_done = 0; 186 | 187 | while (!is_done) 188 | { 189 | if (*p_batch_bytes >= idx + size && (cur_page->next == NULL || (cur_page->next->offset >= idx + size)) ) 190 | { 191 | 192 | memcpy(&(cur_page->data[idx - cur_page->offset]), data, size); 193 | idx = idx + size; 194 | is_done = 1; 195 | 196 | } else if ((*p_batch_bytes >= idx + size) && (cur_page->next != NULL) && (cur_page->next->offset < idx + size)) { 197 | 198 | cur_page = cur_page->next; 199 | 200 | } else { 201 | /* 202 | fprintf(stderr,"[GASAL WARNING:] Trying to write %d bytes at position %d on host memory (%s) while only %d bytes are available. Therefore, allocating %d bytes more on CPU. Repeating this many times can provoke a degradation of performance.\n", 203 | size, 204 | idx, 205 | (SRC == QUERY ? "query":"target"), 206 | *p_batch_bytes, 207 | *p_batch_bytes * 2); 208 | */ 209 | 210 | *p_batch_bytes += *p_batch_bytes; 211 | 212 | // corner case: if we allocated less than a single sequence length to begin with... it shouldn't be allowed actually, but at least it's caught here. 
213 | while (*p_batch_bytes < size)
214 | *p_batch_bytes += *p_batch_bytes;
215 |
216 | host_batch_t *res = gasal_host_batch_new(*p_batch_bytes, idx);
217 |
218 | cur_page->next = res;
219 |
220 | cur_page = cur_page->next;
221 | }
222 | }
223 | //gasal_host_batch_printall(gasal_host_batch_getlast(cur_page));
224 | return idx;
225 | }
226 |
227 |
228 |
229 | // this printer displays the whole sequence. It is heavy and shouldn't be called when you have more than a couple of sequences.
230 | void gasal_host_batch_print(host_batch_t *res)
231 | {
232 | fprintf(stderr, "[GASAL PRINT] Page data: offset=%d, next_offset=%d, data size=%d, page size=%d\n",
233 | res->offset, (res->next != NULL? res->next->offset : -1), res->data_size, res->page_size);
234 | }
235 |
236 | // this printer allows to see the linked list easily.
237 | void gasal_host_batch_printall(host_batch_t *res)
238 | {
239 | fprintf(stderr, "[GASAL PRINT] Page data: offset=%d, next_offset=%d, data size=%d, page size=%d\n",
240 | res->offset, (res->next != NULL? res->next->offset : -1), res->data_size, res->page_size);
241 | if (res->next != NULL)
242 | {
243 | fprintf(stderr, "+--->");
244 | gasal_host_batch_printall(res->next);
245 | }
246 | }
--------------------------------------------------------------------------------
/AGAThA/src/host_batch.h:
--------------------------------------------------------------------------------
1 | #ifndef __HOST_BACTH_H__
2 | #define __HOST_BACTH_H__
3 |
4 | #include <stdio.h>
5 | #include <stdint.h>
6 | #include <string.h> // useful for memcpy, strlen
7 |
8 | // host data structure methods
9 | host_batch_t *gasal_host_batch_new(uint32_t batch_bytes, uint32_t offset); // constructor
10 | void gasal_host_batch_destroy(host_batch_t *res); // destructor
11 | host_batch_t *gasal_host_batch_getlast(host_batch_t *arg); // get last item of chain
12 | void gasal_host_batch_reset(gasal_gpu_storage_t *gpu_storage); // reset all pages of a storage
13 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC); // fill the data
14 | uint32_t gasal_host_batch_add(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char *data, uint32_t size, data_source SRC );
15 | uint32_t gasal_host_batch_addbase(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char base, data_source SRC );
16 | void gasal_host_batch_print(host_batch_t *res); // printer
17 | void gasal_host_batch_printall(host_batch_t *res); // printer for the whole linked list
18 |
19 |
20 | #endif
21 |
--------------------------------------------------------------------------------
/AGAThA/src/interfaces.cpp:
--------------------------------------------------------------------------------
1 | #include "gasal.h"
2 | #include "args_parser.h"
3 | #include "interfaces.h"
4 | #include "res.h"
5 |
6 |
7 | // Function for general resizing of pinned host buffers.
8 | template <typename T>
9 | T* cudaHostRealloc(void *source, int new_size, int old_size)
10 | {
11 | cudaError_t err;
12 | T* destination = NULL;
13 | if (new_size < old_size)
14 | {
15 | fprintf(stderr, "[GASAL ERROR] cudaHostRealloc: invalid sizes. New size < old size (%d < %d)", new_size, old_size);
16 | exit(EXIT_FAILURE);
17 | }
18 | CHECKCUDAERROR(cudaHostAlloc(&destination, new_size * sizeof(T), cudaHostAllocMapped));
19 | //fprintf(stderr, "\ndest=%p\tsrc=%p", destination, source);
20 | CHECKCUDAERROR(cudaMemcpy(destination, source, old_size * sizeof(T), cudaMemcpyHostToHost));
21 | CHECKCUDAERROR(cudaFreeHost(source));
22 | return destination;
23 | };
24 |
25 | // Realloc new fields when more alignments are added.
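// Note: cudaHostRealloc<T> needs its explicit template argument, since T
// cannot be deduced from the void* source. A representative call
// (illustrative names only):
//   host_lens = cudaHostRealloc<uint32_t>((void*)host_lens, new_max_alns, old_max_alns);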
26 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params)
27 | {
28 | /* // Don't reallocate the extensible batches. They're extensible.
29 | gpu_storage->extensible_host_unpacked_query_batch = gasal_host_batch_new(host_max_query_batch_bytes, 0);
30 | gpu_storage->extensible_host_unpacked_target_batch = gasal_host_batch_new(host_max_target_batch_bytes, 0);
31 | */
32 | /* // don't realloc gpu-sided batches as they will be taken care of before aligning.
33 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t)));
34 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t)));
35 | */
36 |
37 | fprintf(stderr, "[GASAL WARNING] Resizing gpu_storage from %d sequences to %d sequences... ", gpu_storage->host_max_n_alns, new_max_alns);
38 | // don't care about realloc'ing gpu-sided fields as they will be taken care of before aligning.
39 |
40 | gpu_storage->host_query_op = cudaHostRealloc<uint8_t>((void*) gpu_storage->host_query_op, new_max_alns, gpu_storage->host_max_n_alns);
41 | gpu_storage->host_target_op = cudaHostRealloc<uint8_t>((void*) gpu_storage->host_target_op, new_max_alns, gpu_storage->host_max_n_alns);
42 |
43 | gpu_storage->host_query_batch_lens = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_query_batch_lens, new_max_alns, gpu_storage->host_max_n_alns);
44 | gpu_storage->host_target_batch_lens = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_target_batch_lens, new_max_alns, gpu_storage->host_max_n_alns);
45 | //fprintf(stderr, "_lens done ");
46 |
47 | gpu_storage->host_query_batch_offsets = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_query_batch_offsets, new_max_alns, gpu_storage->host_max_n_alns);
48 | gpu_storage->host_target_batch_offsets = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_target_batch_offsets, new_max_alns, gpu_storage->host_max_n_alns);
49 | //fprintf(stderr, "_offsets done ");
50 |
51 | gasal_res_destroy_host(gpu_storage->host_res);
52 | gpu_storage->host_res = gasal_res_new_host(new_max_alns, params);
53 | gpu_storage->device_cpy = gasal_res_new_device_cpy(new_max_alns, params);
54 | gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy);
55 |
56 | gpu_storage->host_res_second = NULL;
57 | gpu_storage->device_cpy_second = NULL;
58 | gpu_storage->device_res_second = NULL;
59 |
60 |
61 | //fprintf(stderr, "_res done ");
62 |
63 | gpu_storage->host_max_n_alns = new_max_alns;
64 | //gpu_storage->gpu_max_n_alns = gpu_max_n_alns;
65 | fprintf(stderr, " done. This can harm performance.\n");
66 | }
67 |
68 | // operation (Reverse/complement) filler.
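// Each op byte encodes one sequence's orientation: 0 = forward/natural,
// bit 0 (0x01) = reverse, bit 1 (0x02) = complement, so 0x03 = reverse
// complement (the bits are decoded in kernels/pack_rc_seqs.h).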
69 | void gasal_op_fill(gasal_gpu_storage_t *gpu_storage_t, uint8_t *data, uint32_t nbr_seqs_in_stream, data_source SRC)
70 | {
71 | uint8_t *host_op = NULL;
72 | switch(SRC)
73 | {
74 | case QUERY:
75 | host_op = (gpu_storage_t->host_query_op);
76 | break;
77 | case TARGET:
78 | host_op = (gpu_storage_t->host_target_op);
79 | break;
80 | default:
81 | break;
82 | }
83 | memcpy(host_op, data, nbr_seqs_in_stream);
84 | }
85 |
86 | void gasal_set_device(int gpu_select, bool isPrintingProp)
87 | {
88 | /*
89 | Select GPU
90 | */
91 | if (isPrintingProp)
92 | {
93 | int num_devices, device;
94 | cudaGetDeviceCount(&num_devices);
95 | fprintf(stderr, "Found %d GPUs\n", num_devices);
96 | if (gpu_select > num_devices-1)
97 | {
98 | fprintf(stderr, "Error: can't select device %d when only %d devices are available (range from 0 to %d)\n", gpu_select, num_devices, num_devices-1);
99 | exit(EXIT_FAILURE);
100 | }
101 | if (num_devices > 0) {
102 | cudaDeviceProp properties;
103 | for (device = 0; device < num_devices; device++) {
104 | cudaGetDeviceProperties(&properties, device);
105 | fprintf(stderr, "\tGPU %d: %s\n", device, properties.name);
106 | }
107 | cudaGetDeviceProperties(&properties, gpu_select);
108 | fprintf(stderr, "Selected device %d : %s\n", gpu_select, properties.name);
109 | cudaSetDevice(gpu_select);
110 | }
111 | } else {
112 | // silently select device
113 | cudaSetDevice(gpu_select);
114 | }
115 |
116 | }
--------------------------------------------------------------------------------
/AGAThA/src/interfaces.h:
--------------------------------------------------------------------------------
1 | #ifndef __GASAL_INTERFACES_H__
2 | #define __GASAL_INTERFACES_H__
3 |
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <stdint.h>
7 |
8 | // Resizer for the whole gpu_storage in terms of number of sequences
9 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params);
10 |
11 | // operation filler method (fills the op fields of gasal_gpu_storage_t)
12 | void gasal_op_fill(gasal_gpu_storage_t *gpu_storage_t, uint8_t *data, uint32_t nbr_seqs_in_stream, data_source SRC);
13 |
14 | void gasal_set_device(int gpu_select = 0, bool isPrintingProp = true);
15 | #endif
16 |
--------------------------------------------------------------------------------
/AGAThA/src/kernels/agatha_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef __AGATHA_KERNEL__
2 | #define __AGATHA_KERNEL__
3 |
4 |
5 | // This old core provides the same result as the current LOCAL core, but lacks some optimizations. Left for historical / comparative purposes.
6 | // Deprecated code from GASAL2 (left as reference)
7 | #define CORE_LOCAL_DEPRECATED_COMPUTE() \
8 | uint32_t rbase = (packed_ref_literal >> l) & 15;/*get a base from target_batch sequence */ \
9 | DEV_GET_SUB_SCORE_LOCAL(temp_score, qbase, rbase);/* check equality of qbase and rbase */ \
10 | f[m] = max(h[m]- _cudaGapOE, f[m] - _cudaGapExtend);/* whether to introduce or extend a gap in query_batch sequence */ \
11 | h[m] = p[m] + temp_score; /*score if qbase is aligned to rbase*/ \
12 | h[m] = max(h[m], f[m]); \
13 | h[m] = max(h[m], 0); \
14 | e = max(h[m - 1] - _cudaGapOE, e - _cudaGapExtend);/*whether to introduce or extend a gap in target_batch sequence */\
15 | h[m] = max(h[m], e); \
16 | max_ref_idx = (max_score < h[m]) ? ref_idx + (m-1) : max_ref_idx; \
17 | max_score = (max_score < h[m]) ?
h[m] : max_score; \ 18 | p[m] = h[m-1]; 19 | 20 | #define CORE_COMPUTE() \ 21 | uint32_t rbase = (packed_ref_literal >> l) & 15;\ 22 | DEV_GET_SUB_SCORE_GLOBAL(temp_score, qbase, rbase) \ 23 | temp_score += p[m]; \ 24 | h[m] = max(temp_score, f[m]); \ 25 | h[m] = max(h[m], e); \ 26 | f[m] = max(temp_score- _cudaGapOE, f[m] - _cudaGapExtend); \ 27 | e = max(temp_score- _cudaGapOE, e - _cudaGapExtend); \ 28 | p[m] = h[m-1]; \ 29 | diag_idx = ((ref_idx + m-1+query_idx)&(total_shm-1))<<5;\ 30 | antidiag_max[real_warp_id+diag_idx] = max(antidiag_max[real_warp_id+diag_idx], (h[m]<<16) +ref_idx+ m-1);\ 31 | 32 | #define CORE_COMPUTE_BOUNDARY() \ 33 | if (query_idx + _cudaBandWidth < ref_idx + m-1 || query_idx - _cudaBandWidth > ref_idx + m-1) { \ 34 | p[m] = h[m-1]; \ 35 | } else { \ 36 | uint32_t rbase = (packed_ref_literal >> l) & 15;\ 37 | DEV_GET_SUB_SCORE_GLOBAL(temp_score, qbase, rbase) \ 38 | temp_score += p[m]; \ 39 | h[m] = max(temp_score, f[m]); \ 40 | h[m] = max(h[m], e); \ 41 | f[m] = max(temp_score- _cudaGapOE, f[m] - _cudaGapExtend); \ 42 | e = max(temp_score- _cudaGapOE, e - _cudaGapExtend); \ 43 | p[m] = h[m-1]; \ 44 | diag_idx = ((ref_idx + m-1+query_idx)&(total_shm-1))<<5;\ 45 | antidiag_max[real_warp_id+diag_idx] = max(antidiag_max[real_warp_id+diag_idx], (h[m]<<16) +ref_idx+ m-1);\ 46 | } 47 | 48 | 49 | __global__ void agatha_kernel(uint32_t *packed_query_batch, uint32_t *packed_ref_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, gasal_res_t *device_res, gasal_res_t *device_res_second, uint4 *packed_tb_matrices, int n_tasks, uint32_t max_query_len, short2 *global_buffer_top) 50 | { 51 | /*Initial kernel setup*/ 52 | 53 | // Initializing variables 54 | int32_t i, k, m, l, y, e; 55 | int32_t ub_idx, job_idx, ref_idx, query_idx; 56 | short2 HD; 57 | int32_t temp_score; 58 | int slice_start, slice_end, finished_blocks, chunk_start, chunk_end; 59 | int packed_ref_idx, packed_query_idx; 60 | int total_anti_diags; 61 | register uint32_t packed_ref_literal, packed_query_literal; 62 | bool active, terminated; 63 | int32_t packed_ref_batch_idx, packed_query_batch_idx, query_len, ref_len, packed_query_len, packed_ref_len; 64 | int diag_idx, temp, last_diag; 65 | 66 | // Initializing max score and its idx 67 | int32_t max_score = 0; 68 | int32_t max_ref_idx = 0; 69 | int32_t prev_max_score = 0; 70 | int32_t max_query_idx = 0; 71 | 72 | // Setting constant values 73 | const short2 initHD = make_short2(MINUS_INF2, MINUS_INF2); //used to initialize short2 74 | const int32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; //thread ID within the entire kernel 75 | const int packed_len = 8; //number of bps (literals) packed into a single int32 76 | const int const_warp_len = 8; //number of threads per subwarp (before subwarp rejoining occurs) 77 | const int real_warp_id = threadIdx.x % 32; //thread ID within a single (full 32-thread) warp 78 | const int warp_per_kernel = (gridDim.x * blockDim.x) / const_warp_len; // number of subwarps. assume number of threads % const_warp_len == 0 79 | const int job_per_warp = n_tasks % warp_per_kernel ? (n_tasks / warp_per_kernel + 1) : n_tasks / warp_per_kernel; //number of jobs (alignments/tasks) needed to be done by a single subwarp 80 | const int job_per_query = max_query_len % const_warp_len ? 
(max_query_len / const_warp_len + 1) : max_query_len / const_warp_len; //number of a literal's initial score to fill per thread 81 | const int job_start_idx = (tid / const_warp_len)*job_per_warp; // the boundary of jobs of a subwarp 82 | const int job_end_idx = (job_start_idx + job_per_warp) < n_tasks ? (job_start_idx + job_per_warp) : n_tasks; // the boundary of jobs of a subwarp 83 | const int total_shm = packed_len*(_cudaSliceWidth+1); // amount of shared memory a single thread uses 84 | 85 | // Arrays for saving intermediate values 86 | int32_t h[9]; 87 | int32_t f[9]; 88 | int32_t p[9]; 89 | 90 | // Global memory setup 91 | short2* global_buffer_left = (short2*)(global_buffer_top+max_query_len*(blockDim.x/8)*gridDim.x); 92 | int32_t* global_buffer_topleft= (int32_t*)(global_buffer_left+max_query_len*(blockDim.x/8)*gridDim.x); 93 | short2* global_ub_idx = (short2*)(global_buffer_top+max_query_len*(blockDim.x/8)*gridDim.x*3); 94 | 95 | // Shared memory setup 96 | extern __shared__ int32_t shared_maxHH[]; 97 | int32_t* antidiag_max = (int32_t*)(shared_maxHH+(threadIdx.x/32)*total_shm*32); 98 | int32_t* shared_job = shared_maxHH+(blockDim.x/32)*total_shm*32+(threadIdx.x/32)*28; 99 | 100 | /* Setup values that will change after Subwarp Rejoining */ 101 | int warp_len = const_warp_len; 102 | int warp_id = threadIdx.x % warp_len; // id of a thread in a subwarp 103 | int warp_num = tid / warp_len; 104 | // mask that is true for threads in the same subwarp 105 | unsigned same_threads = __match_any_sync(0xffffffff, warp_num); 106 | if (warp_id==0) shared_job[(warp_num&3)] = -1; 107 | 108 | /* Iterating over jobs/alignments */ 109 | for (job_idx = job_start_idx; job_idx < job_end_idx; job_idx++) { 110 | 111 | /*Uneven Bucketing*/ 112 | // the first subwarp fetches a long sequence's idx, while the remaining subwarps fetch short sequences' idx 113 | ub_idx = ((job_idx&3)==0)? global_ub_idx[n_tasks-(job_idx>>2)-1].y: global_ub_idx[job_idx-(job_idx>>2)-1].y; 114 | 115 | // get target and query sequence information 116 | packed_ref_batch_idx = target_batch_offsets[ub_idx] >> 3; //starting index of the target_batch sequence 117 | packed_query_batch_idx = query_batch_offsets[ub_idx] >> 3;//starting index of the query_batch sequence 118 | query_len = query_batch_lens[ub_idx]; // query sequence length 119 | ref_len = target_batch_lens[ub_idx]; // reference sequence length 120 | packed_query_len = (query_len >> 3) + (query_len & 7 ? 1 : 0);//number of 32-bit words holding query_batch sequence 121 | packed_ref_len = (ref_len >> 3) + (ref_len & 7 ? 1 : 0);//number of 32-bit words holding target_batch sequence 122 | 123 | /*Buffer Initialization*/ 124 | // fill global buffer with initial value 125 | // global_buffer_top: used to store intermediate scores H and E in the horizontal strip (scores from the top) 126 | for (i = 0; i < job_per_query; i++) { 127 | l = i*warp_len + warp_id; 128 | if ((l) < max_query_len) { 129 | k = -(_cudaGapOE + (_cudaGapExtend*(l))); 130 | global_buffer_top[warp_num*max_query_len + l] = l <= _cudaBandWidth? make_short2(k, k-_cudaGapOE):initHD; 131 | } 132 | } 133 | // global_buffer_left: used to store intermediate scores H and F in the vertical strip (scores from the left) 134 | for (i = 0; i < job_per_query; i++) { 135 | l = i*warp_len + warp_id; 136 | if ((l) < max_query_len) { 137 | k = -(_cudaGapOE + (_cudaGapExtend*(l))); 138 | global_buffer_left[warp_num*max_query_len + l] = l <= _cudaBandWidth? 
make_short2(k, k-_cudaGapOE):initHD; 139 | } 140 | } 141 | // global_buffer_topleft: used to store intermediate scores H in the diagonal strip (scores from the top-left) 142 | for (i = 0; i < job_per_query; i++) { 143 | l = i*warp_len + warp_id; 144 | if (l < max_query_len) { 145 | k = -(_cudaGapOE+(_cudaGapExtend*(l*packed_len-1))); 146 | global_buffer_topleft[warp_num*max_query_len + l] = l==0? 0: (l*packed_len-1) <= _cudaBandWidth? k: MINUS_INF2; 147 | } 148 | } 149 | 150 | // fill shared memory with initial value 151 | for (m = 0; m < total_shm; m++) { 152 | antidiag_max[real_warp_id + m*32] = INT_MIN; 153 | } 154 | 155 | __syncwarp(); 156 | 157 | // Initialize variables 158 | max_score = 0; 159 | prev_max_score = 0; 160 | max_ref_idx = 0; 161 | max_query_idx = 0; 162 | terminated = false; 163 | 164 | i = 0; //chunk 165 | total_anti_diags = packed_ref_len + packed_query_len-1; //chunk 166 | 167 | /*Subwarp Rejoining*/ 168 | //set shared memory that is used to maintain values for subwarp rejoining 169 | if (warp_id==0) shared_job[(warp_num&3)] = total_anti_diags; 170 | else if (warp_id==1) shared_job[4+(warp_num&3)] = packed_ref_batch_idx; 171 | else if (warp_id==2) shared_job[8+(warp_num&3)] = packed_query_batch_idx; 172 | else if (warp_id==3) shared_job[12+(warp_num&3)] = (ref_len<<16)+query_len; 173 | else if (warp_id==4) shared_job[16+(warp_num&3)] = ub_idx; 174 | 175 | same_threads = __match_any_sync(__activemask(), warp_num); 176 | 177 | __syncwarp(); 178 | 179 | /*Main Alignment Loop*/ 180 | while (i < total_anti_diags) { 181 | 182 | // set boundaries for current slice 183 | slice_start = max(0, (i-packed_query_len+1)); 184 | slice_start = max(slice_start, (i*packed_len + packed_len-1+1 - _cudaBandWidth)/2/packed_len); 185 | slice_end = min(packed_ref_len-1, i+_cudaSliceWidth-1); 186 | slice_end = min(slice_end, ((i+_cudaSliceWidth-1)*packed_len + packed_len-1 + _cudaBandWidth)/2/packed_len); 187 | finished_blocks = slice_start; 188 | 189 | if (slice_start > slice_end) { 190 | terminated = true; 191 | } 192 | 193 | while (!terminated && finished_blocks <= slice_end) { 194 | // while the entire chunk diag is not finished 195 | packed_ref_idx = finished_blocks + warp_id; 196 | packed_query_idx = i - packed_ref_idx; 197 | active = (packed_ref_idx <= slice_end); //whether the current thread has cells to fill or not 198 | 199 | if (active) { 200 | ref_idx = packed_ref_idx << 3; 201 | query_idx = packed_query_idx << 3; 202 | 203 | // load intermediate values from global buffers 204 | p[1] = global_buffer_topleft[warp_num*max_query_len + packed_ref_idx]; 205 | 206 | for (m = 1; m < 9; m++) { 207 | if ( (ref_idx + m-1) < ref_len) { 208 | HD = global_buffer_left[warp_num*max_query_len + ref_idx + m-1]; 209 | h[m] = HD.x; 210 | f[m] = HD.y; 211 | } else { 212 | // if index out of bound of the score table 213 | h[m] = MINUS_INF2; 214 | f[m] = MINUS_INF2; 215 | } 216 | 217 | } 218 | 219 | for (m=2;m<9;m++) { 220 | p[m] = h[m-1]; 221 | } 222 | 223 | // Set boundaries for the current chunk 224 | chunk_start = (max(0, (packed_ref_idx*packed_len - _cudaBandWidth)))/packed_len; 225 | chunk_end = min( packed_query_len-1, ( (packed_ref_idx*packed_len + packed_len -1 + _cudaBandWidth )) /packed_len ); 226 | packed_ref_literal = packed_ref_batch[packed_ref_batch_idx + packed_ref_idx]; 227 | } 228 | 229 | // Compute the current chunk 230 | for (y = 0; y < _cudaSliceWidth; y++) { 231 | if (active && chunk_start <= packed_query_idx && packed_query_idx <= chunk_end) { 232 | 233 | packed_query_literal = 
packed_query_batch[packed_query_batch_idx + packed_query_idx];
234 | query_idx = packed_query_idx << 3;
235 |
236 | for (k = 28; k >= 0 && query_idx < query_len; k -= 4) {
237 | uint32_t qbase = (packed_query_literal >> k) & 15; //get a base from query_batch sequence
238 | // load intermediate values from global buffers
239 | HD = global_buffer_top[warp_num*max_query_len + query_idx];
240 | h[0] = HD.x;
241 | e = HD.y;
242 |
243 | if (packed_query_idx == chunk_start || packed_query_idx == chunk_end) {
244 | #pragma unroll 8
245 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
246 | CORE_COMPUTE_BOUNDARY();
247 | }
248 | } else {
249 | #pragma unroll 8
250 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
251 | CORE_COMPUTE();
252 | }
253 | }
254 |
255 | // write intermediate values to global buffers
256 | HD.x = h[m-1];
257 | HD.y = e;
258 | global_buffer_top[warp_num*max_query_len + query_idx] = HD;
259 |
260 | query_idx++;
261 |
262 | }
263 |
264 | }
265 |
266 |
267 | packed_query_idx++;
268 |
269 | }
270 |
271 | // write intermediate values to global buffers
272 | if (active) {
273 | for (m = 1; m < 9; m++) {
274 | if ( ref_idx + m-1 < ref_len) {
275 | HD.x = h[m];
276 | HD.y = f[m];
277 | global_buffer_left[warp_num*max_query_len + ref_idx + m-1] = HD;
278 | }
279 | }
280 | global_buffer_topleft[warp_num*max_query_len + packed_ref_idx] = p[1];
281 | }
282 |
283 | finished_blocks+=warp_len;
284 | }
285 |
286 | __syncwarp();
287 |
288 | last_diag = (i+_cudaSliceWidth)<<3;
289 | prev_max_score = query_len+ref_len-1;
290 |
291 | /* Termination Condition & Score Update */
292 | if (!terminated) {
293 | for (diag_idx = i<<3; diag_idx < last_diag; diag_idx++) {
294 | if (diag_idx < prev_max_score) {
295 | m = diag_idx&(total_shm-1);
296 | temp = __reduce_max_sync(same_threads, antidiag_max[(m<<5)+real_warp_id]);
297 | if ((temp>>16) > max_score) {
298 | max_score = temp>>16;
299 | max_ref_idx = (temp&65535);
300 | max_query_idx = diag_idx-max_ref_idx;
301 | } else if ( (temp&65535) >= max_ref_idx && (diag_idx-(temp&65535)) >= max_query_idx) {
302 | int tl = (temp&65535) - max_ref_idx, ql = (diag_idx-(temp&65535)) - max_query_idx, l;
303 | l = tl > ql?
tl - ql : ql - tl;
304 | if (_cudaZThreshold >= 0 && max_score - (temp>>16) > _cudaZThreshold + l*_cudaGapExtend) {
305 | // Termination condition is met
306 | terminated = true;
307 | break;
308 | }
309 | }
310 | // reset shared memory buffer for next slice
311 | antidiag_max[(m<<5)+real_warp_id]=INT_MIN;
312 | }
313 | }
314 | }
315 |
316 | __syncwarp();
317 |
318 | // If job is finished
319 | if (terminated) {
320 | total_anti_diags = i; // set the total amount of diagonals as the current diagonal (to indicate that the job has finished)
321 | if (warp_id==0) shared_job[(warp_num&3)] = total_anti_diags; //update this to shared memory as well (this will be used in Subwarp Rejoining as an indicator that the subwarp's job is done)
322 | }
323 |
324 | // Update the max score and its index to shared memory (used in Subwarp Rejoining)
325 | if (warp_id==1) shared_job[20+(warp_num&3)] = max_score;
326 | else if (warp_id==2) shared_job[24+(warp_num&3)] = (max_ref_idx<<16) + max_query_idx;
327 |
328 | __syncwarp();
329 |
330 | i += _cudaSliceWidth;
331 |
332 | /*Job wrap-up*/
333 | // If the job is done (either due to (1) meeting the termination condition or (2) all the diagonals having been computed)
334 | if (i >= total_anti_diags) {
335 |
336 | // In the case of (2), check the termination condition & score update for the last diagonal block
337 | if (!terminated) {
338 | diag_idx = (i*packed_len)&(total_shm-1);
339 | for (k = i*packed_len, m = diag_idx; m < diag_idx+packed_len; m++, k++) {
340 | temp = __reduce_max_sync(same_threads, antidiag_max[(m<<5)+real_warp_id]);
341 | if ((temp>>16) > max_score) {
342 | max_score = temp>>16;
343 | max_ref_idx = (temp&65535);
344 | max_query_idx = k-max_ref_idx;
345 | } else if ( (temp&65535) >= max_ref_idx && (k-(temp&65535)) >= max_query_idx) {
346 | int tl = (temp&65535) - max_ref_idx, ql = (k-(temp&65535)) - max_query_idx, l;
347 | l = tl > ql? tl - ql : ql - tl;
348 | if (_cudaZThreshold >= 0 && max_score - (temp>>16) > _cudaZThreshold + l*_cudaGapExtend) {
349 | // Termination condition is met
350 | terminated = true;
351 | break;
352 | }
353 | }
354 | antidiag_max[(m<<5)+real_warp_id]=INT_MIN;
355 | }
356 | }
357 |
358 | // Spill the results to GPU memory to be later moved to the CPU
359 | if (warp_id==0) {
360 | device_res->aln_score[ub_idx] = max_score;//copy the max score to the output array in the GPU mem
361 | device_res->query_batch_end[ub_idx] = max_query_idx;//copy the end position on query_batch sequence to the output array in the GPU mem
362 | device_res->target_batch_end[ub_idx] = max_ref_idx;//copy the end position on target_batch sequence to the output array in the GPU mem
363 | }
364 |
365 | /*Subwarp Rejoining*/
366 | // The subwarp that has no job looks for new jobs by iterating over other subwarps' jobs
367 | for (m = 0; m < (32/const_warp_len); m++) {
368 | // if the selected job still has remaining diagonals
369 | if (shared_job[m] > i) { // possible because all subwarps sync after each diagonal block is finished
370 | // read the selected job's info
371 | total_anti_diags = shared_job[m];
372 | warp_num = ((warp_num>>2)<<2)+m;
373 | ub_idx = shared_job[16+m];
374 |
375 | packed_ref_batch_idx = shared_job[4+m];
376 | packed_query_batch_idx = shared_job[8+m];
377 | ref_len = shared_job[12+m];
378 | query_len = ref_len&65535;
379 | ref_len = ref_len>>16;
380 | packed_query_len = (query_len >> 3) + (query_len & 7 ? 1 : 0);
381 | packed_ref_len = (ref_len >> 3) + (ref_len & 7 ?
1 : 0); 382 | 383 | max_score = shared_job[20+m]; 384 | max_ref_idx = shared_job[24+m]; 385 | max_query_idx = max_ref_idx&65535; 386 | max_ref_idx = max_ref_idx>>16; 387 | 388 | // reset the flag 389 | terminated = false; 390 | 391 | // reset shared memory buffer 392 | for (m = 0; m < total_shm; m++) { 393 | antidiag_max[(m<<5)+real_warp_id]=INT_MIN; 394 | } 395 | 396 | break; 397 | } 398 | } 399 | 400 | } 401 | 402 | __syncwarp(); 403 | 404 | /*Subwarp Rejoining*/ 405 | //Set the mask, warp length and thread id within the warp 406 | same_threads = __match_any_sync(__activemask(), warp_num); 407 | warp_len = __popc(same_threads); 408 | warp_id = __popc((((0xffffffff) << (threadIdx.x % 32))&same_threads))-1; 409 | 410 | __syncwarp(); 411 | 412 | } 413 | 414 | __syncwarp(); 415 | /*Subwarp Rejoining*/ 416 | //Reset subwarp and job related values for the next iteration 417 | warp_len = const_warp_len; 418 | warp_num = tid / warp_len; 419 | warp_id = tid % const_warp_len; 420 | ub_idx = shared_job[16+(warp_num&3)]; 421 | 422 | __syncwarp(); 423 | 424 | 425 | 426 | } 427 | 428 | return; 429 | 430 | 431 | } 432 | 433 | 434 | __global__ void agatha_sort(uint32_t *packed_query_batch, uint32_t *packed_ref_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, int n_tasks, uint32_t max_query_len, short2 *global_buffer_top) 435 | { 436 | 437 | const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 438 | 439 | uint32_t query_len, ref_len, packed_query_len, packed_ref_len; 440 | 441 | short2* global_ub_idx = (short2*)(global_buffer_top+max_query_len*(blockDim.x/8)*gridDim.x*3); 442 | 443 | if (tid < n_tasks) { 444 | 445 | query_len = query_batch_lens[tid]; 446 | ref_len = target_batch_lens[tid]; 447 | packed_query_len = (query_len >> 3) + (query_len & 7 ? 1 : 0);//number of 32-bit words holding query_batch sequence 448 | packed_ref_len = (ref_len >> 3) + (ref_len & 7 ? 
1 : 0); 449 | 450 | global_ub_idx[tid] = make_short2((packed_ref_len + packed_query_len-1), tid); 451 | 452 | 453 | } 454 | 455 | return; 456 | 457 | 458 | } 459 | #endif 460 | -------------------------------------------------------------------------------- /AGAThA/src/kernels/pack_rc_seqs.h: -------------------------------------------------------------------------------- 1 | #ifndef __KERNEL_SEQPAK__ 2 | #define __KERNEL_SEQPAK__ 3 | 4 | 5 | #define A_PAK ('A'&0x0F) 6 | #define C_PAK ('C'&0x0F) 7 | #define G_PAK ('G'&0x0F) 8 | #define T_PAK ('T'&0x0F) 9 | //#define N_PAK ('N'&0x0F) 10 | 11 | 12 | 13 | __global__ void gasal_pack_kernel(uint32_t* unpacked_query_batch, uint32_t* unpacked_target_batch, uint32_t *packed_query_batch, uint32_t* packed_target_batch, int query_batch_tasks_per_thread, int target_batch_tasks_per_thread, uint32_t total_query_batch_regs, uint32_t total_target_batch_regs) \ 14 | { 15 | 16 | int32_t i; 17 | const int32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 18 | uint32_t n_threads = gridDim.x * blockDim.x; 19 | for (i = 0; i < query_batch_tasks_per_thread && (((i*n_threads)<<1) + (tid<<1) < total_query_batch_regs); ++i) { 20 | uint32_t *query_addr = &(unpacked_query_batch[(i*n_threads)<<1]); 21 | uint32_t reg1 = query_addr[(tid << 1)]; //load 4 bases of the query sequence from global memory 22 | uint32_t reg2 = query_addr[(tid << 1) + 1]; //load another 4 bases 23 | uint32_t packed_reg = 0; 24 | packed_reg |= (reg1 & 15) << 28; // --- 25 | packed_reg |= ((reg1 >> 8) & 15) << 24; // | 26 | packed_reg |= ((reg1 >> 16) & 15) << 20;// | 27 | packed_reg |= ((reg1 >> 24) & 15) << 16;// | 28 | packed_reg |= (reg2 & 15) << 12; // > pack sequence 29 | packed_reg |= ((reg2 >> 8) & 15) << 8; // | 30 | packed_reg |= ((reg2 >> 16) & 15) << 4; // | 31 | packed_reg |= ((reg2 >> 24) & 15); //---- 32 | uint32_t *packed_query_addr = &(packed_query_batch[i*n_threads]); 33 | packed_query_addr[tid] = packed_reg; //write 8 bases of packed query sequence to global memory 34 | } 35 | 36 | for (i = 0; i < target_batch_tasks_per_thread && (((i*n_threads)<<1) + (tid<<1)) < total_target_batch_regs; ++i) { 37 | uint32_t *target_addr = &(unpacked_target_batch[(i * n_threads)<<1]); 38 | uint32_t reg1 = target_addr[(tid << 1)]; //load 4 bases of the target sequence from global memory 39 | uint32_t reg2 = target_addr[(tid << 1) + 1]; //load another 4 bases 40 | uint32_t packed_reg = 0; 41 | packed_reg |= (reg1 & 15) << 28; // --- 42 | packed_reg |= ((reg1 >> 8) & 15) << 24; // | 43 | packed_reg |= ((reg1 >> 16) & 15) << 20;// | 44 | packed_reg |= ((reg1 >> 24) & 15) << 16;// | 45 | packed_reg |= (reg2 & 15) << 12; // > pack sequence 46 | packed_reg |= ((reg2 >> 8) & 15) << 8; // | 47 | packed_reg |= ((reg2 >> 16) & 15) << 4; // | 48 | packed_reg |= ((reg2 >> 24) & 15); //---- 49 | uint32_t *packed_target_addr = &(packed_target_batch[i * n_threads]); 50 | packed_target_addr[tid] = packed_reg; //write 8 bases of packed target sequence to global memory 51 | } 52 | 53 | } 54 | 55 | 56 | __global__ void gasal_reversecomplement_kernel(uint32_t *packed_query_batch,uint32_t *packed_target_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, uint8_t *query_op, uint8_t *target_op, uint32_t n_tasks) 57 | { 58 | 59 | const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 60 | 61 | if (tid >= n_tasks) return; 62 | if (query_op[tid] == 0 && target_op[tid] == 0) return; // if there's nothing to do (op=0, 
meaning the sequence is Forward Natural), just exit the kernel ASAP.
63 |
64 |
65 | uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3;//starting index of the target_batch sequence
66 | uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence
67 | uint32_t read_len = query_batch_lens[tid];
68 | uint32_t ref_len = target_batch_lens[tid];
69 | uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding sequence of query_batch
70 | uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ? 1 : 0);//number of 32-bit words holding sequence of target_batch
71 |
72 | uint32_t query_batch_regs_to_swap = (query_batch_regs >> 1) + (query_batch_regs & 1); // that's (query_batch_regs / 2) + 1 if it's odd, + 0 otherwise. Used for reverse (we start at both ends, and finish at the center of the sequence)
73 | uint32_t target_batch_regs_to_swap = (target_batch_regs >> 1) + (target_batch_regs & 1); // that's (target_batch_regs / 2) + 1 if it's odd, + 0 otherwise. Used for reverse (we start at both ends, and finish at the center of the sequence)
74 |
75 |
76 | // variables used dependent on target and query:
77 |
78 | uint8_t *op = NULL;
79 | uint32_t *packed_batch = NULL;
80 | uint32_t *batch_regs = NULL;
81 | uint32_t *batch_regs_to_swap = NULL;
82 | uint32_t *packed_batch_idx = NULL;
83 |
84 | // avoid useless code duplication thanks to pointers that route the data flow where it should be, twice.
85 | // The kernel is already generic. Later on this can be used to split the kernel into two using templates...
86 | #pragma unroll 2
87 | for (int p = QUERY; p <= TARGET; p++)
88 | {
89 | switch(p)
90 | {
91 | case QUERY:
92 | op = query_op;
93 | packed_batch = packed_query_batch;
94 | batch_regs = &query_batch_regs;
95 | batch_regs_to_swap = &query_batch_regs_to_swap;
96 | packed_batch_idx = &packed_query_batch_idx;
97 | break;
98 | case TARGET:
99 | op = target_op;
100 | packed_batch = packed_target_batch;
101 | batch_regs = &target_batch_regs;
102 | batch_regs_to_swap = &target_batch_regs_to_swap;
103 | packed_batch_idx = &packed_target_batch_idx;
104 | break;
105 | default:
106 | break;
107 | }
108 |
109 | if (*(op + tid) & 0x01) // reverse
110 | {
111 | // deal with N's : read the last word, find how many N's it holds, store that number as an offset, and pad the last word with that many N's
112 | uint8_t nbr_N = 0;
113 | for (int j = 0; j < 32; j = j + 4)
114 | {
115 | nbr_N += (((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1) & (0x0F << j)) >> j) == N_CODE);
116 | }
117 |
118 | //printf("KERNEL_DEBUG: nbr_N=%d\n", nbr_N);
119 |
120 |
121 | nbr_N = nbr_N << 2; // we operate on nibbles so we will need to do our shifts 4 bits by 4 bits, so 4*nbr_N
122 |
123 | for (uint32_t i = 0; i < *(batch_regs_to_swap); i++) // reverse all words. There's a catch with the last word (in the middle of the sequence), see final if.
124 | {
125 | /* This is the current operation flow:
126 | - Read the first 32-bits word on HEAD
127 | - Combine the reads of the 2 last 32-bits words on TAIL to create the 32-bits word WITHOUT N's
128 | - Swap them
129 | - Write them at the correct places. Remember we're building 32-bits words across two 32-bits words on tail.
130 | So we have to take care of which bits are to be written on tail, too.
131 |
132 | You progress through both heads and tails that way, until you reach the center of the sequence.
133 | When you reach it, you actually don't write one of the words to avoid overwrite.
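Worked example (hypothetical numbers): if the last word carries 3 padding
N's, nbr_N becomes 12 (bits), so rpac_2 below combines the low 12 bits of
the second-to-last word with the top 20 bits of the last word; to_queue_1
then re-appends the N's at the tail after the swap.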
134 | */
135 | uint32_t rpac_1 = *(packed_batch + *(packed_batch_idx) + i); //load 8 packed bases from head
136 | uint32_t rpac_2 = ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-2 - i)) << (32-nbr_N)) | ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) >> nbr_N);
137 |
138 |
139 | uint32_t reverse_rpac_1 = 0;
140 | uint32_t reverse_rpac_2 = 0;
141 |
142 |
143 | #pragma unroll 8
144 | for(int k = 28; k >= 0; k = k - 4) // reverse 32-bits word... is pragma-unrolled.
145 | {
146 | reverse_rpac_1 |= ((rpac_1 & (0x0F << k)) >> (k)) << (28-k);
147 | reverse_rpac_2 |= ((rpac_2 & (0x0F << k)) >> (k)) << (28-k);
148 | }
149 | // last swap operated manually, because of its irregular size (32 - 4*nbr_N bits, hence 8 - nbr_N nibbles)
150 |
151 |
152 | uint32_t to_queue_1 = (reverse_rpac_1 << nbr_N) | ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) & ((1<<nbr_N)-1));
153 | uint32_t to_queue_2 = (reverse_rpac_1 >> (32-nbr_N));
154 |
155 |
156 | //printf("KERNEL DEBUG: rpac_1 Word before reverse: %x, after: %x, split into %x + %x \n", rpac_1, reverse_rpac_1, to_queue_2, to_queue_1 );
157 | //printf("KERNEL DEBUG: rpac_2 Word before reverse: %x, after: %x\n", rpac_2, reverse_rpac_2 );
158 |
159 |
160 | *(packed_batch + *(packed_batch_idx) + i) = reverse_rpac_2;
161 | (*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) = to_queue_1;
162 | if (i!=*(batch_regs_to_swap)-1)
163 | (*(packed_batch + *(packed_batch_idx) + *(batch_regs)-2 - i)) = to_queue_2;
164 |
165 |
166 | } // end for
167 | } // end if(reverse)
168 |
169 | if (*(op+tid) & 0x02) // complement
170 | {
171 | for (uint32_t i = 0; i < *(batch_regs); i++) // complement all words
172 | {
173 | uint32_t rpac = *(packed_batch + *(packed_batch_idx) + i); //load 8 packed bases from head
174 | uint32_t nucleotide = 0;
175 |
176 | #pragma unroll 8
177 | for(int k = 28; k >= 0; k = k - 4) // complement 32-bits word... is pragma-unrolled.
178 | { 179 | nucleotide = (rpac & (0x0F << k)) >> (k); 180 | switch(nucleotide) 181 | { 182 | case A_PAK: 183 | nucleotide = T_PAK; 184 | break; 185 | case C_PAK: 186 | nucleotide = G_PAK; 187 | break; 188 | case T_PAK: 189 | nucleotide = A_PAK; 190 | break; 191 | case G_PAK: 192 | nucleotide = C_PAK; 193 | break; 194 | default: 195 | break; 196 | } 197 | rpac = (rpac & (0xFFFFFFFF - (0x0F << k))) | nucleotide << k; 198 | } 199 | 200 | //printf("KERNEL DEBUG: Word read : %x, after complement: %x\n", *(packed_batch + *(packed_batch_idx) + i), rpac); 201 | 202 | *(packed_batch + *(packed_batch_idx) + i) = rpac; 203 | 204 | } // end for 205 | } // end if(complement) 206 | 207 | 208 | 209 | } 210 | 211 | return; 212 | } 213 | #endif -------------------------------------------------------------------------------- /AGAThA/src/res.cpp: -------------------------------------------------------------------------------- 1 | #include "gasal.h" 2 | 3 | #include "args_parser.h" 4 | 5 | #include "res.h" 6 | 7 | 8 | gasal_res_t *gasal_res_new_host(uint32_t max_n_alns, Parameters *params) 9 | { 10 | cudaError_t err; 11 | gasal_res_t *res = NULL; 12 | 13 | 14 | res = (gasal_res_t *)malloc(sizeof(gasal_res_t)); 15 | 16 | if (res == NULL) // check the malloc result before touching res 17 | { 18 | fprintf(stderr, "Malloc error on res host\n"); 19 | exit(1); 20 | } 21 | 22 | CHECKCUDAERROR(cudaHostAlloc(&(res->aln_score), max_n_alns * sizeof(int32_t),cudaHostAllocDefault)); 23 | 24 | 25 | CHECKCUDAERROR(cudaHostAlloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault)); 26 | CHECKCUDAERROR(cudaHostAlloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault)); 27 | res->query_batch_start = NULL; 28 | res->target_batch_start = NULL; 29 | 30 | return res; 31 | } 32 | 33 | 34 | gasal_res_t *gasal_res_new_device(gasal_res_t *device_cpy) 35 | { 36 | cudaError_t err; 37 | 38 | 39 | 40 | // create class storage on device and copy top level class 41 | gasal_res_t *d_c; 42 | CHECKCUDAERROR(cudaMalloc((void **)&d_c, sizeof(gasal_res_t))); 43 | // CHECKCUDAERROR(cudaMemcpy(d_c, res, sizeof(gasal_res_t), cudaMemcpyHostToDevice)); 44 | 45 | 46 | 47 | // copy pointer to allocated device storage to device class 48 | CHECKCUDAERROR(cudaMemcpy(&(d_c->aln_score), &(device_cpy->aln_score), sizeof(int32_t*), cudaMemcpyHostToDevice)); 49 | CHECKCUDAERROR(cudaMemcpy(&(d_c->query_batch_start), &(device_cpy->query_batch_start), sizeof(int32_t*), cudaMemcpyHostToDevice)); 50 | CHECKCUDAERROR(cudaMemcpy(&(d_c->target_batch_start), &(device_cpy->target_batch_start), sizeof(int32_t*), cudaMemcpyHostToDevice)); 51 | CHECKCUDAERROR(cudaMemcpy(&(d_c->query_batch_end), &(device_cpy->query_batch_end), sizeof(int32_t*), cudaMemcpyHostToDevice)); 52 | CHECKCUDAERROR(cudaMemcpy(&(d_c->target_batch_end), &(device_cpy->target_batch_end), sizeof(int32_t*), cudaMemcpyHostToDevice)); 53 | 54 | 55 | 56 | 57 | 58 | return d_c; 59 | } 60 | 61 | 62 | 63 | 64 | gasal_res_t *gasal_res_new_device_cpy(uint32_t max_n_alns, Parameters *params) 65 | { 66 | cudaError_t err; 67 | gasal_res_t *res; 68 | 69 | res = (gasal_res_t *)malloc(sizeof(gasal_res_t)); 70 | 71 | CHECKCUDAERROR(cudaMalloc(&(res->aln_score), max_n_alns * sizeof(int32_t))); 72 | 73 | CHECKCUDAERROR(cudaMalloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t))); 74 | CHECKCUDAERROR(cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t))); 75 | 76 | res->query_batch_start = NULL; 77 | res->target_batch_start = NULL; 78 | 79 | 80 | 81 | return res; 82 | } 83 | 84 | // TODO : make 2 destroys for host and device
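/* Typical lifecycle of these objects (a sketch using only the functions defined in this file; max_n_alns and params are supplied by the caller): host_res = gasal_res_new_host(max_n_alns, params) allocates the pinned host-side result buffers; device_cpy = gasal_res_new_device_cpy(max_n_alns, params) allocates the device buffers behind a host-side struct; device_res = gasal_res_new_device(device_cpy) mirrors that struct on the device. After the alignments, tear down with gasal_res_destroy_device(device_res, device_cpy) and gasal_res_destroy_host(host_res). */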
85 | void gasal_res_destroy_host(gasal_res_t *res) 86 | { 87 | cudaError_t err; 88 | if (res == NULL) 89 | return; 90 | 91 | 92 | if (res->aln_score != NULL) CHECKCUDAERROR(cudaFreeHost(res->aln_score)); 93 | if (res->query_batch_start != NULL) CHECKCUDAERROR(cudaFreeHost(res->query_batch_start)); 94 | if (res->target_batch_start != NULL) CHECKCUDAERROR(cudaFreeHost(res->target_batch_start)); 95 | if (res->query_batch_end != NULL) CHECKCUDAERROR(cudaFreeHost(res->query_batch_end)); 96 | if (res->target_batch_end != NULL) CHECKCUDAERROR(cudaFreeHost(res->target_batch_end)); 97 | //if (res->n_cigar_ops != NULL) CHECKCUDAERROR(cudaFreeHost(res->n_cigar_ops)); 98 | 99 | free(res); 100 | } 101 | 102 | void gasal_res_destroy_device(gasal_res_t *device_res, gasal_res_t *device_cpy) 103 | { 104 | cudaError_t err; 105 | if (device_cpy == NULL || device_res == NULL) 106 | return; 107 | 108 | if (device_cpy->aln_score != NULL) CHECKCUDAERROR(cudaFree(device_cpy->aln_score)); 109 | if (device_cpy->query_batch_start != NULL) CHECKCUDAERROR(cudaFree(device_cpy->query_batch_start)); 110 | if (device_cpy->target_batch_start != NULL) CHECKCUDAERROR(cudaFree(device_cpy->target_batch_start)); 111 | if (device_cpy->query_batch_end != NULL) CHECKCUDAERROR(cudaFree(device_cpy->query_batch_end)); 112 | if (device_cpy->target_batch_end != NULL) CHECKCUDAERROR(cudaFree(device_cpy->target_batch_end)); 113 | //if (device_cpy->cigar != NULL) CHECKCUDAERROR(cudaFree(device_cpy->cigar)); 114 | 115 | 116 | CHECKCUDAERROR(cudaFree(device_res)); 117 | 118 | free(device_cpy); 119 | } 120 | -------------------------------------------------------------------------------- /AGAThA/src/res.h: -------------------------------------------------------------------------------- 1 | #ifndef __RES_H__ 2 | #define __RES_H__ 3 | 4 | gasal_res_t *gasal_res_new_host(uint32_t max_n_alns, Parameters *params); 5 | gasal_res_t *gasal_res_new_device(gasal_res_t *device_cpy); 6 | gasal_res_t *gasal_res_new_device_cpy(uint32_t max_n_alns, Parameters *params); 7 | 8 | void gasal_res_destroy_host(gasal_res_t *res); 9 | void gasal_res_destroy_device(gasal_res_t *device_res, gasal_res_t *device_cpy); 10 | 11 | 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /AGAThA/test_prog/Makefile: -------------------------------------------------------------------------------- 1 | #CUDA_LD_LIBRARY=/usr/local/cuda-10.2/targets/x86_64-linux/lib 2 | CUDA_LD_LIBRARY=/usr/local/cuda/lib64 3 | ANALYSIS_FILENAME=analysis 4 | # prefix1 can be optirun, in case you need to run it from an Optimus-enabled laptop. 5 | PREFIX1= 6 | # prefix2 can be nvprof. Preferably use the following: nvprof --profile-api-trace none -s -f -o /tmp/.nvprof/$(ANALYSIS_FILENAME).nvprof 7 | PREFIX2=nvprof --profile-api-trace none -s -f -o /tmp/.nvprof/$(ANALYSIS_FILENAME).nvprof 8 | #suffix1 and suffix2 can redirect output to a file.
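# For example, using standard make command-line overrides, "make run PREFIX2= SUFFIX1=" executes the short test without nvprof and prints the scores to stdout instead of golden.log.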
9 | SUFFIX1=> golden.log 10 | SUFFIX2=> out.log 11 | 12 | PRGM=manual 13 | 14 | OPTARGS1=-p -y local 15 | OPTARGS2=-p -y local 16 | 17 | 18 | FILES_HUMAN600=reads_600_human_10M.fasta ref_600_human_10M.fasta 19 | FILES_HUMAN300=reads_300_human_10M.fasta ref_300_human_10M.fasta 20 | FILES_HUMAN150=reads_150_human_10M.fasta ref_150_human_10M.fasta 21 | FILES_20K=query_batch.fasta target_batch.fasta 22 | FILES_262K=reads_150.fasta ref_150.fasta 23 | FILES_SHORT=short_query_batch.fasta short_target_batch.fasta 24 | 25 | .cpp.o: 26 | g++ -std=c++11 -g -c -O3 -Wall -Werror -fopenmp -I ../include -o test_prog.o test_prog.cpp -lcudart 27 | 28 | all: clean manual #test_prog.out 29 | 30 | 31 | manual: test_prog.o ../obj/args_parser.cppo ../obj/host_batch.cppo ../obj/ctors.cppo ../obj/interfaces.cppo ../obj/res.cppo ../obj/gasal_align.cuo 32 | g++ test_prog.o ../obj/args_parser.cppo ../obj/host_batch.cppo ../obj/ctors.cppo ../obj/interfaces.cppo ../obj/res.cppo ../obj/gasal_align.cuo -L../lib -o manual -g -fopenmp -std=c++11 -Wall -Wno-sign-compare -O3 -L/usr/local/cuda/lib64 -lcudart -Iinclude -I/usr/local/cuda/include -lgasal 33 | 34 | 35 | test_prog.out: test_prog.o 36 | g++ -std=c++11 -O3 test_prog.cpp -o test_prog.out -L$(CUDA_LD_LIBRARY) -L../lib -fopenmp -lcudart -lgasal 37 | #g++ -std=c++11 -O3 -c test_prog.cpp -o test_prog.out -L$(CUDA_LD_LIBRARY) -L../lib -fopenmp -lcuda -lcudart -lgasal test_prog.o 38 | 39 | clean: 40 | rm -f -r *~ *.exe *.o *.out manual 41 | 42 | test_prog.o: Timer.h 43 | 44 | 45 | human150: all 46 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN150) $(SUFFIX1) 47 | 48 | human150-2: all 49 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN150) $(SUFFIX2) 50 | 51 | human300: all 52 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN300) $(SUFFIX1) 53 | 54 | human300-2: all 55 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN300) $(SUFFIX2) 56 | 57 | human600: all 58 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN600) $(SUFFIX1) 59 | 60 | human600-2: all 61 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN600) $(SUFFIX2) 62 | 63 | 64 | run: all 65 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_SHORT) $(SUFFIX1) 66 | 67 | run2: all 68 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_SHORT) $(SUFFIX2) 69 | 70 | 71 | fullrun: all 72 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_20K) $(SUFFIX1) 73 | 74 | fullrun2: all 75 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_20K) $(SUFFIX2) 76 | 77 | 78 | 262k: all 79 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_262K) $(SUFFIX1) 80 | 81 | 262k2: all 82 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_262K) $(SUFFIX2) 83 | 84 | 85 | 86 | 87 | 88 | cuda-memcheck: all 89 | cuda-memcheck ./$(PRGM) $(OPTARGS1) $(FILES_20K) $(SUFFIX1) 90 | 91 | cuda-gdb: all 92 | cuda-gdb --args ./test_prog.out -p -y local query_batch.fasta target_batch.fasta 93 | 94 | valgrind: all 95 | valgrind ./test_prog.out -p -y local short_query_batch.fasta short_target_batch.fasta 96 | 97 | gdb: all 98 | gdb --args ./test_prog.out -p -y local short_query_batch.fasta short_target_batch.fasta 99 | -------------------------------------------------------------------------------- /AGAThA/test_prog/README.md: -------------------------------------------------------------------------------- 1 | A test program to run the AGAThA kernel. 2 | TBA. 
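3 | 4 | In the meantime, a minimal manual run looks like this (a sketch based on the targets in the Makefile; the two FASTA files are placeholders for your own query/target pair): 5 | ``` 6 | make manual 7 | ./manual -p -y local query_batch.fasta target_batch.fasta 8 | ```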
-------------------------------------------------------------------------------- /AGAThA/test_prog/Timer.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMER_H 2 | #define TIMER_H 3 | 4 | #include <iostream> 5 | #include <string> 6 | #include <sys/time.h> 7 | #include <cstdlib> 8 | 9 | class Timer 10 | { 11 | private: 12 | struct timeval startTime; 13 | struct timeval stopTime; 14 | double elapsedTime; 15 | std::string name; 16 | 17 | public: 18 | Timer(std::string n) { name = n; elapsedTime = 0.0;} 19 | Timer() { name = ""; elapsedTime = 0.0;} 20 | void Clear() { elapsedTime = 0.0; } 21 | void Start() { gettimeofday(&(startTime), NULL); } 22 | void Restart() 23 | { 24 | elapsedTime = 0.0; 25 | gettimeofday(&(startTime), NULL); 26 | } 27 | 28 | void Pause() 29 | { 30 | gettimeofday(&(stopTime), NULL); 31 | 32 | elapsedTime += ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0; // sec to ms 33 | elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms 34 | } 35 | 36 | void Stop() 37 | { 38 | gettimeofday(&(stopTime), NULL); 39 | 40 | elapsedTime = ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0; // sec to ms 41 | elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms 42 | } 43 | 44 | void Print() 45 | { 46 | std::cout << name << " : " << elapsedTime << " msec" << std::endl; 47 | } 48 | 49 | double GetTime() { return elapsedTime;} 50 | 51 | }; 52 | 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /AGAThA/test_prog/test_prog.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include "../include/gasal_header.h" 4 | #include <vector> 5 | #include <cstring> 6 | #include <cmath> 7 | #include <omp.h> 8 | #include "Timer.h" 9 | 10 | 11 | #define NB_STREAMS 2 12 | 13 | 14 | //#define DEBUG 15 | 16 | #define MAX(a,b) ((a)>(b) ? (a) : (b)) 17 | 18 | //#define GPU_SELECT 0 19 | 20 | 21 | int main(int argc, char **argv) { 22 | Timer local_time; 23 | Timer malloc_time; 24 | Timer free_time; 25 | cudaDeviceSynchronize(); 26 | Timer total_time; 27 | total_time.Start(); 28 | Timer load_time; 29 | load_time.Start(); 30 | 31 | //gasal_set_device(GPU_SELECT); 32 | 33 | Parameters *args; 34 | args = new Parameters(argc, argv); 35 | args->parse(); 36 | //args->print(); 37 | 38 | int print_out = args->print_out; 39 | int n_threads = args->n_threads; 40 | 41 | //--------------copy substitution scores to GPU-------------------- 42 | gasal_subst_scores sub_scores; 43 | 44 | sub_scores.match = args->sa; 45 | sub_scores.mismatch = args->sb; 46 | sub_scores.gap_open = args->gapo; 47 | sub_scores.gap_extend = args->gape; 48 | sub_scores.slice_width = args->slice_width; 49 | sub_scores.z_threshold = args->z_threshold; 50 | sub_scores.band_width = args->band_width; 51 | 52 | gasal_copy_subst_scores(&sub_scores); 53 | 54 | //------------------------------------------------------------------- 55 | 56 | 57 | std::vector<std::string> query_seqs; 58 | std::vector<std::string> target_seqs; 59 | std::vector<std::string> query_headers; 60 | std::vector<std::string> target_headers; 61 | std::string query_batch_line, target_batch_line; 62 | 63 | int total_seqs = 0; 64 | uint32_t maximum_sequence_length = 0; 65 | uint32_t target_seqs_len = 0; 66 | uint32_t query_seqs_len = 0; 67 | //std::cerr << "Loading files...." << std::endl; 68 | 69 | /* 70 | Reads the FASTA files and fills the corresponding buffers. 71 | FASTA files contain sequences that are usually on separate lines.
72 | The file reader detects a '>' then concatenates all the following lines into one sequence, until the next '>' or EOF. 73 | See more about the FASTA format : https://en.wikipedia.org/wiki/FASTA_format 74 | */ 75 | 76 | int seq_begin=0; 77 | 78 | std::vector<uint8_t> query_mod; 79 | std::vector<uint8_t> target_mod; 80 | std::vector<uint32_t> query_id; 81 | std::vector<uint32_t> target_id; 82 | 83 | char line_starts[5] = "></+"; 84 | 85 | /* 86 | * The first character of a header line selects the modifier to apply to the sequence: 87 | * - '>' translates to 0b00 (0) = Forward, natural 88 | * - '<' translates to 0b01 (1) = Reverse, natural 89 | * - '/' translates to 0b10 (2) = Forward, complemented 90 | * - '+' translates to 0b11 (3) = Reverse, complemented 91 | * No protection is done, so any other value will only have its two lowest bits interpreted as above. 92 | */ 93 | 94 | while (getline(args->query_batch_fasta, query_batch_line) && getline(args->target_batch_fasta, target_batch_line)) { 95 | 96 | //load sequences from the files 97 | char *q = NULL; 98 | char *t = NULL; 99 | q = strchr(line_starts, (int) (query_batch_line[0])); 100 | t = strchr(line_starts, (int) (target_batch_line[0])); 101 | 102 | /* 103 | t and q point to the first occurrence of the line's first character in the line_starts array. 104 | Subtracting line_starts from these pointers gives the index of the matched character, 105 | and therefore the modifier that is required. 106 | */ 107 | 108 | if (q != NULL && t != NULL) { 109 | total_seqs++; 110 | 111 | query_mod.push_back((uint8_t) (q-line_starts)); 112 | query_id.push_back(total_seqs); 113 | 114 | target_mod.push_back((uint8_t)(t-line_starts)); 115 | target_id.push_back(total_seqs); 116 | 117 | query_headers.push_back(query_batch_line.substr(1)); 118 | target_headers.push_back(target_batch_line.substr(1)); 119 | 120 | if (seq_begin == 2) { 121 | // a sequence was already being read. Now it's done, so we should find its length. 122 | target_seqs_len += (target_seqs.back()).length(); 123 | query_seqs_len += (query_seqs.back()).length(); 124 | maximum_sequence_length = MAX((target_seqs.back()).length(), maximum_sequence_length); 125 | maximum_sequence_length = MAX((query_seqs.back()).length(), maximum_sequence_length); 126 | } 127 | seq_begin = 1; 128 | 129 | } else if (seq_begin == 1) { 130 | query_seqs.push_back(query_batch_line); 131 | target_seqs.push_back(target_batch_line); 132 | seq_begin=2; 133 | } else if (seq_begin == 2) { 134 | query_seqs.back() += query_batch_line; 135 | target_seqs.back() += target_batch_line; 136 | } else { // should never happen but always put an else, for safety... 137 | seq_begin = 0; 138 | std::cerr << "The query_batch and target_batch files should be FASTA files with the same number of sequences" << std::endl; 139 | exit(EXIT_FAILURE); 140 | } 141 | } 142 | 143 | 144 | 145 | // Update the lengths one more time, to account for the last read sequence: 146 | target_seqs_len += (target_seqs.back()).length(); 147 | query_seqs_len += (query_seqs.back()).length(); 148 | maximum_sequence_length = MAX((target_seqs.back()).length(), maximum_sequence_length); 149 | maximum_sequence_length = MAX((query_seqs.back()).length(), maximum_sequence_length); 150 | int maximum_sequence_length_query = MAX((query_seqs.back()).length(), 0); 151 | 152 | #ifdef DEBUG 153 | std::cerr << "[TEST_PROG DEBUG]: "; 154 | std::cerr << "Size of read batches are: query=" << query_seqs_len << ", target=" << target_seqs_len << ". 
maximum_sequence_length=" << maximum_sequence_length << std::endl; 155 | #endif 156 | load_time.Stop(); 157 | 158 | Timer distr_time; 159 | distr_time.Start(); 160 | 161 | // transforming the _mod vectors into C arrays (to be passed to GASAL, which deals with C types) 162 | uint8_t *target_seq_mod = (uint8_t*) malloc(total_seqs * sizeof(uint8_t) ); 163 | uint8_t *query_seq_mod = (uint8_t*) malloc(total_seqs * sizeof(uint8_t) ); 164 | uint32_t *target_seq_id = (uint32_t*) malloc(total_seqs * sizeof(uint32_t) ); 165 | uint32_t *query_seq_id = (uint32_t*) malloc(total_seqs * sizeof(uint32_t) ); 166 | 167 | for (int i = 0; i < total_seqs; i++) 168 | { 169 | query_seq_mod[i] = query_mod.at(i); 170 | query_seq_id[i] = query_id.at(i); 171 | } 172 | 173 | #ifdef DEBUG 174 | std::cerr << "[TEST_PROG DEBUG]: query, mod@id="; 175 | for (int i = 0; i < total_seqs; i++) 176 | { 177 | if ((query_seq_mod[i]) > 0) 178 | std::cerr << +(query_seq_mod[i]) << "@" << query_seq_id[i] << "| "; 179 | } 180 | 181 | std::cerr << std::endl; 182 | #endif 183 | 184 | for (int i = 0; i < total_seqs; i++) 185 | { 186 | target_seq_mod[i] = target_mod.at(i); 187 | target_seq_id[i] = target_id.at(i); 188 | } 189 | 190 | int *thread_seqs_idx = (int*)malloc(n_threads*sizeof(int)); 191 | int *thread_n_seqs = (int*)malloc(n_threads*sizeof(int)); 192 | int *thread_n_batchs = (int*)malloc(n_threads*sizeof(int)); 193 | double *thread_misc_time = (double*)calloc(n_threads, sizeof(double)); 194 | 195 | int thread_batch_size = (int)ceil((double)total_seqs/n_threads); 196 | int n_seqs_alloc = 0; 197 | for (int i = 0; i < n_threads; i++){//distribute the sequences among the threads equally 198 | thread_seqs_idx[i] = n_seqs_alloc; 199 | if (n_seqs_alloc + thread_batch_size < total_seqs) thread_n_seqs[i] = thread_batch_size; 200 | else thread_n_seqs[i] = total_seqs - n_seqs_alloc; 201 | thread_n_batchs[i] = (int)ceil((double)thread_n_seqs[i]/(args->kernel_align_num)); 202 | n_seqs_alloc += thread_n_seqs[i]; 203 | } 204 | distr_time.Stop(); 205 | 206 | //std::cerr << "Processing..." << std::endl; 207 | 208 | Timer process_time; 209 | process_time.Start(); 210 | omp_set_num_threads(n_threads); 211 | gasal_gpu_storage_v *gpu_storage_vecs = (gasal_gpu_storage_v*)calloc(n_threads, sizeof(gasal_gpu_storage_v)); 212 | for (int z = 0; z < n_threads; z++) { 213 | gpu_storage_vecs[z] = gasal_init_gpu_storage_v(NB_STREAMS);// creating NB_STREAMS streams per thread 214 | 215 | /* 216 | About memory sizes: 217 | The required memory is the total size of the batch + its padding, divided by the number of streams. 218 | The worst case would be that every sequence has to be padded with 7 'N', since they must have a length multiple of 8. 219 | Even though the memory can be dynamically expanded both for Host and Device, it is advised to start with a memory large enough so that these expansions rarely occur (for better performance). 220 | Modifying the factor '1' in front of each size lets you see how GASAL2 expands the memory when needed. 221 | */ 222 | /* 223 | // For example, this is exactly the memory needed to fit all sequences in a single GPU batch.
224 | gasal_init_streams(&(gpu_storage_vecs[z]), 225 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 226 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 227 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 228 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 229 | ceil((double)target_seqs.size() / (double)(NB_STREAMS)), // maximum number of alignments is bigger on target than on query side. 230 | ceil((double)target_seqs.size() / (double)(NB_STREAMS)), 231 | args); 232 | */ 233 | //initializing the streams by allocating the required CPU and GPU memory 234 | // note: the calculation of the detailed sizes to allocate could be done on the library side (to hide it from the user's perspective) 235 | gasal_init_streams(&(gpu_storage_vecs[z]), (maximum_sequence_length_query + 7) , //TODO: remove maximum_sequence_length_query 236 | (maximum_sequence_length + 7) , 237 | maximum_sequence_length, 238 | args); 239 | } 240 | #ifdef DEBUG 241 | std::cerr << "[TEST_PROG DEBUG]: "; 242 | std::cerr << "size of host_unpack_query is " << (query_seqs_len +7*total_seqs) / (NB_STREAMS) << std::endl ; 243 | #endif 244 | 245 | #pragma omp parallel 246 | { 247 | int n_seqs = thread_n_seqs[omp_get_thread_num()];//number of sequences allocated to this thread 248 | int curr_idx = thread_seqs_idx[omp_get_thread_num()];//index of the first sequence allocated to this thread 249 | int seqs_done = 0; 250 | int n_batchs_done = 0; 251 | 252 | struct gpu_batch{ //a struct to hold data structures of a stream 253 | gasal_gpu_storage_t *gpu_storage; //the struct that holds the GASAL2 data structures 254 | int n_seqs_batch;//number of sequences in the batch (<= (target_seqs.size() / NB_STREAMS)) 255 | int batch_start;//starting index of batch 256 | }; 257 | 258 | #ifdef DEBUG 259 | std::cerr << "[TEST_PROG DEBUG]: "; 260 | std::cerr << "Number of gpu_batch in gpu_batch_arr : " << gpu_storage_vecs[omp_get_thread_num()].n << std::endl; 261 | std::cerr << "[TEST_PROG DEBUG]: "; 262 | std::cerr << "Number of gpu_storage_vecs in a gpu_batch : " << omp_get_thread_num()+1 << std::endl; 263 | #endif 264 | 265 | gpu_batch gpu_batch_arr[gpu_storage_vecs[omp_get_thread_num()].n]; 266 | 267 | for(int z = 0; z < gpu_storage_vecs[omp_get_thread_num()].n; z++) { 268 | gpu_batch_arr[z].gpu_storage = &(gpu_storage_vecs[omp_get_thread_num()].a[z]); 269 | 270 | } 271 | 272 | if (n_seqs > 0) { 273 | while (n_batchs_done < thread_n_batchs[omp_get_thread_num()]) { // Loop on streams 274 | int gpu_batch_arr_idx = 0; 275 | //------------checking the availability of a "free" stream----------------- 276 | while(gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n && (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->is_free != 1) { 277 | gpu_batch_arr_idx++; 278 | } 279 | 280 | if (seqs_done < n_seqs && gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n) { 281 | uint32_t query_batch_idx = 0; 282 | uint32_t target_batch_idx = 0; 283 | int j = 0; 284 | //-----------Create a batch of sequences to be aligned on the GPU.
The batch contains (target_seqs.size() / NB_STREAMS) sequences----------------------- 285 | 286 | 287 | for (int i = curr_idx; seqs_done < n_seqs && j < (args->kernel_align_num); i++, j++, seqs_done++) 288 | { 289 | 290 | gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns++ ; 291 | 292 | if(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns > gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->host_max_n_alns) 293 | { 294 | gasal_host_alns_resize(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->host_max_n_alns * 2, args); 295 | } 296 | 297 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_offsets[j] = query_batch_idx; 298 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_target_batch_offsets[j] = target_batch_idx; 299 | 300 | /* 301 | All the filling is moved to the library side, to take care of the memory size and expansions (when needed). 302 | The function gasal_host_batch_fill takes care of how to fill, how much to pad with 'N', and how to deal with memory. 303 | It's the same function for query and target, and you only need to set the final flag to either QUERY or TARGET; this avoids code duplication. 304 | The way the host memory is filled changes the current _idx (it's increased by the size, and by the padding). That's why it's returned by the function. 305 | */ 306 | 307 | query_batch_idx = gasal_host_batch_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, 308 | query_batch_idx, 309 | query_seqs[i].c_str(), 310 | query_seqs[i].size(), 311 | QUERY); 312 | 313 | target_batch_idx = gasal_host_batch_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, 314 | target_batch_idx, 315 | target_seqs[i].c_str(), 316 | target_seqs[i].size(), 317 | TARGET); 318 | 319 | 320 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_lens[j] = query_seqs[i].size(); 321 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_target_batch_lens[j] = target_seqs[i].size(); 322 | 323 | } 324 | 325 | #ifdef DEBUG 326 | std::cerr << "[TEST_PROG DEBUG]: "; 327 | std::cerr << "Stream " << gpu_batch_arr_idx << ": j = " << j << ", seqs_done = " << seqs_done <<", query_batch_idx=" << query_batch_idx << " , target_batch_idx=" << target_batch_idx << std::endl; 328 | #endif 329 | 330 | // Here, we fill the operations arrays for the current batch to be processed by the stream 331 | gasal_op_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, query_seq_mod + seqs_done - j, j, QUERY); 332 | gasal_op_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, target_seq_mod + seqs_done - j, j, TARGET); 333 | 334 | 335 | gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch = j; 336 | uint32_t query_batch_bytes = query_batch_idx; 337 | uint32_t target_batch_bytes = target_batch_idx; 338 | gpu_batch_arr[gpu_batch_arr_idx].batch_start = curr_idx; 339 | curr_idx += (args->kernel_align_num); 340 | 341 | //---------------------------------------------------------------------------------------------------- 342 | //-----------------calling the GASAL2 non-blocking alignment function--------------------------------- 343 | local_time.Start(); 344 | gasal_aln_async(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, query_batch_bytes, target_batch_bytes, gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch, args); 345 | local_time.Stop(); 346 | gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns = 0; 347 | //--------------------------------------------------------------------------------- 348 | 349 | } 350 | 351 | 352 | //-------------------------------print alignment
results---------------------------------------- 353 | 354 | gpu_batch_arr_idx = 0; 355 | while (gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n) {//loop through all the streams and print the results 356 | //of the finished streams. 357 | if (gasal_is_aln_async_done(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage) == 0) { 358 | int j = 0; 359 | if(print_out) { 360 | #pragma omp critical 361 | for (int i = gpu_batch_arr[gpu_batch_arr_idx].batch_start; j < gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch; i++, j++) { 362 | 363 | std::cout << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->aln_score[j] ; 364 | 365 | std::cout << "\tquery_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->query_batch_end[j]; 366 | std::cout << "\ttarget_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->target_batch_end[j] ; 367 | 368 | std::cout << std::endl; 369 | } 370 | } 371 | n_batchs_done++; 372 | } 373 | gpu_batch_arr_idx++; 374 | } 375 | } 376 | } 377 | 378 | 379 | } 380 | for (int z = 0; z < n_threads; z++) { 381 | gasal_destroy_streams(&(gpu_storage_vecs[z]), args); 382 | gasal_destroy_gpu_storage_v(&(gpu_storage_vecs[z])); 383 | } 384 | free(gpu_storage_vecs); 385 | process_time.Stop(); 386 | /* 387 | string algorithm = al_type; 388 | string start_type[2] = {"without_start", "with_start"}; 389 | al_type += "_"; 390 | al_type += start_type[start_pos==WITH_START]; 391 | */ 392 | double av_misc_time = 0.0; 393 | for (int i = 0; i < n_threads; ++i){ 394 | av_misc_time += (thread_misc_time[i]/n_threads); 395 | } 396 | //std::cerr << std::endl << "Done" << std::endl; 397 | //fprintf(stderr, "Total execution time (in milliseconds): %.3f\n", total_time.GetTime()); 398 | delete args; // closes the files 399 | //free(args); // closes the files 400 | total_time.Stop(); 401 | /* 402 | fprintf(stderr, "load time (in milliseconds): %.3f\n", load_time.GetTime()); 403 | fprintf(stderr, "distribution time (in milliseconds): %.3f\n", distr_time.GetTime()); 404 | fprintf(stderr, "process time (with malloc) (in milliseconds): %.3f\n", process_time.GetTime()); 405 | fprintf(stderr, "malloc time (in milliseconds): %.3f\n", malloc_time.GetTime()); 406 | fprintf(stderr, "free time (in milliseconds): %.3f\n", free_time.GetTime()); 407 | fprintf(stderr, "local kernel time (in milliseconds): %.3f\n", local_time.GetTime()); 408 | fprintf(stderr, "total time (in milliseconds): %.3f\n", total_time.GetTime()); 409 | */ 410 | } 411 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [PPoPP24] AGAThA: Fast and Efficient GPU Acceleration of Guided Sequence Alignment for Long Read Mapping [![DOI](https://zenodo.org/badge/725514536.svg)](https://zenodo.org/doi/10.5281/zenodo.10225634) 2 | 3 | ## !!! Important Notice !!! 4 | **This repository is currently undergoing a major update**. 5 | 6 | It is strongly recommended to **revisit this repository after a new release is made**. 7 | 8 | ## Getting Started 9 | 10 | ### 1. Environment Setup with Docker 11 | ```properties 12 | cd docker 13 | bash build.sh 14 | bash launch.sh 15 | ``` 16 | 17 | ### 2. Datasets & Building AGAThA 18 | A sample dataset can be found in `dataset/`. 19 | AGAThA can be built by running the following code: 20 | 21 | ```properties 22 | cd AGAThA 23 | bash build.sh 24 | cd .. 
25 | ``` 26 | 27 | ## AGAThA Details 28 | 29 | AGAThA was built on top of [GASAL2](https://github.com/nahmedraja/GASAL2). 30 | 31 | ### 1. AGAThA Options 32 | The following options can be passed to AGAThA through ```AGAThA.sh```: 33 | ``` 34 | -m the match score 35 | -x the mismatch penalty 36 | -q the gap open penalty 37 | -r the gap extension penalty 38 | -z the termination threshold 39 | -w the band width in the score table 40 | ``` 41 | ### 2. AGAThA Input 42 | AGAThA requires two datasets as input: 43 | * a fasta file with reference sequences labeled with sequence indices 44 | * a fasta file with query sequences labeled with sequence indices 45 | Both files should follow the format below: 46 | ``` 47 | >>> 1 48 | ATGCN... 49 | >>> 2 50 | TCGGA... 51 | ``` 52 | Fasta files can be downloaded from various sources such as [GenBank](https://www.ncbi.nlm.nih.gov/genbank/) or from projects such as [Genome in a Bottle](https://www.nist.gov/programs-projects/genome-bottle). 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.7.1-devel-ubuntu22.04 2 | 3 | RUN apt-get update \ 4 | && apt-get install --assume-yes --no-install-recommends --quiet \ 5 | python3 \ 6 | python3-pip \ 7 | libz-dev \ 8 | wget 9 | 10 | RUN /bin/bash -c "source root/.bashrc" 11 | 12 | WORKDIR /agatha_ae -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | docker build -t agatha_ae . -------------------------------------------------------------------------------- /docker/launch.sh: -------------------------------------------------------------------------------- 1 | #docker run -it --rm --gpus all -v $PWD/../:/agatha_ae agatha_ae:latest /bin/bash 2 | docker run -it --rm --gpus all -v $PWD/../:/agatha_ae agatha_ae:latest /bin/bash -------------------------------------------------------------------------------- /misc/avg_time.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | 5 | # Input: process name | dataset id | raw_file | output_file | number of iterations 6 | 7 | process = sys.argv[1] 8 | dataset_id = sys.argv[2] 9 | raw_file = sys.argv[3] 10 | output_file = sys.argv[4] 11 | iter = int(sys.argv[5]) 12 | 13 | # Get average execution time 14 | if os.path.exists(raw_file): 15 | raw = open(raw_file, "r").read().splitlines() 16 | 17 | total_time = 0.0 18 | 19 | if len(raw) != 0: 20 | for r in raw: 21 | total_time += float(r) 22 | avg_time = total_time/float(iter) 23 | else: 24 | avg_time = "NaN" 25 | 26 | else: 27 | avg_time = "NaN" 28 | 29 | 30 | # Store result to json file 31 | 32 | if os.path.exists(output_file): 33 | with open(output_file, "r") as json_file: 34 | output = json.load(json_file) 35 | else: 36 | output = {} 37 | 38 | if process not in output: 39 | output[process] = {} 40 | 41 | output[process][dataset_id] = avg_time 42 | 43 | with open(output_file, "w") as json_file: 44 | json.dump(output, json_file) 45 | --------------------------------------------------------------------------------