├── GPU_OLS_C2C_cuFFT_callbacks ├── CONV-32bit_cuFFT.cu ├── CONV_C2C.cpp ├── Makefile ├── When_cuFFT_wins_cuFFT.sh ├── benchmark_all.sh ├── benchmark_performance.sh ├── conv_check.h ├── debug.h ├── params.h ├── results.h ├── timer.h └── utils_cuda.h ├── GPU_OLS_C2C_cuFFT_callbacks_pp ├── CONV-32bit_cuFFT.cu ├── CONV_C2C.cpp ├── Makefile ├── When_cuFFT_wins_cuFFT.sh ├── benchmark_all.sh ├── benchmark_performance.sh ├── conv_check.h ├── debug.h ├── params.h ├── results.h ├── timer.h └── utils_cuda.h ├── GPU_OLS_C2C_sharedmemory ├── CONV-32bit_customFFT.cu ├── CONV_SM_OLS_C2C.cpp ├── Makefile ├── When_cuFFT_wins.sh ├── benchmark_all.sh ├── benchmark_performance.sh ├── conv_check.h ├── debug.h ├── params.h ├── results.h ├── timer.h └── utils_cuda.h ├── GPU_OLS_C2C_sharedmemory_pp ├── CONV-32bit_customFFT.cu ├── CONV_SM_OLS_C2C.cpp ├── Makefile ├── When_cuFFT_wins.sh ├── benchmark_all.sh ├── benchmark_performance.sh ├── conv_check.h ├── debug.h ├── params.h ├── results.h ├── timer.h └── utils_cuda.h ├── GPU_OLS_R2R_cuFFT_callbacks ├── CONV-32bit_cuFFT.cu ├── CONV_R2R.cpp ├── Makefile ├── When_cuFFT_wins_cuFFT.sh ├── benchmark_all.sh ├── benchmark_performance.sh ├── conv_check_R2R.h ├── debug.h ├── params.h ├── results.h ├── timer.h └── utils_cuda.h ├── GPU_OLS_R2R_cuFFT_callbacks_pp ├── CONV-32bit_cuFFT.cu ├── CONV_R2R.cpp ├── Makefile ├── When_cuFFT_wins_cuFFT.sh ├── benchmark_all.sh ├── benchmark_performance.sh ├── conv_check_R2R.h ├── debug.h ├── params.h ├── results.h ├── timer.h └── utils_cuda.h ├── GPU_OLS_R2R_sharedmemory ├── CONV-32bit_customFFT.cu ├── CONV_SM_OLS_R2R.cpp ├── Makefile ├── When_cuFFT_wins.sh ├── benchmark_all.sh ├── benchmark_performance.sh ├── conv_check_R2R.h ├── debug.h ├── params.h ├── results.h ├── timer.h └── utils_cuda.h ├── GPU_OLS_R2R_sharedmemory_pp ├── CONV-32bit_customFFT.cu ├── CONV_SM_OLS_R2R.cpp ├── Makefile ├── When_cuFFT_wins.sh ├── benchmark_all.sh ├── benchmark_performance.sh ├── conv_check_R2R.h ├── debug.h ├── params.h ├── results.h ├── run_convolution.sh ├── timer.h └── utils_cuda.h ├── LICENSE ├── OLS_generate_files ├── Example_files.cpp └── Makefile ├── README.md └── process_AAFFT_results.R /GPU_OLS_C2C_cuFFT_callbacks/CONV_C2C.cpp: -------------------------------------------------------------------------------- 1 | //******************************************************************************************** 2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 3 | //* Copyright (C) 2019 Adámek Karel 4 | //* 5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 6 | //******************************************************************************************** 7 | 8 | 9 | #include "debug.h" 10 | #include "params.h" 11 | #include "results.h" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "conv_check.h" 23 | 24 | void Generate_signal(float2 *h_input, int nTimesamples){ 25 | for(int f=0; f=template_size/2) { 50 | h_filters[t*convolution_size + f - template_size/2].x=h_filters_time[t*template_size + f].x; 51 | h_filters[t*convolution_size + f - template_size/2].y=h_filters_time[t*template_size + f].y; 52 | } 53 | else if(f0){ 114 | *data = (float2*)malloc(file_size*sizeof(float2)); 115 | memset( (*data), 0.0, file_size*sizeof(float2)); 116 | if(*data==NULL){ 117 | printf("\nAllocation error!\n"); 118 | error++; 119 | } 120 | 121 | FILEIN.clear(); 122 | FILEIN.seekg(0,ios::beg); 123 | 124 | for (cislo=0; cislo < file_size; cislo++) { 125 | FILEIN >> real >> imaginary; 126 | (*data)[cislo].x = real; 127 | (*data)[cislo].y = imaginary; 128 | } 129 | } 130 | else { 131 | printf("\nFile is void of any content!\n"); 132 | error++; 133 | } 134 | } 135 | else { 136 | cout << "File not found -> " << filename << " <-" << endl; 137 | error++; 138 | } 139 | FILEIN.close(); 140 | return(error); 141 | } 142 | 143 | 144 | int Load_filters(char *filename, int *nFilters, int *filter_length, float2 **data){ 145 | float real, imaginary; 146 | int file_size, cislo, error, filter_size; 147 | error=0; 148 | 149 | ifstream FILEIN; 150 | FILEIN.open(filename,ios::in); 151 | if (!FILEIN.fail()){ 152 | error=0; 153 | file_size = File_size_row_signal(FILEIN); 154 | (*filter_length) = file_size/(*nFilters); 155 | filter_size = (*nFilters)*(*filter_length); 156 | printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size); 157 | 158 | if(file_size>0){ 159 | *data = (float2*)malloc( filter_size*sizeof(float2)); 160 | memset( (*data), 0.0, filter_size*sizeof(float2)); 161 | 162 | if(*data==NULL){ 163 | printf("\nAllocation error!\n"); 164 | error++; 165 | } 166 | 167 | FILEIN.clear(); 168 | FILEIN.seekg(0,ios::beg); 169 | 170 | for (cislo=0; cislo < filter_size; cislo++) { 171 | FILEIN >> real >> imaginary; 172 | (*data)[cislo].x = real; 173 | (*data)[cislo].y = imaginary; 174 | } 175 | } 176 | else { 177 | printf("\nFile is void of any content!\n"); 178 | error++; 179 | } 180 | } 181 | else { 182 | cout << "File not found -> " << filename << " <-" << endl; 183 | error++; 184 | } 185 | FILEIN.close(); 186 | return(error); 187 | } 188 | 189 | 190 | int GPU_CONV(float2 *h_input, float2 *h_output, float2 *h_filters, int signal_length, int filter_length, int nFilters, int nRuns, double *execution_time); 191 | 192 | 193 | int main(int argc, char* argv[]) { 194 | int nTimesamples; // input signal length 195 | int filter_length; // filter length 196 | int nFilters; // number of filters 197 | int nRuns; 198 | char input_type='0'; 199 | char input_filter_file[255]; 200 | char input_signal_file[255]; 201 | char output_signal_file[255]; 202 | 203 | char * pEnd; 204 | if (argc>2) { 205 | if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);} 206 | input_type=*argv[1]; 207 | } 208 | if (input_type == 'f' && argc==6) { 209 | if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);} 210 | sprintf(input_signal_file,"%s",argv[2]); 211 | if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);} 212 | sprintf(input_filter_file,"%s",argv[3]); 213 | if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);} 214 | sprintf(output_signal_file,"%s",argv[4]); 215 | nFilters = strtol(argv[5],&pEnd,10); 216 | nRuns = 1; 217 | } 218 | else if (input_type == 'r' && argc==6) { 219 | nTimesamples = strtol(argv[2],&pEnd,10); 220 | filter_length = strtol(argv[3],&pEnd,10); 221 | nFilters = strtol(argv[4],&pEnd,10); 222 | nRuns = strtol(argv[5],&pEnd,10); 223 | } 224 | else { 225 | printf("Parameters error!\n"); 226 | printf(" 1) Input type: 'r' or 'f' \n"); 227 | printf("----------------------------------\n"); 228 | printf("'f' - file input provided by user\n"); 229 | printf(" 2) Input signal file\n"); 230 | printf(" 3) Input filter file\n"); 231 | printf(" 4) Output signal file\n"); 232 | printf(" 5) number of filters\n"); 233 | printf(" Example: CONV.exe f signal.dat filter.dat output.dat 32\n"); 234 | printf("----------------------------------\n"); 235 | printf(" 'r' - random input generated by the code\n"); 236 | printf(" 2) Signal length in number of time samples\n"); 237 | printf(" 3) Filter length in samples\n"); 238 | printf(" 4) Number of templates\n"); 239 | printf(" 5) number of GPU kernel runs\n"); 240 | printf(" Example: CONV.exe r 2097152 193 32 10\n"); 241 | return 1; 242 | } 243 | 244 | if (DEBUG) { 245 | printf("Parameters:\n"); 246 | printf("Input signal and templates are "); 247 | if (input_type == 'r') { 248 | printf("randomly generated.\n"); 249 | printf("Signal length: %d samples\n", nTimesamples); 250 | printf("Filter length: %d samples\n", filter_length); 251 | printf("Number of filters: %d\n", nFilters); 252 | printf("nRuns: %d\n", nRuns); 253 | } 254 | if (input_type == 'f') { 255 | printf("read from file.\n"); 256 | printf("Input signal: %s\n", input_signal_file); 257 | printf("Input filter: %s\n", input_filter_file); 258 | printf("Output signal: %s\n", output_signal_file); 259 | printf("nFilters: %d\n", nFilters); 260 | printf("nRuns: %d\n", nRuns); 261 | printf("-----------------\n"); 262 | } 263 | } 264 | 265 | float2 *h_input; // input signal 266 | float2 *h_output; // output plane 267 | float2 *h_filters_padded; // filters in time-domain padded with zeroes 268 | float2 *h_filters; // filters in time-domain 269 | 270 | if (input_type == 'f') { 271 | int error=0; 272 | error += Load_signal(input_signal_file, &nTimesamples, &h_input); 273 | error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters); 274 | if( error>0 ){exit(1);} 275 | else if (VERBOSE) printf("File loaded\n"); 276 | } 277 | 278 | if (input_type == 'r') { 279 | h_input = (float2 *)malloc(nTimesamples*sizeof(float2)); 280 | h_filters = (float2 *)malloc(filter_length*nFilters*sizeof(float2)); 281 | srand(time(NULL)); 282 | Generate_signal(h_input, nTimesamples); 283 | Generate_templates(h_filters, filter_length, nFilters); 284 | if (VERBOSE) printf("Signal and filters generated\n"); 285 | } 286 | 287 | size_t filter_size_padded = nFilters*CONV_SIZE; 288 | h_filters_padded = (float2*)malloc(filter_size_padded*sizeof(float2)); 289 | Pad_templates(h_filters, h_filters_padded, filter_length, CONV_SIZE, nFilters); 290 | 291 | //----------------> Results 292 | double execution_time = 0; 293 | Performance_results CONV_cuFFT; 294 | CONV_cuFFT.Assign(nTimesamples, filter_length, nFilters, nRuns, 0, CONV_SIZE, nFilters, "CONV_cuFFT.dat", "cuFFT"); 295 | 296 | int offset = filter_length/2; // we assume that filter is centered around zero 297 | int useful_part_size = CONV_SIZE - filter_length + 1; 298 | int nConvolutions = (nTimesamples + useful_part_size - 1)/useful_part_size; 299 | if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);} 300 | if(DEBUG) { 301 | printf("offset=%d; useful_part_size=%d; nConvolutions=%d;\n", offset, useful_part_size, nConvolutions); 302 | } 303 | 304 | size_t output_size = nFilters*useful_part_size*nConvolutions; 305 | h_output = (float2*)malloc(output_size*sizeof(float2)); 306 | 307 | if (VERBOSE) printf("Convolution - cuFFT\n"); 308 | 309 | //----------------> GPU kernel 310 | int GPU_error = GPU_CONV(h_input, h_output, h_filters_padded, nTimesamples, filter_length, nFilters, nRuns, &execution_time); 311 | CONV_cuFFT.GPU_time = execution_time; 312 | if(VERBOSE) printf(" Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time); 313 | if(VERBOSE) {cout << " All parameters: "; CONV_cuFFT.Print();} 314 | if(WRITE && GPU_error==0) CONV_cuFFT.Save(); 315 | //----------------> GPU kernel 316 | 317 | if(CHECK){ 318 | double total_error, mean_error; 319 | printf("Checking results...\n"); 320 | Full_CONV_check(h_output, h_input, h_filters, nTimesamples, filter_length, useful_part_size, (filter_length>>1), CONV_SIZE, nConvolutions, nFilters, &total_error, &mean_error); 321 | //printf("Total error: %e; Mean error: %e\n", total_error, mean_error); 322 | } 323 | 324 | if (input_type == 'f') { 325 | Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file); 326 | } 327 | 328 | free(h_input); 329 | free(h_output); 330 | free(h_filters_padded); 331 | free(h_filters); 332 | 333 | cudaDeviceReset(); 334 | 335 | if (VERBOSE) printf("Finished!\n"); 336 | 337 | return (0); 338 | } 339 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/Makefile: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # CUDA_HOME are supposed to be on default position 3 | # and set it in your PATH .bashrc 4 | ############################################################### 5 | INC := -I${CUDA_HOME}/include 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft_static -lculibos -lcuda 7 | 8 | GCC = g++ 9 | NVCC = ${CUDA_HOME}/bin/nvcc 10 | 11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo 12 | 13 | GCC_OPTS =-O3 -Wall -Wextra $(INC) 14 | 15 | ANALYZE = CONV.exe 16 | 17 | ifdef reglim 18 | NVCCFLAGS += --maxrregcount=$(reglim) 19 | endif 20 | 21 | all: clean analyze 22 | 23 | analyze: CONV_C2C.o CONV-32bit_cuFFT.o Makefile 24 | $(NVCC) -o $(ANALYZE) CONV-32bit_cuFFT.o CONV_C2C.o $(LIB) $(NVCCFLAGS) 25 | 26 | CONV-32bit_cuFFT.o: timer.h utils_cuda.h 27 | $(NVCC) -c CONV-32bit_cuFFT.cu $(NVCCFLAGS) -dc -m64 28 | 29 | CONV_C2C.o: CONV_C2C.cpp 30 | $(GCC) -c CONV_C2C.cpp $(GCC_OPTS) 31 | 32 | clean: 33 | rm -f *.o *.~ $(ANALYZE) 34 | 35 | 36 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/When_cuFFT_wins_cuFFT.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for convsize in 1024 2048 4096 8192 16384; 4 | do 5 | echo "#define CONV_SIZE $convsize" > params.h 6 | 7 | rm CONV.exe 8 | make 9 | for tempsize in {64..2048..32} 10 | do 11 | for templates in 32; 12 | do 13 | ./CONV.exe r 2097152 $tempsize $templates 20 0 14 | done 15 | done 16 | done 17 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/benchmark_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_cuFFT.dat; 4 | 5 | ./benchmark_performance.sh; 6 | mv CONV_cuFFT.dat OLS_cuFFT_C2C_perf.dat; 7 | 8 | ./When_cuFFT_wins_cuFFT.sh 9 | mv CONV_cuFFT.dat OLS_cuFFT_C2C_whencuFFTwins.dat; -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/benchmark_performance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for convsize in 1024 2048 4096 8192 16384; 4 | do 5 | 6 | echo "#define CONV_SIZE $convsize" > params.h 7 | rm CONV.exe 8 | make 9 | for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073; 10 | do 11 | for templates in 2 4 8 11 16 32 51 64 96; 12 | do 13 | ./CONV.exe r 262144 $tempsize $templates 20 14 | ./CONV.exe r 524288 $tempsize $templates 20 15 | ./CONV.exe r 1048576 $tempsize $templates 20 16 | ./CONV.exe r 2097152 $tempsize $templates 20 17 | ./CONV.exe r 4194304 $tempsize $templates 20 18 | ./CONV.exe r 8388608 $tempsize $templates 20 19 | done 20 | done 21 | done 22 | 23 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/conv_check.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double max_error = 1.0e-4; 11 | 12 | 13 | void CPU_time_domain(float2 *h_input, float2 *h_CPU_output_timedomain, float2 *h_filters, int signal_length, int filter_length, int nFilters){ 14 | for(int f=0; f>1)); 24 | if(signal_pos>=0 && signal_posB) { 45 | div_error = A-B; 46 | if(B>10){ 47 | power = (int) log10(B); 48 | order = pow(10,power); 49 | div_error = div_error/order; 50 | } 51 | } 52 | else { 53 | div_error = B-A; 54 | if(A>10){ 55 | power = (int) log10(A); 56 | order = pow(10,power); 57 | div_error = div_error/order; 58 | } 59 | } 60 | 61 | if(div_error max_error ){ 85 | nErrors++; 86 | if(cislo<40){ 87 | printf("Error [%f] CPU [%f;%f] GPU [%f;%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU.x, CPU.y, GPU.x, GPU.y, x, y, (int) (x/useful_part_size), x%useful_part_size); 88 | cislo++; 89 | } 90 | } 91 | } 92 | } 93 | mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y)); 94 | (*total_error) = total_error_l; 95 | (*mean_error) = mean_error_l; 96 | return(nErrors); 97 | } 98 | 99 | 100 | void Full_CONV_check(float2 *GPU_result, float2 *h_input, float2 *h_filters, int signal_length, int filter_length, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, double *cumulative_error, double *mean_error){ 101 | size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters; 102 | float2 *h_CPU_output_timedomain; 103 | h_CPU_output_timedomain = (float2 *)malloc(output_size_timedomain*sizeof(float2)); 104 | memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float2)); 105 | 106 | printf("\n--> Time-domain convolution:"); 107 | CPU_time_domain(h_input, h_CPU_output_timedomain, h_filters, signal_length, filter_length, nFilters); 108 | 109 | float GPU_scale, CPU_scale; 110 | int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples; 111 | 112 | printf("\n--> Comparison to CPU time-domain:\n"); 113 | GPU_scale = conv_length; 114 | CPU_scale = 1.0; 115 | GPU_offset = 0; 116 | CPU_offset = 0; 117 | GPU_dim_x = nConvolutions*useful_part_size; 118 | CPU_dim_x = (signal_length + filter_length - 1); 119 | nSamples = signal_length - offset; 120 | Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 121 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 122 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 123 | else printf("FAILED\n"); 124 | 125 | free(h_CPU_output_timedomain); 126 | } 127 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/debug.h: -------------------------------------------------------------------------------- 1 | #define VERBOSE true 2 | #define DEBUG false 3 | #define CHECK false 4 | #define WRITE true 5 | 6 | #define DEVICEID 0 7 | 8 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/params.h: -------------------------------------------------------------------------------- 1 | #define CONV_SIZE 8192 2 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/results.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | class Performance_results{ 9 | public: 10 | double GPU_time; 11 | int nTimesamples; 12 | int template_length; 13 | int nTemplates; 14 | int nRuns; 15 | int reglim; 16 | int OaS_conv_size; 17 | int templates_per_block; 18 | char filename[200]; 19 | char kernel[10]; 20 | 21 | Performance_results() { 22 | GPU_time=0; 23 | } 24 | 25 | void Save(){ 26 | ofstream FILEOUT; 27 | FILEOUT.open (filename, std::ofstream::out | std::ofstream::app); 28 | FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 29 | FILEOUT.close(); 30 | } 31 | 32 | void Print(){ 33 | cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 34 | } 35 | 36 | void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){ 37 | nTimesamples = t_nTimesamples; 38 | template_length = t_template_length; 39 | nTemplates = t_nTemplates; 40 | nRuns = t_nRuns; 41 | reglim = t_reglim; 42 | OaS_conv_size = t_OaS_conv_size; 43 | templates_per_block = t_templates_per_block; 44 | sprintf(filename,"%s", t_filename); 45 | sprintf(kernel,"%s",t_kernel); 46 | } 47 | 48 | }; 49 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks/utils_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/CONV_C2C.cpp: -------------------------------------------------------------------------------- 1 | //******************************************************************************************** 2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 3 | //* Copyright (C) 2019 Adámek Karel 4 | //* 5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 6 | //******************************************************************************************** 7 | 8 | 9 | #include "debug.h" 10 | #include "params.h" 11 | #include "results.h" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "conv_check.h" 23 | 24 | void Generate_signal(float2 *h_input, int nTimesamples){ 25 | for(int f=0; f=template_size/2) { 50 | h_filters[t*convolution_size + f - template_size/2].x=h_filters_time[t*template_size + f].x; 51 | h_filters[t*convolution_size + f - template_size/2].y=h_filters_time[t*template_size + f].y; 52 | } 53 | else if(f0){ 114 | *data = (float2*)malloc(file_size*sizeof(float2)); 115 | memset( (*data), 0.0, file_size*sizeof(float2)); 116 | if(*data==NULL){ 117 | printf("\nAllocation error!\n"); 118 | error++; 119 | } 120 | 121 | FILEIN.clear(); 122 | FILEIN.seekg(0,ios::beg); 123 | 124 | for (cislo = 0; cislo < file_size; cislo++) { 125 | FILEIN >> real >> imaginary; 126 | (*data)[cislo].x = real; 127 | (*data)[cislo].y = imaginary; 128 | } 129 | } 130 | else { 131 | printf("\nFile is void of any content!\n"); 132 | error++; 133 | } 134 | } 135 | else { 136 | cout << "File not found -> " << filename << " <-" << endl; 137 | error++; 138 | } 139 | FILEIN.close(); 140 | return(error); 141 | } 142 | 143 | 144 | int Load_filters(char *filename, int *nFilters, int *filter_length, float2 **data){ 145 | float real, imaginary; 146 | int file_size, cislo, error, filter_size; 147 | error=0; 148 | 149 | ifstream FILEIN; 150 | FILEIN.open(filename,ios::in); 151 | if (!FILEIN.fail()){ 152 | error=0; 153 | file_size = File_size_row_signal(FILEIN); 154 | (*filter_length) = file_size/(*nFilters); 155 | filter_size = (*nFilters)*(*filter_length); 156 | printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size); 157 | 158 | if(file_size>0){ 159 | *data = (float2*)malloc( filter_size*sizeof(float2)); 160 | memset( (*data), 0.0, filter_size*sizeof(float2)); 161 | 162 | if(*data==NULL){ 163 | printf("\nAllocation error!\n"); 164 | error++; 165 | } 166 | 167 | FILEIN.clear(); 168 | FILEIN.seekg(0,ios::beg); 169 | 170 | for (cislo=0; cislo < filter_size; cislo++) { 171 | FILEIN >> real >> imaginary; 172 | (*data)[cislo].x = real; 173 | (*data)[cislo].y = imaginary; 174 | } 175 | } 176 | else { 177 | printf("\nFile is void of any content!\n"); 178 | error++; 179 | } 180 | } 181 | else { 182 | cout << "File not found -> " << filename << " <-" << endl; 183 | error++; 184 | } 185 | FILEIN.close(); 186 | return(error); 187 | } 188 | 189 | 190 | int GPU_CONV(float2 *h_input, float2 *h_output, float2 *h_filters, int signal_length, int filter_length, int nFilters, float h, int nRuns, double *execution_time); 191 | 192 | 193 | int main(int argc, char* argv[]) { 194 | int nTimesamples; // input signal length 195 | int filter_length; // filter length 196 | int nFilters; // number of filters 197 | int nRuns; 198 | char input_type='0'; 199 | char input_filter_file[255]; 200 | char input_signal_file[255]; 201 | char output_signal_file[255]; 202 | 203 | char * pEnd; 204 | if (argc>2) { 205 | if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);} 206 | input_type=*argv[1]; 207 | } 208 | if (input_type == 'f' && argc==6) { 209 | if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);} 210 | sprintf(input_signal_file,"%s",argv[2]); 211 | if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);} 212 | sprintf(input_filter_file,"%s",argv[3]); 213 | if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);} 214 | sprintf(output_signal_file,"%s",argv[4]); 215 | nFilters = strtol(argv[5],&pEnd,10); 216 | nRuns = 1; 217 | } 218 | else if (input_type == 'r' && argc==6) { 219 | nTimesamples = strtol(argv[2],&pEnd,10); 220 | filter_length = strtol(argv[3],&pEnd,10); 221 | nFilters = strtol(argv[4],&pEnd,10); 222 | nRuns = strtol(argv[5],&pEnd,10); 223 | } 224 | else { 225 | printf("Parameters error!\n"); 226 | printf(" 1) Input type: 'r' or 'f' \n"); 227 | printf("----------------------------------\n"); 228 | printf("'f' - file input provided by user\n"); 229 | printf(" 2) Input signal file\n"); 230 | printf(" 3) Input filter file\n"); 231 | printf(" 4) Output signal file\n"); 232 | printf(" 5) number of filters\n"); 233 | printf(" Example: CONV.exe f signal.dat filter.dat output.dat 32\n"); 234 | printf("----------------------------------\n"); 235 | printf(" 'r' - random input generated by the code\n"); 236 | printf(" 2) Signal length in number of time samples\n"); 237 | printf(" 3) Filter length in samples\n"); 238 | printf(" 4) Number of templates\n"); 239 | printf(" 5) number of GPU kernel runs\n"); 240 | printf(" Example: CONV.exe r 2097152 193 32 10\n"); 241 | return 1; 242 | } 243 | 244 | if (DEBUG) { 245 | printf("Parameters:\n"); 246 | printf("Input signal and templates are "); 247 | if (input_type == 'r') { 248 | printf("randomly generated.\n"); 249 | printf("Signal length: %d samples\n", nTimesamples); 250 | printf("Filter length: %d samples\n", filter_length); 251 | printf("Number of filters: %d\n", nFilters); 252 | printf("nRuns: %d\n", nRuns); 253 | } 254 | if (input_type == 'f') { 255 | printf("read from file.\n"); 256 | printf("Input signal: %s\n", input_signal_file); 257 | printf("Input filter: %s\n", input_filter_file); 258 | printf("Output signal: %s\n", output_signal_file); 259 | printf("nFilters: %d\n", nFilters); 260 | printf("nRuns: %d\n", nRuns); 261 | printf("-----------------\n"); 262 | } 263 | } 264 | 265 | float2 *h_input; // input signal 266 | float2 *h_output; // output plane 267 | float2 *h_filters_padded; // filters in time-domain padded with zeroes 268 | float2 *h_filters; // filters in time-domain 269 | 270 | if (input_type == 'f') { 271 | int error=0; 272 | error += Load_signal(input_signal_file, &nTimesamples, &h_input); 273 | error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters); 274 | if( error>0 ){exit(1);} 275 | else if (VERBOSE) printf("File loaded\n"); 276 | } 277 | 278 | if (input_type == 'r') { 279 | h_input = (float2 *)malloc(nTimesamples*sizeof(float2)); 280 | h_filters = (float2 *)malloc(filter_length*nFilters*sizeof(float2)); 281 | srand(time(NULL)); 282 | Generate_signal(h_input, nTimesamples); 283 | Generate_templates(h_filters, filter_length, nFilters); 284 | if (VERBOSE) printf("Signal and filters generated\n"); 285 | } 286 | 287 | size_t filter_size_padded = nFilters*CONV_SIZE; 288 | h_filters_padded = (float2*)malloc(filter_size_padded*sizeof(float2)); 289 | Pad_templates(h_filters, h_filters_padded, filter_length, CONV_SIZE, nFilters); 290 | 291 | //----------------> Results 292 | double execution_time = 0; 293 | Performance_results CONV_cuFFT; 294 | CONV_cuFFT.Assign(nTimesamples, filter_length, nFilters, nRuns, 0, CONV_SIZE, nFilters, "CONV_cuFFT.dat", "cuFFT"); 295 | 296 | int offset = filter_length/2; // we assume that filter is centered around zero 297 | int useful_part_size = CONV_SIZE - filter_length + 1; 298 | int nConvolutions = (nTimesamples + useful_part_size - 1)/useful_part_size; 299 | 300 | if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);} 301 | if(DEBUG) { 302 | printf("offset=%d; useful_part_size=%d; nConvolutions=%d;\n", offset, useful_part_size, nConvolutions); 303 | } 304 | 305 | size_t output_size = nFilters*useful_part_size*nConvolutions; 306 | h_output = (float2*)malloc(output_size*sizeof(float2)); 307 | 308 | if (VERBOSE) printf("Convolution - cuFFT\n"); 309 | 310 | //----------------> GPU kernel 311 | float h = 20.0; 312 | int GPU_error = GPU_CONV(h_input, h_output, h_filters_padded, nTimesamples, filter_length, nFilters, h, nRuns, &execution_time); 313 | CONV_cuFFT.GPU_time = execution_time; 314 | if(VERBOSE) printf(" Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time); 315 | if(VERBOSE) {cout << " All parameters: "; CONV_cuFFT.Print();} 316 | if(WRITE && GPU_error==0) CONV_cuFFT.Save(); 317 | //----------------> GPU kernel 318 | 319 | if(CHECK){ 320 | double total_error, mean_error; 321 | printf("Checking results...\n"); 322 | Full_CONV_check(h_output, h_input, h_filters, nTimesamples, filter_length, useful_part_size, (filter_length>>1), CONV_SIZE, nConvolutions, nFilters, h, &total_error, &mean_error); 323 | //printf("Total error: %e; Mean error: %e\n", total_error, mean_error); 324 | } 325 | 326 | if (input_type == 'f') { 327 | Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file); 328 | } 329 | 330 | free(h_input); 331 | free(h_output); 332 | free(h_filters_padded); 333 | free(h_filters); 334 | 335 | cudaDeviceReset(); 336 | 337 | if (VERBOSE) printf("Finished!\n"); 338 | 339 | return (0); 340 | } 341 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/Makefile: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # CUDA_HOME are supposed to be on default position 3 | # and set it in your PATH .bashrc 4 | ############################################################### 5 | INC := -I${CUDA_HOME}/include 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft_static -lculibos -lcuda 7 | 8 | GCC = g++ 9 | NVCC = ${CUDA_HOME}/bin/nvcc 10 | 11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo 12 | 13 | GCC_OPTS =-O3 -Wall -Wextra $(INC) 14 | 15 | ANALYZE = CONV.exe 16 | 17 | ifdef reglim 18 | NVCCFLAGS += --maxrregcount=$(reglim) 19 | endif 20 | 21 | all: clean analyze 22 | 23 | analyze: CONV_C2C.o CONV-32bit_cuFFT.o Makefile 24 | $(NVCC) -o $(ANALYZE) CONV-32bit_cuFFT.o CONV_C2C.o $(LIB) $(NVCCFLAGS) 25 | 26 | CONV-32bit_cuFFT.o: timer.h utils_cuda.h 27 | $(NVCC) -c CONV-32bit_cuFFT.cu $(NVCCFLAGS) -dc -m64 28 | 29 | CONV_C2C.o: CONV_C2C.cpp 30 | $(GCC) -c CONV_C2C.cpp $(GCC_OPTS) 31 | 32 | clean: 33 | rm -f *.o *.~ $(ANALYZE) 34 | 35 | 36 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/When_cuFFT_wins_cuFFT.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for convsize in 1024 2048 4096 8192 16384; 4 | do 5 | echo "#define CONV_SIZE $convsize" > params.h 6 | 7 | rm CONV.exe 8 | make 9 | for tempsize in {64..4096..32} 10 | do 11 | for templates in 32; 12 | do 13 | ./CONV.exe r 2097152 $tempsize $templates 20 0 14 | done 15 | done 16 | done 17 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/benchmark_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_cuFFT.dat; 4 | 5 | ./benchmark_performance.sh; 6 | mv CONV_cuFFT.dat OLS_cuFFT_callbacks_pp_perf.dat; 7 | 8 | ./When_cuFFT_wins_cuFFT.sh 9 | mv CONV_cuFFT.dat OLS_cuFFT_callbacks_pp_whencuFFTwins.dat; -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/benchmark_performance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for convsize in 1024 2048 4096 8192 16384; 4 | do 5 | 6 | echo "#define CONV_SIZE $convsize" > params.h 7 | rm CONV.exe 8 | make 9 | for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073; 10 | do 11 | for templates in 2 4 8 11 16 32 51 64 96; 12 | do 13 | ./CONV.exe r 262144 $tempsize $templates 20 14 | ./CONV.exe r 524288 $tempsize $templates 20 15 | ./CONV.exe r 1048576 $tempsize $templates 20 16 | ./CONV.exe r 2097152 $tempsize $templates 20 17 | ./CONV.exe r 4194304 $tempsize $templates 20 18 | ./CONV.exe r 8388608 $tempsize $templates 20 19 | done 20 | done 21 | done 22 | 23 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/conv_check.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double max_error = 1.0e-4; 11 | 12 | void CPU_time_domain(float2 *h_input, float2 *h_CPU_output_timedomain, float2 *h_filters, int signal_length, int filter_length, int nFilters){ 13 | for(int f=0; f>1)); 23 | if(signal_pos>=0 && signal_pos=(nTimesamples-1) ) { 48 | right = h_CPU_output_reduced[f*nTimesamples + nTimesamples - 1]; 49 | } 50 | else { 51 | right = h_CPU_output_reduced[pos+1]; 52 | } 53 | 54 | result.x = (left.x - right.x)/(2.0*h); 55 | result.y = (left.y - right.y)/(2.0*h); 56 | h_CPU_postprocessed[pos] = result; 57 | } 58 | } 59 | } 60 | 61 | 62 | float get_error(float2 A_f2, float2 B_f2){ 63 | float error, div_error=10000, per_error=10000, order=0; 64 | int power; 65 | float A = max(A_f2.x, A_f2.y); 66 | float B = max(B_f2.x, B_f2.y); 67 | if(A<0) A = -A; 68 | if(B<0) B = -B; 69 | 70 | if (A>B) { 71 | div_error = A-B; 72 | if(B>10){ 73 | power = (int) log10(B); 74 | order = pow(10,power); 75 | div_error = div_error/order; 76 | } 77 | } 78 | else { 79 | div_error = B-A; 80 | if(A>10){ 81 | power = (int) log10(A); 82 | order = pow(10,power); 83 | div_error = div_error/order; 84 | } 85 | } 86 | 87 | if(div_error max_error ){ 111 | nErrors++; 112 | if(cislo<40){ 113 | printf("Error [%f] CPU [%f;%f] GPU [%f;%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU.x, CPU.y, GPU.x, GPU.y, x, y, (int) ((x + GPU_offset)/useful_part_size), (x + GPU_offset)%useful_part_size); 114 | cislo++; 115 | } 116 | } 117 | } 118 | } 119 | } 120 | mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y)); 121 | (*total_error) = total_error_l; 122 | (*mean_error) = mean_error_l; 123 | return(nErrors); 124 | } 125 | 126 | 127 | void Full_CONV_check(float2 *GPU_result, float2 *h_input, float2 *h_filters, int signal_length, int filter_length, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, float h, double *cumulative_error, double *mean_error){ 128 | size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters; 129 | float2 *h_CPU_output_timedomain; 130 | float2 *h_CPU_postprocessed; 131 | h_CPU_output_timedomain = (float2 *)malloc(output_size_timedomain*sizeof(float2)); 132 | h_CPU_postprocessed = (float2 *)malloc(output_size_timedomain*sizeof(float2)); 133 | memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float2)); 134 | memset(h_CPU_postprocessed, 0.0, output_size_timedomain*sizeof(float2)); 135 | 136 | printf("\n--> Time-domain convolution:"); 137 | CPU_time_domain(h_input, h_CPU_output_timedomain, h_filters, signal_length, filter_length, nFilters); 138 | 139 | printf("\n--> Post-processing:\n"); 140 | CPU_postprocess(h_CPU_postprocessed, h_CPU_output_timedomain, (signal_length + filter_length - 1), nFilters, h); 141 | 142 | float GPU_scale, CPU_scale; 143 | int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples; 144 | 145 | #ifdef POST_PROCESS 146 | 147 | printf("\n--> Comparison to CPU time-domain with post-processing:\n"); 148 | GPU_scale = conv_length; 149 | CPU_scale = 1.0; 150 | GPU_offset = 0; 151 | CPU_offset = 0; 152 | GPU_dim_x = nConvolutions*useful_part_size; 153 | CPU_dim_x = (signal_length + filter_length - 1); 154 | nSamples = signal_length - offset; 155 | Compare_data(h_CPU_postprocessed, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 156 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 157 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 158 | else printf("FAILED\n"); 159 | 160 | #else 161 | 162 | printf("\n--> Comparison to CPU time-domain:\n"); 163 | GPU_scale = conv_length; 164 | CPU_scale = 1.0; 165 | GPU_offset = 0; 166 | CPU_offset = 0; 167 | GPU_dim_x = nConvolutions*useful_part_size; 168 | CPU_dim_x = (signal_length + filter_length - 1); 169 | nSamples = signal_length-offset; 170 | Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 171 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 172 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 173 | else printf("FAILED\n"); 174 | 175 | #endif 176 | 177 | free(h_CPU_output_timedomain); 178 | free(h_CPU_postprocessed); 179 | } 180 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/debug.h: -------------------------------------------------------------------------------- 1 | #define VERBOSE true 2 | #define DEBUG false 3 | #define CHECK false 4 | #define WRITE true 5 | 6 | #define DEVICEID 0 7 | #define POST_PROCESS 8 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/params.h: -------------------------------------------------------------------------------- 1 | #define CONV_SIZE 8192 2 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/results.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | class Performance_results{ 9 | public: 10 | double GPU_time; 11 | int nTimesamples; 12 | int template_length; 13 | int nTemplates; 14 | int nRuns; 15 | int reglim; 16 | int OaS_conv_size; 17 | int templates_per_block; 18 | char filename[200]; 19 | char kernel[10]; 20 | 21 | Performance_results() { 22 | GPU_time=0; 23 | } 24 | 25 | void Save(){ 26 | ofstream FILEOUT; 27 | FILEOUT.open (filename, std::ofstream::out | std::ofstream::app); 28 | FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 29 | FILEOUT.close(); 30 | } 31 | 32 | void Print(){ 33 | cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 34 | } 35 | 36 | void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){ 37 | nTimesamples = t_nTimesamples; 38 | template_length = t_template_length; 39 | nTemplates = t_nTemplates; 40 | nRuns = t_nRuns; 41 | reglim = t_reglim; 42 | OaS_conv_size = t_OaS_conv_size; 43 | templates_per_block = t_templates_per_block; 44 | sprintf(filename,"%s", t_filename); 45 | sprintf(kernel,"%s",t_kernel); 46 | } 47 | 48 | }; 49 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_cuFFT_callbacks_pp/utils_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/CONV_SM_OLS_C2C.cpp: -------------------------------------------------------------------------------- 1 | //******************************************************************************************** 2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 3 | //* Copyright (C) 2019 Adámek Karel 4 | //* 5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 6 | //******************************************************************************************** 7 | 8 | #include "debug.h" 9 | #include "params.h" 10 | #include "results.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | 22 | #include "conv_check.h" 23 | 24 | 25 | void Generate_random_signal(float2 *h_input, int signal_length){ 26 | for(int f=0; f15000){ 39 | for(int f=15000; f=(nSamples/2-boxcar_width/2) && f<( nSamples/2+boxcar_width/2) ){ 80 | h_filters[t*nSamples + f].x=1; 81 | h_filters[t*nSamples + f].y=0; 82 | } 83 | else { 84 | h_filters[t*nSamples + f].x=0; 85 | h_filters[t*nSamples + f].y=0; 86 | } 87 | } 88 | } 89 | } 90 | 91 | void Pad_templates(float2 *h_filters_time, float2 *h_filters_padded, int filter_length, int corrected_filter_length, int convolution_size, int nFilters){ 92 | float2 *tmp_filter; 93 | tmp_filter = new float2[corrected_filter_length]; 94 | 95 | for(int f=0; f=corrected_filter_length/2) { 120 | h_filters_padded[t*convolution_size + f - corrected_filter_length/2].x=tmp_filter[f].x; 121 | h_filters_padded[t*convolution_size + f - corrected_filter_length/2].y=tmp_filter[f].y; 122 | } 123 | else if(f0){ 186 | *data = (float2*)malloc(file_size*sizeof(float2)); 187 | memset( (*data), 0.0, file_size*sizeof(float2)); 188 | if(*data==NULL){ 189 | printf("\nAllocation error!\n"); 190 | error++; 191 | } 192 | 193 | FILEIN.clear(); 194 | FILEIN.seekg(0,ios::beg); 195 | 196 | for (cislo = 0; cislo < file_size; cislo++) { 197 | FILEIN >> real >> imaginary; 198 | (*data)[cislo].x = real; 199 | (*data)[cislo].y = imaginary; 200 | } 201 | } 202 | else { 203 | printf("\nFile is void of any content!\n"); 204 | error++; 205 | } 206 | } 207 | else { 208 | cout << "File not found -> " << filename << " <-" << endl; 209 | error++; 210 | } 211 | FILEIN.close(); 212 | return(error); 213 | } 214 | 215 | 216 | int Load_filters(char *filename, int *nFilters, int *filter_length, float2 **data){ 217 | float real, imaginary; 218 | int file_size, cislo, error, filter_size; 219 | error=0; 220 | 221 | ifstream FILEIN; 222 | FILEIN.open(filename,ios::in); 223 | if (!FILEIN.fail()){ 224 | error=0; 225 | file_size = File_size_row_signal(FILEIN); 226 | (*filter_length) = file_size/(*nFilters); 227 | filter_size = (*nFilters)*(*filter_length); 228 | printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size); 229 | 230 | if(file_size>0){ 231 | *data = (float2*)malloc( filter_size*sizeof(float2)); 232 | memset( (*data), 0.0, filter_size*sizeof(float2)); 233 | 234 | if(*data==NULL){ 235 | printf("\nAllocation error!\n"); 236 | error++; 237 | } 238 | 239 | FILEIN.clear(); 240 | FILEIN.seekg(0,ios::beg); 241 | 242 | for (cislo=0; cislo < filter_size; cislo++) { 243 | FILEIN >> real >> imaginary; 244 | (*data)[cislo].x = real; 245 | (*data)[cislo].y = imaginary; 246 | } 247 | } 248 | else { 249 | printf("\nFile is void of any content!\n"); 250 | error++; 251 | } 252 | } 253 | else { 254 | cout << "File not found -> " << filename << " <-" << endl; 255 | error++; 256 | } 257 | FILEIN.close(); 258 | return(error); 259 | } 260 | 261 | 262 | int GPU_convolution_OLS_customFFT(float2 *h_input_signal, float2 *h_output_plane, float2 *h_filters, int signal_length, int convolution_length, int filter_length, int past_filter_samples, int nFilters, int nRuns, int kernel_type, double *execution_time); 263 | 264 | 265 | int main(int argc, char* argv[]) { 266 | int signal_length; 267 | int filter_length; 268 | int past_filter_samples; 269 | int convolution_length; 270 | int nFilters; 271 | int nRuns; 272 | char input_type='0'; 273 | char input_filter_file[255]; 274 | char input_signal_file[255]; 275 | char output_signal_file[255]; 276 | 277 | char * pEnd; 278 | if (argc>2) { 279 | if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);} 280 | input_type=*argv[1]; 281 | } 282 | if (input_type == 'f' && argc==8) { 283 | if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);} 284 | sprintf(input_signal_file,"%s",argv[2]); 285 | if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);} 286 | sprintf(input_filter_file,"%s",argv[3]); 287 | if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);} 288 | sprintf(output_signal_file,"%s",argv[4]); 289 | 290 | convolution_length = strtol(argv[5],&pEnd,10); 291 | nFilters = strtol(argv[6],&pEnd,10); 292 | past_filter_samples = strtol(argv[7],&pEnd,10); 293 | nRuns = 1; 294 | } 295 | else if (input_type == 'r' && argc==8) { 296 | signal_length = strtol(argv[2],&pEnd,10); 297 | filter_length = strtol(argv[3],&pEnd,10); 298 | past_filter_samples = strtol(argv[4],&pEnd,10); 299 | convolution_length = strtol(argv[5],&pEnd,10); 300 | nFilters = strtol(argv[6],&pEnd,10); 301 | 302 | nRuns = strtol(argv[7],&pEnd,10); 303 | } 304 | else { 305 | printf("Parameters error!\n"); 306 | printf(" 1) Input type: 'r' or 'f' \n"); 307 | printf("----------------------------------\n"); 308 | printf("Parameters if input type is 'f' - file input provided by user\n"); 309 | printf(" 2) Input signal file\n"); 310 | printf(" 3) Input filter file\n"); 311 | printf(" 4) Output signal file\n"); 312 | printf(" 5) Convolution length in samples\n"); 313 | printf(" 6) number of filters\n"); 314 | printf(" 7) number of past samples in the filter.\n"); 315 | printf(" for past filter (causal) it is (filter_length - 1)\n"); 316 | printf(" for odd centered filter it is floor(filter_length/2)\n"); 317 | printf(" for future filter it is 0\n"); 318 | printf(" Example: CONV.exe f signal.dat filter.dat output.dat 2048 32 192\n"); 319 | printf("----------------------------------\n"); 320 | printf("Parameters if input type is 'r' - random input generated by the code\n"); 321 | printf(" 2) Signal length in number of time samples\n"); 322 | printf(" 3) Filter length in samples\n"); 323 | printf(" 4) number of past samples in the filter.\n"); 324 | printf(" for past filter (causal) it is (filter_length - 1)\n"); 325 | printf(" for odd centered filter it is floor(filter_length/2)\n"); 326 | printf(" for future filter it is 0\n"); 327 | printf(" 5) Convolution length in samples\n"); 328 | printf(" 6) Number of filters\n"); 329 | printf(" 7) number of GPU kernel runs\n"); 330 | printf(" Example: CONV.exe r 2097152 193 192 2048 32 10\n"); 331 | return 1; 332 | } 333 | 334 | if (DEBUG) { 335 | printf("Parameters:\n"); 336 | printf("Input signal and filters are "); 337 | if (input_type == 'r') { 338 | printf("randomly generated.\n"); 339 | printf("Signal length: %d samples\n", signal_length); 340 | printf("Filter length: %d samples\n", filter_length); 341 | printf("# of past samples: %d samples\n", past_filter_samples); 342 | printf("Convolution length: %d samples\n", convolution_length); 343 | printf("Number of filters: %d\n", nFilters); 344 | printf("nRuns: %d\n", nRuns); 345 | } 346 | if (input_type == 'f') { 347 | printf("read from file.\n"); 348 | printf("Input signal: %s\n", input_signal_file); 349 | printf("Input filter: %s\n", input_filter_file); 350 | printf("Output signal: %s\n", output_signal_file); 351 | printf("Convolution length: %d samples\n", convolution_length); 352 | printf("nFilters: %d\n", nFilters); 353 | printf("# of past samples: %d samples\n", past_filter_samples); 354 | printf("nRuns: %d\n", nRuns); 355 | printf("-----------------\n"); 356 | } 357 | } 358 | 359 | float2 *h_input; 360 | float2 *h_output; 361 | float2 *h_filters; // filters in time-domain 362 | float2 *h_filters_padded; // filters in time-domain padded with zeroes 363 | 364 | if (input_type == 'f') { 365 | int error=0; 366 | error += Load_signal(input_signal_file, &signal_length, &h_input); 367 | error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters); 368 | if( error>0 ){exit(1);} 369 | else if (VERBOSE) printf("File loaded\n"); 370 | } 371 | 372 | //----------------> Results 373 | double execution_time = 0; 374 | Performance_results CONV_cuFFT; 375 | CONV_cuFFT.Assign(signal_length, filter_length, nFilters, nRuns, 0, convolution_length, nFilters, "CONV_kFFT.dat", "one"); 376 | 377 | 378 | int corrected_filter_length; 379 | if( filter_length%2==0 ) corrected_filter_length = filter_length + 1; 380 | else corrected_filter_length = filter_length; 381 | int useful_part_size = convolution_length - corrected_filter_length + 1; 382 | int nConvolutions = signal_length/useful_part_size; 383 | if( (signal_length%useful_part_size)>0 ) nConvolutions++; 384 | if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);} 385 | 386 | 387 | if (input_type == 'r') { 388 | h_input = (float2 *)malloc(signal_length*sizeof(float2)); 389 | h_filters = (float2 *)malloc(filter_length*nFilters*sizeof(float2)); 390 | srand(time(NULL)); 391 | Generate_random_signal(h_input, signal_length); 392 | Generate_random_filter(h_filters, filter_length, nFilters); 393 | if (VERBOSE) printf("Signal and filters generated\n"); 394 | } 395 | 396 | size_t filter_size_padded = nFilters*convolution_length; 397 | h_filters_padded = (float2*)malloc(filter_size_padded*sizeof(float2)); 398 | Pad_templates(h_filters, h_filters_padded, filter_length, corrected_filter_length, convolution_length, nFilters); 399 | 400 | size_t output_size = nFilters*useful_part_size*nConvolutions; 401 | h_output = (float2*)malloc(output_size*sizeof(float2)); 402 | 403 | if (VERBOSE) printf("Convolution - kFFT\n"); 404 | 405 | //----------------> GPU kernel 406 | int kernel_type=1; //one filter per iteration 407 | GPU_convolution_OLS_customFFT(h_input, h_output, h_filters_padded, signal_length, convolution_length, corrected_filter_length, past_filter_samples, nFilters, nRuns, kernel_type, &execution_time); 408 | CONV_cuFFT.GPU_time = execution_time; 409 | if(VERBOSE) printf(" Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time); 410 | if(VERBOSE) {cout << " All parameters: "; CONV_cuFFT.Print();} 411 | if(WRITE) CONV_cuFFT.Save(); 412 | //----------------> GPU kernel 413 | 414 | if(CHECK){ 415 | double total_error, mean_error; 416 | printf("Checking results...\n"); 417 | Full_CONV_check(h_output, h_input, h_filters, signal_length, filter_length, past_filter_samples, useful_part_size, convolution_length, nConvolutions, nFilters, &total_error, &mean_error); 418 | //printf("Total error: %e; Mean error: %e\n", total_error, mean_error); 419 | } 420 | 421 | if (input_type == 'f') { 422 | Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file); 423 | } 424 | 425 | free(h_input); 426 | free(h_output); 427 | free(h_filters_padded); 428 | free(h_filters); 429 | 430 | cudaDeviceReset(); 431 | 432 | if (VERBOSE) printf("Finished!\n"); 433 | 434 | return (0); 435 | } 436 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/Makefile: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # CUDA_HOME are supposed to be on default position 3 | # and set it in your PATH .bashrc 4 | ############################################################### 5 | INC := -I${CUDA_HOME}/include 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft -lcuda 7 | 8 | GCC = g++ 9 | NVCC = ${CUDA_HOME}/bin/nvcc 10 | 11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo 12 | 13 | GCC_OPTS =-O3 -Wall -Wextra $(INC) 14 | 15 | ANALYZE = CONV.exe 16 | 17 | 18 | ifdef reglim 19 | NVCCFLAGS += --maxrregcount=$(reglim) 20 | endif 21 | 22 | all: clean onefilter 23 | 24 | onefilter: CONV_SM_OLS_C2C.o CONV-32bit_customFFT.o Makefile 25 | $(NVCC) -o CONV.exe CONV_SM_OLS_C2C.o CONV-32bit_customFFT.o $(LIB) $(NVCCFLAGS) 26 | 27 | CONV-32bit_customFFT.o: timer.h utils_cuda.h 28 | $(NVCC) -c CONV-32bit_customFFT.cu $(NVCCFLAGS) 29 | 30 | CONV_SM_OLS_C2C.o: CONV_SM_OLS_C2C.cpp 31 | $(GCC) -c CONV_SM_OLS_C2C.cpp $(GCC_OPTS) 32 | 33 | clean: 34 | rm -f *.o *.~ CONV_*.exe 35 | 36 | 37 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/When_cuFFT_wins.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_1f.exe; 4 | rm CONV_2f.exe; 5 | rm *.o; 6 | make reglim=0 > /dev/null 2>&1 7 | for convlength in 256 512 1024 2048 4096; 8 | do 9 | for tempsize in {64..4096..32} 10 | do 11 | ./CONV.exe r 2097152 $tempsize $convlength 32 20 12 | done 13 | done 14 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/benchmark_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_kFFT.dat; 4 | 5 | ./benchmark_performance.sh; 6 | mv CONV_kFFT.dat OLS_SM_C2C_perf.dat; 7 | 8 | ./When_cuFFT_wins.sh 9 | mv CONV_kFFT.dat OLS_SM_C2C_whencuFFTwins.dat; -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/benchmark_performance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | rm CONV.exe; 5 | rm *.o; 6 | make reglim=$reg > /dev/null 2>&1 7 | for convlength in 256 512 1024 2048 4096 8 | do 9 | for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073; 10 | do 11 | for templates in 2 4 8 16 32 64 96; 12 | do 13 | ./CONV.exe r 262144 $tempsize $convlength $templates 20 14 | ./CONV.exe r 524288 $tempsize $convlength $templates 20 15 | ./CONV.exe r 1048576 $tempsize $convlength $templates 20 16 | ./CONV.exe r 2097152 $tempsize $convlength $templates 20 17 | ./CONV.exe r 4194304 $tempsize $convlength $templates 20 18 | ./CONV.exe r 8388608 $tempsize $convlength $templates 20 19 | done 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/conv_check.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double max_error = 1.0e-4; 11 | 12 | void CPU_time_domain(float2 *h_input, float2 *h_CPU_output_timedomain, float2 *h_filters, int signal_length, int filter_length, int past_filter_samples, int nFilters){ 13 | for(int f=0; f=0 && signal_posB) { 44 | div_error = A-B; 45 | if(B>10){ 46 | power = (int) log10(B); 47 | order = pow(10,power); 48 | div_error = div_error/order; 49 | } 50 | } 51 | else { 52 | div_error = B-A; 53 | if(A>10){ 54 | power = (int) log10(A); 55 | order = pow(10,power); 56 | div_error = div_error/order; 57 | } 58 | } 59 | 60 | if(div_error max_error ){ 83 | nErrors++; 84 | if(cislo<40){ 85 | printf("Error [%f] CPU [%f;%f] GPU [%f;%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU.x, CPU.y, GPU.x, GPU.y, x, y, (int) (x/useful_part_size), x%useful_part_size); 86 | cislo++; 87 | } 88 | } 89 | } 90 | } 91 | mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y)); 92 | (*total_error) = total_error_l; 93 | (*mean_error) = mean_error_l; 94 | return(nErrors); 95 | } 96 | 97 | 98 | void Full_CONV_check(float2 *GPU_result, float2 *h_input, float2 *h_filters, int signal_length, int filter_length, int past_filter_samples, int useful_part_size, int conv_length, int nConvolutions, int nFilters, double *cumulative_error, double *mean_error){ 99 | size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters; 100 | float2 *h_CPU_output_timedomain; 101 | h_CPU_output_timedomain = (float2 *)malloc(output_size_timedomain*sizeof(float2)); 102 | memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float2)); 103 | 104 | printf("\n--> Time-domain convolution:"); 105 | CPU_time_domain(h_input, h_CPU_output_timedomain, h_filters, signal_length, filter_length, past_filter_samples, nFilters); 106 | 107 | float GPU_scale, CPU_scale; 108 | int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples; 109 | 110 | printf("\n--> Comparison to CPU time-domain:\n"); 111 | GPU_scale = conv_length; 112 | GPU_scale = 1.0; 113 | CPU_scale = 1.0; 114 | GPU_offset = 0; 115 | CPU_offset = 0; 116 | GPU_dim_x = nConvolutions*useful_part_size; 117 | CPU_dim_x = (signal_length + filter_length - 1); 118 | nSamples = signal_length; 119 | Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 120 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 121 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 122 | else printf("FAILED\n"); 123 | 124 | free(h_CPU_output_timedomain); 125 | } 126 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/debug.h: -------------------------------------------------------------------------------- 1 | #define VERBOSE true 2 | #define DEBUG false 3 | #define WRITE true 4 | #define CHECK false 5 | 6 | #define DEVICEID 0 7 | 8 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/params.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAdamek/GPU_Overlap-and-save_convolution/f2ee0e3323a3e3f6f1bb96e864ad7fce5c9234ec/GPU_OLS_C2C_sharedmemory/params.h -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/results.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | class Performance_results{ 9 | public: 10 | double GPU_time; 11 | int nTimesamples; 12 | int template_length; 13 | int nTemplates; 14 | int nRuns; 15 | int reglim; 16 | int OaS_conv_size; 17 | int templates_per_block; 18 | char filename[200]; 19 | char kernel[10]; 20 | 21 | Performance_results() { 22 | GPU_time=0; 23 | } 24 | 25 | void Save(){ 26 | ofstream FILEOUT; 27 | FILEOUT.open (filename, std::ofstream::out | std::ofstream::app); 28 | FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 29 | FILEOUT.close(); 30 | } 31 | 32 | void Print(){ 33 | cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 34 | } 35 | 36 | void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){ 37 | nTimesamples = t_nTimesamples; 38 | template_length = t_template_length; 39 | nTemplates = t_nTemplates; 40 | nRuns = t_nRuns; 41 | reglim = t_reglim; 42 | OaS_conv_size = t_OaS_conv_size; 43 | templates_per_block = t_templates_per_block; 44 | sprintf(filename,"%s", t_filename); 45 | sprintf(kernel,"%s",t_kernel); 46 | } 47 | 48 | }; 49 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory/utils_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/Makefile: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # CUDA_HOME are supposed to be on default position 3 | # and set it in your PATH .bashrc 4 | ############################################################### 5 | INC := -I${CUDA_HOME}/include 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft -lcuda 7 | 8 | GCC = g++ 9 | NVCC = ${CUDA_HOME}/bin/nvcc 10 | 11 | NVCCFLAGS = -O3 -arch=sm_60 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo 12 | 13 | GCC_OPTS =-O3 -Wall -Wextra $(INC) 14 | 15 | ANALYZE = CONV.exe 16 | 17 | 18 | ifdef reglim 19 | NVCCFLAGS += --maxrregcount=$(reglim) 20 | endif 21 | 22 | all: clean onefilter 23 | 24 | onefilter: CONV_SM_OLS_C2C.o CONV-32bit_customFFT.o Makefile 25 | $(NVCC) -o CONV.exe CONV_SM_OLS_C2C.o CONV-32bit_customFFT.o $(LIB) $(NVCCFLAGS) 26 | 27 | CONV-32bit_customFFT.o: timer.h utils_cuda.h 28 | $(NVCC) -c CONV-32bit_customFFT.cu $(NVCCFLAGS) 29 | 30 | CONV_SM_OLS_C2C.o: CONV_SM_OLS_C2C.cpp 31 | $(GCC) -c CONV_SM_OLS_C2C.cpp $(GCC_OPTS) 32 | 33 | clean: 34 | rm -f *.o *.~ CONV_*.exe 35 | 36 | 37 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/When_cuFFT_wins.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV.exe; 4 | rm *.o; 5 | make reglim=0 > /dev/null 2>&1 6 | for convlength in 256 512 1024 2048 4096; 7 | do 8 | for tempsize in {64..4096..32} 9 | do 10 | ./CONV.exe r 2097152 $tempsize $convlength 32 20 11 | done 12 | done 13 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/benchmark_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_kFFT.dat; 4 | 5 | ./benchmark_performance.sh; 6 | mv CONV_kFFT.dat OLS_SM_C2C_pp_perf.dat; 7 | 8 | ./When_cuFFT_wins.sh 9 | mv CONV_kFFT.dat OLS_SM_C2C_pp_whencuFFTwins.dat; -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/benchmark_performance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | rm CONV.exe; 5 | rm *.o; 6 | make reglim=$reg > /dev/null 2>&1 7 | for convlength in 256 512 1024 2048 4096 8 | do 9 | for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073; 10 | do 11 | for templates in 2 4 8 16 32 64 96; 12 | do 13 | ./CONV.exe r 262144 $tempsize $convlength $templates 20 14 | ./CONV.exe r 524288 $tempsize $convlength $templates 20 15 | ./CONV.exe r 1048576 $tempsize $convlength $templates 20 16 | ./CONV.exe r 2097152 $tempsize $convlength $templates 20 17 | ./CONV.exe r 4194304 $tempsize $convlength $templates 20 18 | ./CONV.exe r 8388608 $tempsize $convlength $templates 20 19 | done 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/conv_check.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double max_error = 1.0e-4; 11 | 12 | void CPU_time_domain(float2 *h_input, float2 *h_CPU_output_timedomain, float2 *h_filters, int signal_length, int filter_length, int past_filter_samples, int nFilters){ 13 | for(int f=0; f=0 && signal_pos=(nTimesamples-1) ) { 48 | right = h_CPU_output_reduced[f*nTimesamples + nTimesamples - 1]; 49 | } 50 | else { 51 | right = h_CPU_output_reduced[pos+1]; 52 | } 53 | 54 | result.x = (left.x - right.x)/(2.0*h); 55 | result.y = (left.y - right.y)/(2.0*h); 56 | h_CPU_postprocessed[pos] = result; 57 | } 58 | } 59 | } 60 | 61 | 62 | float get_error(float2 A_f2, float2 B_f2){ 63 | float error, div_error=10000, per_error=10000, order=0; 64 | int power; 65 | float A = max(A_f2.x, A_f2.y); 66 | float B = max(B_f2.x, B_f2.y); 67 | if(A<0) A = -A; 68 | if(B<0) B = -B; 69 | 70 | if (A>B) { 71 | div_error = A-B; 72 | if(B>10){ 73 | power = (int) log10(B); 74 | order = pow(10,power); 75 | div_error = div_error/order; 76 | } 77 | } 78 | else { 79 | div_error = B-A; 80 | if(A>10){ 81 | power = (int) log10(A); 82 | order = pow(10,power); 83 | div_error = div_error/order; 84 | } 85 | } 86 | 87 | if(div_error max_error ){ 111 | nErrors++; 112 | if(cislo<40){ 113 | printf("Error [%f] CPU [%f;%f] GPU [%f;%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU.x, CPU.y, GPU.x, GPU.y, x, y, (int) ((x + GPU_offset)/useful_part_size), (x + GPU_offset)%useful_part_size); 114 | cislo++; 115 | } 116 | } 117 | } 118 | } 119 | } 120 | mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y)); 121 | (*total_error) = total_error_l; 122 | (*mean_error) = mean_error_l; 123 | return(nErrors); 124 | } 125 | 126 | 127 | void Full_CONV_check(float2 *GPU_result, float2 *h_input, float2 *h_filters, int signal_length, int filter_length, int past_filter_samples, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, float h, double *cumulative_error, double *mean_error){ 128 | size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters; 129 | float2 *h_CPU_output_timedomain; 130 | float2 *h_CPU_postprocessed; 131 | h_CPU_output_timedomain = (float2 *)malloc(output_size_timedomain*sizeof(float2)); 132 | h_CPU_postprocessed = (float2 *)malloc(output_size_timedomain*sizeof(float2)); 133 | memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float2)); 134 | memset(h_CPU_postprocessed, 0.0, output_size_timedomain*sizeof(float2)); 135 | 136 | printf("\n--> Time-domain convolution:"); 137 | CPU_time_domain(h_input, h_CPU_output_timedomain, h_filters, signal_length, filter_length, past_filter_samples, nFilters); 138 | 139 | printf("\n--> Post-processing:\n"); 140 | CPU_postprocess(h_CPU_postprocessed, h_CPU_output_timedomain, (signal_length + filter_length - 1), nFilters, h); 141 | 142 | float GPU_scale, CPU_scale; 143 | int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples; 144 | 145 | #ifdef POST_PROCESS 146 | 147 | printf("\n--> Comparison to CPU time-domain with post-processing:\n"); 148 | GPU_scale = 1.0; 149 | CPU_scale = 1.0; 150 | GPU_offset = 0; 151 | CPU_offset = 0; 152 | GPU_dim_x = nConvolutions*useful_part_size; 153 | CPU_dim_x = (signal_length + filter_length - 1); 154 | nSamples = signal_length - offset; 155 | Compare_data(h_CPU_postprocessed, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 156 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 157 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 158 | else printf("FAILED\n"); 159 | #else 160 | 161 | printf("\n--> Comparison to CPU time-domain:\n"); 162 | GPU_scale = conv_length; 163 | GPU_scale = 1.0; 164 | CPU_scale = 1.0; 165 | GPU_offset = 0; 166 | CPU_offset = 0; 167 | GPU_dim_x = nConvolutions*useful_part_size; 168 | CPU_dim_x = (signal_length + filter_length - 1); 169 | nSamples = signal_length-offset; 170 | Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 171 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 172 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 173 | else printf("FAILED\n"); 174 | 175 | #endif 176 | 177 | free(h_CPU_output_timedomain); 178 | free(h_CPU_postprocessed); 179 | } 180 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/debug.h: -------------------------------------------------------------------------------- 1 | #define VERBOSE true 2 | #define DEBUG false 3 | #define WRITE true 4 | #define CHECK true 5 | 6 | #define DEVICEID 0 7 | //#define POST_PROCESS 8 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/params.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAdamek/GPU_Overlap-and-save_convolution/f2ee0e3323a3e3f6f1bb96e864ad7fce5c9234ec/GPU_OLS_C2C_sharedmemory_pp/params.h -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/results.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | class Performance_results{ 9 | public: 10 | double GPU_time; 11 | int nTimesamples; 12 | int template_length; 13 | int nTemplates; 14 | int nRuns; 15 | int reglim; 16 | int OaS_conv_size; 17 | int templates_per_block; 18 | char filename[200]; 19 | char kernel[10]; 20 | 21 | Performance_results() { 22 | GPU_time=0; 23 | } 24 | 25 | void Save(){ 26 | ofstream FILEOUT; 27 | FILEOUT.open (filename, std::ofstream::out | std::ofstream::app); 28 | FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 29 | FILEOUT.close(); 30 | } 31 | 32 | void Print(){ 33 | cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 34 | } 35 | 36 | void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){ 37 | nTimesamples = t_nTimesamples; 38 | template_length = t_template_length; 39 | nTemplates = t_nTemplates; 40 | nRuns = t_nRuns; 41 | reglim = t_reglim; 42 | OaS_conv_size = t_OaS_conv_size; 43 | templates_per_block = t_templates_per_block; 44 | sprintf(filename,"%s", t_filename); 45 | sprintf(kernel,"%s",t_kernel); 46 | } 47 | 48 | }; 49 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /GPU_OLS_C2C_sharedmemory_pp/utils_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/CONV_R2R.cpp: -------------------------------------------------------------------------------- 1 | //******************************************************************************************** 2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 3 | //* Copyright (C) 2019 Adámek Karel 4 | //* 5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 6 | //******************************************************************************************** 7 | 8 | 9 | #include "debug.h" 10 | #include "params.h" 11 | #include "results.h" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "conv_check_R2R.h" 23 | 24 | 25 | void Generate_signal(float *h_input, int signal_length){ 26 | for(int f=0; f=template_size/2) { 48 | h_filters[t*convolution_size + f - template_size/2] = h_filters_time[t*template_size + f]; 49 | } 50 | else if(f0){ 110 | *data = (float*)malloc(file_size*sizeof(float)); 111 | memset( (*data), 0.0, file_size*sizeof(float)); 112 | if(*data==NULL){ 113 | printf("\nAllocation error!\n"); 114 | error++; 115 | } 116 | 117 | FILEIN.clear(); 118 | FILEIN.seekg(0,ios::beg); 119 | 120 | for (cislo = 0; cislo < file_size; cislo++) { 121 | FILEIN >> real >> imaginary; 122 | (*data)[cislo] = sqrt(real*real + imaginary*imaginary); 123 | } 124 | } 125 | else { 126 | printf("\nFile is void of any content!\n"); 127 | error++; 128 | } 129 | } 130 | else { 131 | cout << "File not found -> " << filename << " <-" << endl; 132 | error++; 133 | } 134 | FILEIN.close(); 135 | return(error); 136 | } 137 | 138 | 139 | int Load_filters(char *filename, int *nFilters, int *filter_length, float **data){ 140 | float real, imaginary; 141 | int file_size, cislo, error, filter_size; 142 | error=0; 143 | 144 | ifstream FILEIN; 145 | FILEIN.open(filename,ios::in); 146 | if (!FILEIN.fail()){ 147 | error=0; 148 | file_size = File_size_row_signal(FILEIN); 149 | (*filter_length) = file_size/(*nFilters); 150 | filter_size = (*nFilters)*(*filter_length); 151 | printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size); 152 | 153 | if(file_size>0){ 154 | *data = (float*)malloc( filter_size*sizeof(float)); 155 | memset( (*data), 0.0, filter_size*sizeof(float)); 156 | 157 | if(*data==NULL){ 158 | printf("\nAllocation error!\n"); 159 | error++; 160 | } 161 | 162 | FILEIN.clear(); 163 | FILEIN.seekg(0,ios::beg); 164 | 165 | for (cislo=0; cislo < filter_size; cislo++) { 166 | FILEIN >> real >> imaginary; 167 | (*data)[cislo] = real; 168 | } 169 | } 170 | else { 171 | printf("\nFile is void of any content!\n"); 172 | error++; 173 | } 174 | } 175 | else { 176 | cout << "File not found -> " << filename << " <-" << endl; 177 | error++; 178 | } 179 | FILEIN.close(); 180 | return(error); 181 | } 182 | 183 | 184 | int GPU_CONV(float *h_input, float *h_output, float *h_filters_timedom, int signal_length, int filter_length, int nFilters, int nRuns, double *execution_time); 185 | 186 | 187 | int main(int argc, char* argv[]) { 188 | int nTimesamples; // input signal length 189 | int filter_length; // filter length 190 | int nFilters; // number of filters 191 | int nRuns; 192 | char input_type='0'; 193 | char input_filter_file[255]; 194 | char input_signal_file[255]; 195 | char output_signal_file[255]; 196 | 197 | char * pEnd; 198 | if (argc>2) { 199 | if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);} 200 | input_type=*argv[1]; 201 | } 202 | if (input_type == 'f' && argc==6) { 203 | if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);} 204 | sprintf(input_signal_file,"%s",argv[2]); 205 | if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);} 206 | sprintf(input_filter_file,"%s",argv[3]); 207 | if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);} 208 | sprintf(output_signal_file,"%s",argv[4]); 209 | nFilters = strtol(argv[5],&pEnd,10); 210 | nRuns = 1; 211 | } 212 | else if (input_type == 'r' && argc==6) { 213 | nTimesamples = strtol(argv[2],&pEnd,10); 214 | filter_length = strtol(argv[3],&pEnd,10); 215 | nFilters = strtol(argv[4],&pEnd,10); 216 | nRuns = strtol(argv[5],&pEnd,10); 217 | } 218 | else { 219 | printf("Parameters error!\n"); 220 | printf(" 1) Input type: 'r' or 'f' \n"); 221 | printf("----------------------------------\n"); 222 | printf("'f' - file input provided by user\n"); 223 | printf(" 2) Input signal file\n"); 224 | printf(" 3) Input filter file\n"); 225 | printf(" 4) Output signal file\n"); 226 | printf(" 5) number of filters\n"); 227 | printf(" Example: CONV.exe f signal.dat filter.dat output.dat 32\n"); 228 | printf("----------------------------------\n"); 229 | printf(" 'r' - random input generated by the code\n"); 230 | printf(" 2) Signal length in number of time samples\n"); 231 | printf(" 3) Filter length in samples\n"); 232 | printf(" 4) Number of templates\n"); 233 | printf(" 5) number of GPU kernel runs\n"); 234 | printf(" Example: CONV.exe r 2097152 193 32 10\n"); 235 | return 1; 236 | } 237 | 238 | if (DEBUG) { 239 | printf("Parameters:\n"); 240 | printf("Input signal and templates are "); 241 | if (input_type == 'r') { 242 | printf("randomly generated.\n"); 243 | printf("Signal length: %d samples\n", nTimesamples); 244 | printf("Filter length: %d samples\n", filter_length); 245 | printf("Number of filters: %d\n", nFilters); 246 | printf("nRuns: %d\n", nRuns); 247 | } 248 | if (input_type == 'f') { 249 | printf("read from file.\n"); 250 | printf("Input signal: %s\n", input_signal_file); 251 | printf("Input filter: %s\n", input_filter_file); 252 | printf("Output signal: %s\n", output_signal_file); 253 | printf("nFilters: %d\n", nFilters); 254 | printf("nRuns: %d\n", nRuns); 255 | printf("-----------------\n"); 256 | } 257 | } 258 | 259 | float *h_input; // input signal 260 | float *h_output; // output plane 261 | float *h_filters_padded; // filters in time-domain padded with zeroes 262 | float *h_filters; // filters in time-domain 263 | 264 | if (input_type == 'f') { 265 | int error=0; 266 | error += Load_signal(input_signal_file, &nTimesamples, &h_input); 267 | error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters); 268 | if( error>0 ){exit(1);} 269 | else if (VERBOSE) printf("File loaded\n"); 270 | } 271 | 272 | if (input_type == 'r') { 273 | h_input = (float *)malloc(nTimesamples*sizeof(float)); 274 | h_filters = (float *)malloc(filter_length*nFilters*sizeof(float)); 275 | srand(time(NULL)); 276 | Generate_signal(h_input, nTimesamples); 277 | Generate_templates(h_filters, filter_length, nFilters); 278 | if (VERBOSE) printf("Signal and filters generated\n"); 279 | } 280 | 281 | size_t filter_size_padded = nFilters*CONV_SIZE; 282 | h_filters_padded = (float*)malloc(filter_size_padded*sizeof(float)); 283 | Pad_templates(h_filters, h_filters_padded, filter_length, CONV_SIZE, nFilters); 284 | 285 | //----------------> Results 286 | double execution_time = 0; 287 | Performance_results CONV_cuFFT; 288 | CONV_cuFFT.Assign(nTimesamples, filter_length, nFilters, nRuns, 0, CONV_SIZE, nFilters, "CONV_cuFFT.dat", "cuFFT"); 289 | 290 | int offset = filter_length/2; // we assume that filter is centered around zero 291 | int useful_part_size = CONV_SIZE - filter_length + 1; 292 | int nConvolutions = (nTimesamples + useful_part_size - 1)/useful_part_size; 293 | if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);} 294 | if(DEBUG) { 295 | printf("offset=%d; useful_part_size=%d; nConvolutions=%d;\n", offset, useful_part_size, nConvolutions); 296 | } 297 | 298 | size_t output_size = nFilters*useful_part_size*nConvolutions; 299 | h_output = (float*)malloc(output_size*sizeof(float)); 300 | 301 | if (VERBOSE) printf("Convolution - cuFFT\n"); 302 | 303 | //----------------> GPU kernel 304 | int GPU_error = GPU_CONV(h_input, h_output, h_filters_padded, nTimesamples, filter_length, nFilters, nRuns, &execution_time); 305 | CONV_cuFFT.GPU_time = execution_time; 306 | if(VERBOSE) printf(" Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time); 307 | if(VERBOSE) {cout << " All parameters: "; CONV_cuFFT.Print();} 308 | if(WRITE && GPU_error==0) CONV_cuFFT.Save(); 309 | //----------------> GPU kernel 310 | 311 | if(CHECK){ 312 | double total_error, mean_error; 313 | printf("Checking results...\n"); 314 | Full_CONV_check(h_output, h_input, h_filters, nTimesamples, filter_length, useful_part_size, (filter_length>>1), CONV_SIZE, nConvolutions, nFilters, &total_error, &mean_error); 315 | //printf("Total error: %e; Mean error: %e\n", total_error, mean_error); 316 | } 317 | 318 | if (input_type == 'f') { 319 | Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file); 320 | } 321 | 322 | free(h_input); 323 | free(h_output); 324 | free(h_filters_padded); 325 | free(h_filters); 326 | 327 | cudaDeviceReset(); 328 | 329 | if (VERBOSE) printf("Finished!\n"); 330 | 331 | return (0); 332 | } 333 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/Makefile: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # CUDA_HOME are supposed to be on default position 3 | # and set it in your PATH .bashrc 4 | ############################################################### 5 | INC := -I${CUDA_HOME}/include 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft_static -lculibos -lcuda 7 | 8 | GCC = g++ 9 | NVCC = ${CUDA_HOME}/bin/nvcc 10 | 11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo 12 | 13 | GCC_OPTS =-O3 -Wall -Wextra $(INC) 14 | 15 | ANALYZE = CONV.exe 16 | 17 | ifdef reglim 18 | NVCCFLAGS += --maxrregcount=$(reglim) 19 | endif 20 | 21 | all: clean analyze 22 | 23 | analyze: CONV_R2R.o CONV-32bit_cuFFT.o Makefile 24 | $(NVCC) -o $(ANALYZE) CONV-32bit_cuFFT.o CONV_R2R.o $(LIB) $(NVCCFLAGS) 25 | 26 | CONV-32bit_cuFFT.o: timer.h utils_cuda.h 27 | $(NVCC) -c CONV-32bit_cuFFT.cu $(NVCCFLAGS) -dc -m64 28 | 29 | CONV_R2R.o: CONV_R2R.cpp 30 | $(GCC) -c CONV_R2R.cpp $(GCC_OPTS) 31 | 32 | clean: 33 | rm -f *.o *.~ $(ANALYZE) 34 | 35 | 36 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/When_cuFFT_wins_cuFFT.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for convsize in 1024 2048 4096 8192 16384; 4 | do 5 | echo "#define CONV_SIZE $convsize" > params.h 6 | 7 | rm CONV.exe 8 | make 9 | for tempsize in {64..2048..32} 10 | do 11 | for templates in 32; 12 | do 13 | ./CONV.exe r 2097152 $tempsize $templates 20 0 14 | done 15 | done 16 | done 17 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/benchmark_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_cuFFT.dat; 4 | 5 | ./benchmark_performance.sh; 6 | mv CONV_cuFFT.dat OLS_cuFFT_R2R_perf.dat; 7 | 8 | ./When_cuFFT_wins_cuFFT.sh 9 | mv CONV_cuFFT.dat OLS_cuFFT_R2R_whencuFFTwins.dat; -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/benchmark_performance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for convsize in 1024 2048 4096 8192 16384; 4 | do 5 | 6 | echo "#define CONV_SIZE $convsize" > params.h 7 | rm CONV.exe 8 | make 9 | for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073; 10 | do 11 | for templates in 2 4 8 11 16 32 51 64 96; 12 | do 13 | ./CONV.exe r 262144 $tempsize $templates 20 14 | ./CONV.exe r 524288 $tempsize $templates 20 15 | ./CONV.exe r 1048576 $tempsize $templates 20 16 | ./CONV.exe r 2097152 $tempsize $templates 20 17 | ./CONV.exe r 4194304 $tempsize $templates 20 18 | ./CONV.exe r 8388608 $tempsize $templates 20 19 | done 20 | done 21 | done 22 | 23 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/conv_check_R2R.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double max_error = 1.0e-4; 11 | 12 | void CPU_time_domain(float *h_input, float *h_CPU_output_timedomain, float *h_filters, int signal_length, int filter_length, int nFilters){ 13 | for(int f=0; f>1)); 23 | if(signal_pos>=0 && signal_posB) { 40 | div_error = A-B; 41 | if(B>10){ 42 | power = (int) log10(B); 43 | order = pow(10,power); 44 | div_error = div_error/order; 45 | } 46 | } 47 | else { 48 | div_error = B-A; 49 | if(A>10){ 50 | power = (int) log10(A); 51 | order = pow(10,power); 52 | div_error = div_error/order; 53 | } 54 | } 55 | 56 | if(div_error max_error ){ 73 | nErrors++; 74 | if(cislo<40){ 75 | printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d\n", error, CPU_result[pos], GPU_result[pos], x, y, (int) (x/useful_part_size)); 76 | cislo++; 77 | } 78 | } 79 | } 80 | } 81 | mean_error_l = total_error_l/(((double) dim_x)*((double) dim_y)); 82 | (*total_error) = total_error_l; 83 | (*mean_error) = mean_error_l; 84 | return(nErrors); 85 | } 86 | 87 | int Compare_data(float *CPU_result, float *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){ 88 | double total_error_l = 0, mean_error_l = 0; 89 | size_t nErrors = 0; 90 | int cislo = 0; 91 | float error; 92 | 93 | for(int y=0; y max_error ){ 105 | nErrors++; 106 | if(cislo<40){ 107 | printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU, GPU, x, y, (int) (x/useful_part_size), x%useful_part_size); 108 | cislo++; 109 | } 110 | } 111 | } 112 | } 113 | mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y)); 114 | (*total_error) = total_error_l; 115 | (*mean_error) = mean_error_l; 116 | return(nErrors); 117 | } 118 | 119 | 120 | void Full_CONV_check(float *GPU_result, float *h_input_real, float *h_filters, int signal_length, int filter_length, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, double *cumulative_error, double *mean_error){ 121 | float GPU_scale, CPU_scale; 122 | int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples; 123 | 124 | //----------------------- CPU time-domain 125 | size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters; 126 | float *h_CPU_output_timedomain; 127 | h_CPU_output_timedomain = (float *)malloc(output_size_timedomain*sizeof(float)); 128 | memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float)); 129 | 130 | printf("\n--> Time-domain convolution:"); 131 | CPU_time_domain(h_input_real, h_CPU_output_timedomain, h_filters, signal_length, filter_length, nFilters); 132 | 133 | printf("\n--> Comparison to CPU time-domain:\n"); 134 | GPU_scale = conv_length/2; 135 | CPU_scale = 1.0; 136 | GPU_offset = 0; 137 | CPU_offset = 0; 138 | GPU_dim_x = nConvolutions*useful_part_size; 139 | CPU_dim_x = (signal_length + filter_length - 1); 140 | nSamples = signal_length - offset; 141 | Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 142 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 143 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 144 | else printf("FAILED\n"); 145 | 146 | 147 | free(h_CPU_output_timedomain); 148 | //-------------------------------------------------< 149 | } 150 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/debug.h: -------------------------------------------------------------------------------- 1 | #define VERBOSE true 2 | #define DEBUG false 3 | #define WRITE true 4 | #define CHECK false 5 | 6 | #define DEVICEID 0 7 | 8 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/params.h: -------------------------------------------------------------------------------- 1 | #define CONV_SIZE 8192 2 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/results.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | class Performance_results{ 9 | public: 10 | double GPU_time; 11 | int nTimesamples; 12 | int template_length; 13 | int nTemplates; 14 | int nRuns; 15 | int reglim; 16 | int OaS_conv_size; 17 | int templates_per_block; 18 | char filename[200]; 19 | char kernel[10]; 20 | 21 | Performance_results() { 22 | GPU_time=0; 23 | } 24 | 25 | void Save(){ 26 | ofstream FILEOUT; 27 | FILEOUT.open (filename, std::ofstream::out | std::ofstream::app); 28 | FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 29 | FILEOUT.close(); 30 | } 31 | 32 | void Print(){ 33 | cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 34 | } 35 | 36 | void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){ 37 | nTimesamples = t_nTimesamples; 38 | template_length = t_template_length; 39 | nTemplates = t_nTemplates; 40 | nRuns = t_nRuns; 41 | reglim = t_reglim; 42 | OaS_conv_size = t_OaS_conv_size; 43 | templates_per_block = t_templates_per_block; 44 | sprintf(filename,"%s", t_filename); 45 | sprintf(kernel,"%s",t_kernel); 46 | } 47 | 48 | }; 49 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks/utils_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/CONV_R2R.cpp: -------------------------------------------------------------------------------- 1 | //******************************************************************************************** 2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 3 | //* Copyright (C) 2019 Adámek Karel 4 | //* 5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 6 | //******************************************************************************************** 7 | 8 | #include "debug.h" 9 | #include "params.h" 10 | #include "results.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "conv_check_R2R.h" 22 | 23 | 24 | void Generate_signal(float *h_input, int signal_length){ 25 | for(int f=0; f=template_size/2) { 47 | h_filters[t*convolution_size + f - template_size/2] = h_filters_time[t*template_size + f]; 48 | } 49 | else if(f0){ 109 | *data = (float*)malloc(file_size*sizeof(float)); 110 | memset( (*data), 0.0, file_size*sizeof(float)); 111 | if(*data==NULL){ 112 | printf("\nAllocation error!\n"); 113 | error++; 114 | } 115 | 116 | FILEIN.clear(); 117 | FILEIN.seekg(0,ios::beg); 118 | 119 | for (cislo = 0; cislo < file_size; cislo++) { 120 | FILEIN >> real >> imaginary; 121 | (*data)[cislo] = sqrt(real*real + imaginary*imaginary); 122 | } 123 | } 124 | else { 125 | printf("\nFile is void of any content!\n"); 126 | error++; 127 | } 128 | } 129 | else { 130 | cout << "File not found -> " << filename << " <-" << endl; 131 | error++; 132 | } 133 | FILEIN.close(); 134 | return(error); 135 | } 136 | 137 | 138 | int Load_filters(char *filename, int *nFilters, int *filter_length, float **data){ 139 | float real, imaginary; 140 | int file_size, cislo, error, filter_size; 141 | error=0; 142 | 143 | ifstream FILEIN; 144 | FILEIN.open(filename,ios::in); 145 | if (!FILEIN.fail()){ 146 | error=0; 147 | file_size = File_size_row_signal(FILEIN); 148 | (*filter_length) = file_size/(*nFilters); 149 | filter_size = (*nFilters)*(*filter_length); 150 | printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size); 151 | 152 | if(file_size>0){ 153 | *data = (float*)malloc( filter_size*sizeof(float)); 154 | memset( (*data), 0.0, filter_size*sizeof(float)); 155 | 156 | if(*data==NULL){ 157 | printf("\nAllocation error!\n"); 158 | error++; 159 | } 160 | 161 | FILEIN.clear(); 162 | FILEIN.seekg(0,ios::beg); 163 | 164 | for (cislo=0; cislo < filter_size; cislo++) { 165 | FILEIN >> real >> imaginary; 166 | (*data)[cislo] = real; 167 | } 168 | } 169 | else { 170 | printf("\nFile is void of any content!\n"); 171 | error++; 172 | } 173 | } 174 | else { 175 | cout << "File not found -> " << filename << " <-" << endl; 176 | error++; 177 | } 178 | FILEIN.close(); 179 | return(error); 180 | } 181 | 182 | 183 | int GPU_CONV(float *h_input, float *h_output, float *h_filters_timedom, int signal_length, int filter_length, int nFilters, int nRuns, float h, double *execution_time); 184 | 185 | 186 | int main(int argc, char* argv[]) { 187 | int nTimesamples; // input signal length 188 | int filter_length; // filter length 189 | int nFilters; // number of filters 190 | int nRuns; 191 | char input_type='0'; 192 | char input_filter_file[255]; 193 | char input_signal_file[255]; 194 | char output_signal_file[255]; 195 | 196 | char * pEnd; 197 | if (argc>2) { 198 | if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);} 199 | input_type=*argv[1]; 200 | } 201 | if (input_type == 'f' && argc==6) { 202 | if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);} 203 | sprintf(input_signal_file,"%s",argv[2]); 204 | if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);} 205 | sprintf(input_filter_file,"%s",argv[3]); 206 | if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);} 207 | sprintf(output_signal_file,"%s",argv[4]); 208 | nFilters = strtol(argv[5],&pEnd,10); 209 | nRuns = 1; 210 | } 211 | else if (input_type == 'r' && argc==6) { 212 | nTimesamples = strtol(argv[2],&pEnd,10); 213 | filter_length = strtol(argv[3],&pEnd,10); 214 | nFilters = strtol(argv[4],&pEnd,10); 215 | nRuns = strtol(argv[5],&pEnd,10); 216 | } 217 | else { 218 | printf("Parameters error!\n"); 219 | printf(" 1) Input type: 'r' or 'f' \n"); 220 | printf("----------------------------------\n"); 221 | printf("'f' - file input provided by user\n"); 222 | printf(" 2) Input signal file\n"); 223 | printf(" 3) Input filter file\n"); 224 | printf(" 4) Output signal file\n"); 225 | printf(" 5) number of filters\n"); 226 | printf(" Example: CONV.exe f signal.dat filter.dat output.dat 32\n"); 227 | printf("----------------------------------\n"); 228 | printf(" 'r' - random input generated by the code\n"); 229 | printf(" 2) Signal length in number of time samples\n"); 230 | printf(" 3) Filter length in samples\n"); 231 | printf(" 4) Number of templates\n"); 232 | printf(" 5) number of GPU kernel runs\n"); 233 | printf(" Example: CONV.exe r 2097152 193 32 10\n"); 234 | return 1; 235 | } 236 | 237 | if (DEBUG) { 238 | printf("Parameters:\n"); 239 | printf("Input signal and templates are "); 240 | if (input_type == 'r') { 241 | printf("randomly generated.\n"); 242 | printf("Signal length: %d samples\n", nTimesamples); 243 | printf("Filter length: %d samples\n", filter_length); 244 | printf("Number of filters: %d\n", nFilters); 245 | printf("nRuns: %d\n", nRuns); 246 | } 247 | if (input_type == 'f') { 248 | printf("read from file.\n"); 249 | printf("Input signal: %s\n", input_signal_file); 250 | printf("Input filter: %s\n", input_filter_file); 251 | printf("Output signal: %s\n", output_signal_file); 252 | printf("nFilters: %d\n", nFilters); 253 | printf("nRuns: %d\n", nRuns); 254 | printf("-----------------\n"); 255 | } 256 | } 257 | 258 | float *h_input; // input signal 259 | float *h_output; // output plane 260 | float *h_filters_padded; // filters in time-domain padded with zeroes 261 | float *h_filters; // filters in time-domain 262 | 263 | if (input_type == 'f') { 264 | int error=0; 265 | error += Load_signal(input_signal_file, &nTimesamples, &h_input); 266 | error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters); 267 | if( error>0 ){exit(1);} 268 | else if (VERBOSE) printf("File loaded\n"); 269 | } 270 | 271 | if (input_type == 'r') { 272 | h_input = (float *)malloc(nTimesamples*sizeof(float)); 273 | h_filters = (float *)malloc(filter_length*nFilters*sizeof(float)); 274 | srand(time(NULL)); 275 | Generate_signal(h_input, nTimesamples); 276 | Generate_templates(h_filters, filter_length, nFilters); 277 | if (VERBOSE) printf("Signal and filters generated\n"); 278 | } 279 | 280 | size_t filter_size_padded = nFilters*CONV_SIZE; 281 | h_filters_padded = (float*)malloc(filter_size_padded*sizeof(float)); 282 | Pad_templates(h_filters, h_filters_padded, filter_length, CONV_SIZE, nFilters); 283 | 284 | //----------------> Results 285 | double execution_time = 0; 286 | Performance_results CONV_cuFFT; 287 | CONV_cuFFT.Assign(nTimesamples, filter_length, nFilters, nRuns, 0, CONV_SIZE, nFilters, "CONV_cuFFT.dat", "cuFFT"); 288 | 289 | int offset = filter_length/2; // we assume that filter is centered around zero 290 | int useful_part_size = CONV_SIZE - filter_length + 1; 291 | int nConvolutions = (nTimesamples + useful_part_size - 1)/useful_part_size; 292 | 293 | if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);} 294 | if(DEBUG) { 295 | printf("offset=%d; useful_part_size=%d; nConvolutions=%d;\n", offset, useful_part_size, nConvolutions); 296 | } 297 | 298 | size_t output_size = nFilters*useful_part_size*nConvolutions; 299 | h_output = (float*)malloc(output_size*sizeof(float)); 300 | 301 | if (VERBOSE) printf("Convolution - cuFFT\n"); 302 | 303 | //----------------> GPU kernel 304 | float h = 20.0; 305 | int GPU_error = GPU_CONV(h_input, h_output, h_filters_padded, nTimesamples, filter_length, nFilters, nRuns, h, &execution_time); 306 | CONV_cuFFT.GPU_time = execution_time; 307 | if(VERBOSE) printf(" Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time); 308 | if(VERBOSE) {cout << " All parameters: "; CONV_cuFFT.Print();} 309 | if(WRITE && GPU_error==0) CONV_cuFFT.Save(); 310 | //----------------> GPU kernel 311 | 312 | if(CHECK){ 313 | double total_error, mean_error; 314 | printf("Checking results...\n"); 315 | Full_CONV_check(h_output, h_input, h_filters, nTimesamples, filter_length, useful_part_size, (filter_length>>1), CONV_SIZE, nConvolutions, nFilters, h, &total_error, &mean_error); 316 | //printf("Total error: %e; Mean error: %e\n", total_error, mean_error); 317 | } 318 | 319 | if (input_type == 'f') { 320 | Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file); 321 | } 322 | 323 | free(h_input); 324 | free(h_output); 325 | free(h_filters_padded); 326 | free(h_filters); 327 | 328 | cudaDeviceReset(); 329 | 330 | if (VERBOSE) printf("Finished!\n"); 331 | 332 | return (0); 333 | } 334 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/Makefile: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # CUDA_HOME are supposed to be on default position 3 | # and set it in your PATH .bashrc 4 | ############################################################### 5 | INC := -I${CUDA_HOME}/include 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft_static -lculibos -lcuda 7 | 8 | GCC = g++ 9 | NVCC = ${CUDA_HOME}/bin/nvcc 10 | 11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo 12 | 13 | GCC_OPTS =-O3 -Wall -Wextra $(INC) 14 | 15 | ANALYZE = CONV.exe 16 | 17 | ifdef reglim 18 | NVCCFLAGS += --maxrregcount=$(reglim) 19 | endif 20 | 21 | all: clean analyze 22 | 23 | analyze: CONV_R2R.o CONV-32bit_cuFFT.o Makefile 24 | $(NVCC) -o $(ANALYZE) CONV-32bit_cuFFT.o CONV_R2R.o $(LIB) $(NVCCFLAGS) 25 | 26 | CONV-32bit_cuFFT.o: timer.h utils_cuda.h 27 | $(NVCC) -c CONV-32bit_cuFFT.cu $(NVCCFLAGS) -dc -m64 28 | 29 | CONV_R2R.o: CONV_R2R.cpp 30 | $(GCC) -c CONV_R2R.cpp $(GCC_OPTS) 31 | 32 | clean: 33 | rm -f *.o *.~ $(ANALYZE) 34 | 35 | 36 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/When_cuFFT_wins_cuFFT.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for convsize in 1024 2048 4096 8192 16384; 4 | do 5 | echo "#define CONV_SIZE $convsize" > params.h 6 | 7 | rm CONV.exe 8 | make 9 | for tempsize in {64..4096..32} 10 | do 11 | for templates in 32; 12 | do 13 | ./CONV.exe r 2097152 $tempsize $templates 20 0 14 | done 15 | done 16 | done 17 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/benchmark_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_cuFFT.dat; 4 | 5 | ./benchmark_performance.sh; 6 | mv CONV_cuFFT.dat OLS_cuFFT_R2R_pp_perf.dat; 7 | 8 | ./When_cuFFT_wins_cuFFT.sh 9 | mv CONV_cuFFT.dat OLS_cuFFT_R2R_pp_whencuFFTwins.dat; -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/benchmark_performance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for convsize in 1024 2048 4096 8192 16384; 4 | do 5 | 6 | echo "#define CONV_SIZE $convsize" > params.h 7 | rm CONV.exe 8 | make 9 | for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073; 10 | do 11 | for templates in 2 4 8 11 16 32 51 64 96; 12 | do 13 | ./CONV.exe r 262144 $tempsize $templates 20 14 | ./CONV.exe r 524288 $tempsize $templates 20 15 | ./CONV.exe r 1048576 $tempsize $templates 20 16 | ./CONV.exe r 2097152 $tempsize $templates 20 17 | ./CONV.exe r 4194304 $tempsize $templates 20 18 | ./CONV.exe r 8388608 $tempsize $templates 20 19 | done 20 | done 21 | done 22 | 23 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/conv_check_R2R.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double max_error = 1.0e-4; 11 | 12 | void CPU_time_domain(float *h_input, float *h_CPU_output_timedomain, float *h_filters, int signal_length, int filter_length, int nFilters){ 13 | for(int f=0; f>1)); 23 | if(signal_pos>=0 && signal_posB) { 68 | div_error = A-B; 69 | if(B>10){ 70 | power = (int) log10(B); 71 | order = pow(10,power); 72 | div_error = div_error/order; 73 | } 74 | } 75 | else { 76 | div_error = B-A; 77 | if(A>10){ 78 | power = (int) log10(A); 79 | order = pow(10,power); 80 | div_error = div_error/order; 81 | } 82 | } 83 | 84 | if(div_error max_error ){ 101 | nErrors++; 102 | if(cislo<40){ 103 | printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; ratio=%f;\n", error, CPU_result[pos], GPU_result[pos], x, y, (int) (x/useful_part_size), CPU_result[pos]/GPU_result[pos]); 104 | cislo++; 105 | } 106 | } 107 | } 108 | } 109 | mean_error_l = total_error_l/(((double) dim_x)*((double) dim_y)); 110 | (*total_error) = total_error_l; 111 | (*mean_error) = mean_error_l; 112 | return(nErrors); 113 | } 114 | 115 | 116 | int Compare_data(float *CPU_result, float *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){ 117 | double total_error_l = 0, mean_error_l = 0; 118 | size_t nErrors = 0; 119 | int cislo = 0; 120 | float error; 121 | 122 | for(int y=0; y max_error ){ 134 | nErrors++; 135 | if(cislo<40){ 136 | printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU, GPU, x, y, (int) (x/useful_part_size), x%useful_part_size); 137 | cislo++; 138 | } 139 | } 140 | } 141 | } 142 | mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y)); 143 | (*total_error) = total_error_l; 144 | (*mean_error) = mean_error_l; 145 | return(nErrors); 146 | } 147 | 148 | 149 | 150 | void Full_CONV_check(float *GPU_result, float *h_input_real, float *h_filters, int signal_length, int filter_length, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, float h, double *cumulative_error, double *mean_error){ 151 | 152 | size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters; 153 | float *h_CPU_output_timedomain; 154 | float *h_CPU_postprocessed; 155 | h_CPU_output_timedomain = (float *)malloc(output_size_timedomain*sizeof(float)); 156 | h_CPU_postprocessed = (float *)malloc(output_size_timedomain*sizeof(float)); 157 | memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float)); 158 | memset(h_CPU_postprocessed, 0.0, output_size_timedomain*sizeof(float)); 159 | 160 | printf("\n--> Time-domain convolution:"); 161 | CPU_time_domain(h_input_real, h_CPU_output_timedomain, h_filters, signal_length, filter_length, nFilters); 162 | CPU_postprocess(h_CPU_postprocessed, h_CPU_output_timedomain, (signal_length + filter_length - 1), nFilters, h); 163 | 164 | float GPU_scale, CPU_scale; 165 | int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples; 166 | #ifdef POST_PROCESS 167 | 168 | printf("\n--> Comparison to CPU time-domain with post-processing:\n"); 169 | GPU_scale = conv_length/2; 170 | CPU_scale = 1.0; 171 | GPU_offset = 0; 172 | CPU_offset = 0; 173 | GPU_dim_x = nConvolutions*useful_part_size; 174 | CPU_dim_x = (signal_length + filter_length - 1); 175 | nSamples = signal_length - offset; 176 | Compare_data(h_CPU_postprocessed, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 177 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 178 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 179 | else printf("FAILED\n"); 180 | 181 | #else 182 | 183 | printf("\n--> Comparison to CPU time-domain:\n"); 184 | GPU_scale = conv_length/2; 185 | CPU_scale = 1.0; 186 | GPU_offset = 0; 187 | CPU_offset = 0; 188 | GPU_dim_x = nConvolutions*useful_part_size; 189 | CPU_dim_x = (signal_length + filter_length - 1); 190 | nSamples = signal_length-offset; 191 | Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 192 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 193 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 194 | else printf("FAILED\n"); 195 | 196 | #endif 197 | 198 | free(h_CPU_postprocessed); 199 | free(h_CPU_output_timedomain); 200 | } 201 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/debug.h: -------------------------------------------------------------------------------- 1 | #define VERBOSE true 2 | #define DEBUG false 3 | #define WRITE true 4 | #define CHECK false 5 | 6 | #define DEVICEID 0 7 | #define POST_PROCESS 8 | 9 | 10 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/params.h: -------------------------------------------------------------------------------- 1 | #define CONV_SIZE 8192 2 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/results.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | class Performance_results{ 9 | public: 10 | double GPU_time; 11 | int nTimesamples; 12 | int template_length; 13 | int nTemplates; 14 | int nRuns; 15 | int reglim; 16 | int OaS_conv_size; 17 | int templates_per_block; 18 | char filename[200]; 19 | char kernel[10]; 20 | 21 | Performance_results() { 22 | GPU_time=0; 23 | } 24 | 25 | void Save(){ 26 | ofstream FILEOUT; 27 | FILEOUT.open (filename, std::ofstream::out | std::ofstream::app); 28 | FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 29 | FILEOUT.close(); 30 | } 31 | 32 | void Print(){ 33 | cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 34 | } 35 | 36 | void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){ 37 | nTimesamples = t_nTimesamples; 38 | template_length = t_template_length; 39 | nTemplates = t_nTemplates; 40 | nRuns = t_nRuns; 41 | reglim = t_reglim; 42 | OaS_conv_size = t_OaS_conv_size; 43 | templates_per_block = t_templates_per_block; 44 | sprintf(filename,"%s", t_filename); 45 | sprintf(kernel,"%s",t_kernel); 46 | } 47 | 48 | }; 49 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_cuFFT_callbacks_pp/utils_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/CONV_SM_OLS_R2R.cpp: -------------------------------------------------------------------------------- 1 | //******************************************************************************************** 2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 3 | //* Copyright (C) 2019 Adámek Karel 4 | //* 5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 6 | //******************************************************************************************** 7 | 8 | #include "debug.h" 9 | #include "params.h" 10 | #include "results.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "conv_check_R2R.h" 22 | 23 | void Generate_signal(float *h_input, int signal_length){ 24 | for(int f=0; f=corrected_filter_length/2) { 61 | h_filters_padded[t*convolution_size + f - corrected_filter_length/2] = tmp_filter[f]; 62 | } 63 | else if(f0){ 123 | *data = (float*)malloc(file_size*sizeof(float)); 124 | memset( (*data), 0.0, file_size*sizeof(float)); 125 | if(*data==NULL){ 126 | printf("\nAllocation error!\n"); 127 | error++; 128 | } 129 | 130 | FILEIN.clear(); 131 | FILEIN.seekg(0,ios::beg); 132 | 133 | for (cislo = 0; cislo < file_size; cislo++) { 134 | FILEIN >> real >> imaginary; 135 | (*data)[cislo] = sqrt(real*real + imaginary*imaginary); 136 | } 137 | } 138 | else { 139 | printf("\nFile is void of any content!\n"); 140 | error++; 141 | } 142 | } 143 | else { 144 | cout << "File not found -> " << filename << " <-" << endl; 145 | error++; 146 | } 147 | FILEIN.close(); 148 | return(error); 149 | } 150 | 151 | 152 | int Load_filters(char *filename, int *nFilters, int *filter_length, float **data){ 153 | float real, imaginary; 154 | int file_size, cislo, error, filter_size; 155 | error=0; 156 | 157 | ifstream FILEIN; 158 | FILEIN.open(filename,ios::in); 159 | if (!FILEIN.fail()){ 160 | error=0; 161 | file_size = File_size_row_signal(FILEIN); 162 | (*filter_length) = file_size/(*nFilters); 163 | filter_size = (*nFilters)*(*filter_length); 164 | printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size); 165 | 166 | if(file_size>0){ 167 | *data = (float*)malloc( filter_size*sizeof(float)); 168 | memset( (*data), 0.0, filter_size*sizeof(float)); 169 | 170 | if(*data==NULL){ 171 | printf("\nAllocation error!\n"); 172 | error++; 173 | } 174 | 175 | FILEIN.clear(); 176 | FILEIN.seekg(0,ios::beg); 177 | 178 | for (cislo=0; cislo < filter_size; cislo++) { 179 | FILEIN >> real >> imaginary; 180 | (*data)[cislo] = real; 181 | } 182 | } 183 | else { 184 | printf("\nFile is void of any content!\n"); 185 | error++; 186 | } 187 | } 188 | else { 189 | cout << "File not found -> " << filename << " <-" << endl; 190 | error++; 191 | } 192 | FILEIN.close(); 193 | return(error); 194 | } 195 | 196 | 197 | int GPU_convolution_OLS_customFFT(float *h_input_signal, float *h_output_plane, float *h_filters, int signal_length, int convolution_length, int filter_length, int past_filter_samples, int nFilters, int nRuns, int device, double *execution_time); 198 | 199 | 200 | int main(int argc, char* argv[]) { 201 | int signal_length; 202 | int filter_length; 203 | int past_filter_samples; 204 | int convolution_length; 205 | int nFilters; 206 | int nRuns; 207 | char input_type='0'; 208 | char input_filter_file[255]; 209 | char input_signal_file[255]; 210 | char output_signal_file[255]; 211 | 212 | char * pEnd; 213 | if (argc>2) { 214 | if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);} 215 | input_type=*argv[1]; 216 | } 217 | if (input_type == 'f' && argc==8) { 218 | if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);} 219 | sprintf(input_signal_file,"%s",argv[2]); 220 | if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);} 221 | sprintf(input_filter_file,"%s",argv[3]); 222 | if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);} 223 | sprintf(output_signal_file,"%s",argv[4]); 224 | 225 | convolution_length = strtol(argv[5],&pEnd,10); 226 | nFilters = strtol(argv[6],&pEnd,10); 227 | past_filter_samples = strtol(argv[7],&pEnd,10); 228 | nRuns = 1; 229 | } 230 | else if (input_type == 'r' && argc==8) { 231 | signal_length = strtol(argv[2],&pEnd,10); 232 | filter_length = strtol(argv[3],&pEnd,10); 233 | past_filter_samples = strtol(argv[4],&pEnd,10); 234 | convolution_length = strtol(argv[5],&pEnd,10); 235 | nFilters = strtol(argv[6],&pEnd,10); 236 | 237 | nRuns = strtol(argv[7],&pEnd,10); 238 | } 239 | else { 240 | printf("Parameters error!\n"); 241 | printf(" 1) Input type: 'r' or 'f' \n"); 242 | printf("----------------------------------\n"); 243 | printf("Parameters if input type is 'f' - file input provided by user\n"); 244 | printf(" 2) Input signal file\n"); 245 | printf(" 3) Input filter file\n"); 246 | printf(" 4) Output signal file\n"); 247 | printf(" 5) Convolution length in samples\n"); 248 | printf(" 6) number of filters\n"); 249 | printf(" 7) number of past samples in the filter.\n"); 250 | printf(" for past filter (causal) it is (filter_length - 1)\n"); 251 | printf(" for odd centered filter it is floor(filter_length/2)\n"); 252 | printf(" for future filter it is 0\n"); 253 | printf(" Example: CONV.exe f signal.dat filter.dat output.dat 2048 32 192\n"); 254 | printf("----------------------------------\n"); 255 | printf("Parameters if input type is 'r' - random input generated by the code\n"); 256 | printf(" 2) Signal length in number of time samples\n"); 257 | printf(" 3) Filter length in samples\n"); 258 | printf(" 4) number of past samples in the filter.\n"); 259 | printf(" for past filter (causal) it is (filter_length - 1)\n"); 260 | printf(" for odd centered filter it is floor(filter_length/2)\n"); 261 | printf(" for future filter it is 0\n"); 262 | printf(" 5) Convolution length in samples\n"); 263 | printf(" 6) Number of filters\n"); 264 | printf(" 7) number of GPU kernel runs\n"); 265 | printf(" Example: CONV.exe r 2097152 193 192 2048 32 10\n"); 266 | return 1; 267 | } 268 | 269 | if (DEBUG) { 270 | printf("Parameters:\n"); 271 | printf("Input signal and filters are "); 272 | if (input_type == 'r') { 273 | printf("randomly generated.\n"); 274 | printf("Signal length: %d samples\n", signal_length); 275 | printf("Filter length: %d samples\n", filter_length); 276 | printf("# of past samples: %d samples\n", past_filter_samples); 277 | printf("Convolution length: %d samples\n", convolution_length); 278 | printf("Number of filters: %d\n", nFilters); 279 | printf("nRuns: %d\n", nRuns); 280 | } 281 | if (input_type == 'f') { 282 | printf("read from file.\n"); 283 | printf("Input signal: %s\n", input_signal_file); 284 | printf("Input filter: %s\n", input_filter_file); 285 | printf("Output signal: %s\n", output_signal_file); 286 | printf("Convolution length: %d samples\n", convolution_length); 287 | printf("nFilters: %d\n", nFilters); 288 | printf("# of past samples: %d samples\n", past_filter_samples); 289 | printf("nRuns: %d\n", nRuns); 290 | printf("-----------------\n"); 291 | } 292 | } 293 | 294 | float *h_input; 295 | float *h_output; 296 | float *h_filters; // filters in time-domain 297 | float *h_filters_padded; // filters in time-domain padded with zeroes 298 | 299 | if (input_type == 'f') { 300 | int error=0; 301 | error += Load_signal(input_signal_file, &signal_length, &h_input); 302 | error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters); 303 | if( error>0 ){exit(1);} 304 | else if (VERBOSE) printf("File loaded\n"); 305 | } 306 | 307 | //----------------> Results 308 | double execution_time = 0; 309 | Performance_results CONV_cuFFT; 310 | CONV_cuFFT.Assign(signal_length, filter_length, nFilters, nRuns, 0, convolution_length, nFilters, "CONV_R2R_kFFT.dat", "one"); 311 | 312 | int corrected_filter_length; 313 | if( filter_length%2==0 ) corrected_filter_length = filter_length + 1; 314 | else corrected_filter_length = filter_length; 315 | int useful_part_size = convolution_length - corrected_filter_length + 1; 316 | useful_part_size = 2*(useful_part_size>>1); 317 | int nConvolutions = (signal_length + useful_part_size - 1)/useful_part_size; 318 | if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);} 319 | 320 | 321 | if (input_type == 'r') { 322 | h_input = (float *)malloc(signal_length*sizeof(float)); 323 | h_filters = (float *)malloc(filter_length*nFilters*sizeof(float)); 324 | srand(time(NULL)); 325 | Generate_signal(h_input, signal_length); 326 | Generate_random_filter(h_filters, filter_length, nFilters); 327 | if (VERBOSE) printf("Signal and filters generated\n"); 328 | } 329 | 330 | size_t filter_size_padded = nFilters*convolution_length; 331 | h_filters_padded = (float*)malloc(filter_size_padded*sizeof(float)); 332 | Pad_templates(h_filters, h_filters_padded, filter_length, corrected_filter_length, convolution_length, nFilters); 333 | 334 | size_t output_size = nFilters*useful_part_size*nConvolutions; 335 | h_output = (float*)malloc(output_size*sizeof(float)); 336 | 337 | if (VERBOSE) printf("Convolution - kFFT\n"); 338 | 339 | //----------------> GPU kernel 340 | GPU_convolution_OLS_customFFT(h_input, h_output, h_filters_padded, signal_length, convolution_length, corrected_filter_length, past_filter_samples, nFilters, nRuns, DEVICEID, &execution_time); 341 | CONV_cuFFT.GPU_time = execution_time; 342 | if(VERBOSE) printf(" Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time); 343 | if(VERBOSE) {cout << " All parameters: "; CONV_cuFFT.Print();} 344 | if(WRITE) CONV_cuFFT.Save(); 345 | //----------------> GPU kernel 346 | 347 | if(CHECK){ 348 | double total_error, mean_error; 349 | printf("Checking results...\n"); 350 | Full_CONV_check(h_output, h_input, h_filters, signal_length, filter_length, past_filter_samples, useful_part_size, (filter_length>>1), convolution_length, nConvolutions, nFilters, &total_error, &mean_error); 351 | //printf("Total error: %e; Mean error: %e\n", total_error, mean_error); 352 | } 353 | 354 | if (input_type == 'f') { 355 | Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file); 356 | } 357 | 358 | free(h_input); 359 | free(h_output); 360 | free(h_filters_padded); 361 | free(h_filters); 362 | 363 | cudaDeviceReset(); 364 | 365 | if (VERBOSE) printf("Finished!\n"); 366 | 367 | return (0); 368 | } 369 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/Makefile: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # CUDA_HOME are supposed to be on default position 3 | # and set it in your PATH .bashrc 4 | ############################################################### 5 | INC := -I${CUDA_HOME}/include 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft -lcuda 7 | 8 | GCC = g++ 9 | NVCC = ${CUDA_HOME}/bin/nvcc 10 | 11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo 12 | 13 | GCC_OPTS =-O3 -Wall -Wextra $(INC) 14 | 15 | ANALYZE = CONV.exe 16 | 17 | 18 | ifdef reglim 19 | NVCCFLAGS += --maxrregcount=$(reglim) 20 | endif 21 | 22 | all: clean onefilter 23 | 24 | onefilter: CONV_SM_OLS_R2R.o CONV-32bit_customFFT.o Makefile 25 | $(NVCC) -o CONV.exe CONV_SM_OLS_R2R.o CONV-32bit_customFFT.o $(LIB) $(NVCCFLAGS) 26 | 27 | CONV-32bit_customFFT.o: timer.h utils_cuda.h 28 | $(NVCC) -c CONV-32bit_customFFT.cu $(NVCCFLAGS) 29 | 30 | CONV_SM_OLS_R2R.o: CONV_SM_OLS_R2R.cpp 31 | $(GCC) -c CONV_SM_OLS_R2R.cpp $(GCC_OPTS) 32 | 33 | clean: 34 | rm -f *.o *.~ CONV.exe 35 | 36 | 37 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/When_cuFFT_wins.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_1f.exe; 4 | rm CONV_2f.exe; 5 | rm *.o; 6 | make reglim=0 > /dev/null 2>&1 7 | for convlength in 256 512 1024 2048 4096; 8 | do 9 | for tempsize in {64..4096..32} 10 | do 11 | ./CONV.exe r 2097152 $tempsize $convlength 32 20 12 | done 13 | done 14 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/benchmark_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_R2R_kFFT.dat; 4 | 5 | ./benchmark_performance.sh; 6 | mv CONV_R2R_kFFT.dat OLS_SM_R2R_perf.dat; 7 | 8 | ./When_cuFFT_wins.sh 9 | mv CONV_R2R_kFFT.dat OLS_SM_R2R_whencuFFTwins.dat; -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/benchmark_performance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | rm CONV.exe; 5 | rm *.o; 6 | make reglim=$reg > /dev/null 2>&1 7 | for convlength in 512 1024 2048 4096 8 | do 9 | for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073; 10 | do 11 | for templates in 2 4 8 16 32 64 96; 12 | do 13 | ./CONV.exe r 262144 $tempsize $convlength $templates 20 14 | ./CONV.exe r 524288 $tempsize $convlength $templates 20 15 | ./CONV.exe r 1048576 $tempsize $convlength $templates 20 16 | ./CONV.exe r 2097152 $tempsize $convlength $templates 20 17 | ./CONV.exe r 4194304 $tempsize $convlength $templates 20 18 | ./CONV.exe r 8388608 $tempsize $convlength $templates 20 19 | done 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/conv_check_R2R.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double max_error = 1.0e-4; 11 | 12 | void CPU_time_domain(float *h_input, float *h_CPU_output_timedomain, float *h_filters, int signal_length, int filter_length, int past_filter_samples, int nFilters){ 13 | for(int f=0; f=0 && signal_posB) { 40 | div_error = A-B; 41 | if(B>10){ 42 | power = (int) log10(B); 43 | order = pow(10,power); 44 | div_error = div_error/order; 45 | } 46 | } 47 | else { 48 | div_error = B-A; 49 | if(A>10){ 50 | power = (int) log10(A); 51 | order = pow(10,power); 52 | div_error = div_error/order; 53 | } 54 | } 55 | 56 | if(div_error max_error ){ 73 | nErrors++; 74 | if(cislo<40){ 75 | printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d\n", error, CPU_result[pos], GPU_result[pos], x, y, (int) (x/useful_part_size)); 76 | cislo++; 77 | } 78 | } 79 | } 80 | } 81 | mean_error_l = total_error_l/(((double) dim_x)*((double) dim_y)); 82 | (*total_error) = total_error_l; 83 | (*mean_error) = mean_error_l; 84 | return(nErrors); 85 | } 86 | 87 | int Compare_data(float *CPU_result, float *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){ 88 | double total_error_l = 0, mean_error_l = 0; 89 | size_t nErrors = 0; 90 | int cislo = 0; 91 | float error; 92 | 93 | for(int y=0; y max_error ){ 105 | nErrors++; 106 | if(cislo<40){ 107 | printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU, GPU, x, y, (int) (x/useful_part_size), x%useful_part_size); 108 | cislo++; 109 | } 110 | } 111 | } 112 | } 113 | mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y)); 114 | (*total_error) = total_error_l; 115 | (*mean_error) = mean_error_l; 116 | return(nErrors); 117 | } 118 | 119 | 120 | void Full_CONV_check(float *GPU_result, float *h_input_real, float *h_filters, int signal_length, int filter_length, int past_filter_samples, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, double *cumulative_error, double *mean_error){ 121 | float GPU_scale, CPU_scale; 122 | int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples; 123 | 124 | 125 | //----------------------- CPU time-domain 126 | size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters; 127 | float *h_CPU_output_timedomain; 128 | h_CPU_output_timedomain = (float *)malloc(output_size_timedomain*sizeof(float)); 129 | memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float)); 130 | 131 | CPU_time_domain(h_input_real, h_CPU_output_timedomain, h_filters, signal_length, filter_length, past_filter_samples, nFilters); 132 | 133 | printf("\n--> Comparison to CPU time-domain:\n"); 134 | GPU_scale = conv_length/2; 135 | GPU_scale = 1.0; 136 | CPU_scale = 1.0; 137 | GPU_offset = 0; 138 | CPU_offset = 0; 139 | GPU_dim_x = nConvolutions*useful_part_size; 140 | CPU_dim_x = (signal_length + filter_length - 1); 141 | nSamples = signal_length - offset; 142 | Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 143 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 144 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 145 | else printf("FAILED\n"); 146 | 147 | 148 | free(h_CPU_output_timedomain); 149 | //-------------------------------------------------< 150 | } 151 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/debug.h: -------------------------------------------------------------------------------- 1 | #define VERBOSE true 2 | #define DEBUG false 3 | #define CHECK true 4 | #define WRITE true 5 | 6 | #define DEVICEID 0 7 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/params.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAdamek/GPU_Overlap-and-save_convolution/f2ee0e3323a3e3f6f1bb96e864ad7fce5c9234ec/GPU_OLS_R2R_sharedmemory/params.h -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/results.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | class Performance_results{ 9 | public: 10 | double GPU_time; 11 | int nTimesamples; 12 | int template_length; 13 | int nTemplates; 14 | int nRuns; 15 | int reglim; 16 | int OaS_conv_size; 17 | int templates_per_block; 18 | char filename[200]; 19 | char kernel[10]; 20 | 21 | Performance_results() { 22 | GPU_time=0; 23 | } 24 | 25 | void Save(){ 26 | ofstream FILEOUT; 27 | FILEOUT.open (filename, std::ofstream::out | std::ofstream::app); 28 | FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 29 | FILEOUT.close(); 30 | } 31 | 32 | void Print(){ 33 | cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 34 | } 35 | 36 | void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){ 37 | nTimesamples = t_nTimesamples; 38 | template_length = t_template_length; 39 | nTemplates = t_nTemplates; 40 | nRuns = t_nRuns; 41 | reglim = t_reglim; 42 | OaS_conv_size = t_OaS_conv_size; 43 | templates_per_block = t_templates_per_block; 44 | sprintf(filename,"%s", t_filename); 45 | sprintf(kernel,"%s",t_kernel); 46 | } 47 | 48 | }; 49 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory/utils_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/CONV_SM_OLS_R2R.cpp: -------------------------------------------------------------------------------- 1 | //******************************************************************************************** 2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 3 | //* Copyright (C) 2019 Adámek Karel 4 | //* 5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 6 | //******************************************************************************************** 7 | 8 | #include "debug.h" 9 | #include "params.h" 10 | #include "results.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "conv_check_R2R.h" 22 | 23 | void Generate_signal(float *h_input, int signal_length){ 24 | for(int f=0; f=corrected_filter_length/2) { 61 | h_filters_padded[t*convolution_size + f - corrected_filter_length/2] = tmp_filter[f]; 62 | } 63 | else if(f0){ 123 | *data = (float*)malloc(file_size*sizeof(float)); 124 | memset( (*data), 0.0, file_size*sizeof(float)); 125 | if(*data==NULL){ 126 | printf("\nAllocation error!\n"); 127 | error++; 128 | } 129 | 130 | FILEIN.clear(); 131 | FILEIN.seekg(0,ios::beg); 132 | 133 | for (cislo = 0; cislo < file_size; cislo++) { 134 | FILEIN >> real >> imaginary; 135 | (*data)[cislo] = sqrt(real*real + imaginary*imaginary); 136 | } 137 | } 138 | else { 139 | printf("\nFile is void of any content!\n"); 140 | error++; 141 | } 142 | } 143 | else { 144 | cout << "File not found -> " << filename << " <-" << endl; 145 | error++; 146 | } 147 | FILEIN.close(); 148 | return(error); 149 | } 150 | 151 | 152 | int Load_filters(char *filename, int *nFilters, int *filter_length, float **data){ 153 | float real, imaginary; 154 | int file_size, cislo, error, filter_size; 155 | error=0; 156 | 157 | ifstream FILEIN; 158 | FILEIN.open(filename,ios::in); 159 | if (!FILEIN.fail()){ 160 | error=0; 161 | file_size = File_size_row_signal(FILEIN); 162 | (*filter_length) = file_size/(*nFilters); 163 | filter_size = (*nFilters)*(*filter_length); 164 | printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size); 165 | 166 | if(file_size>0){ 167 | *data = (float*)malloc( filter_size*sizeof(float)); 168 | memset( (*data), 0.0, filter_size*sizeof(float)); 169 | 170 | if(*data==NULL){ 171 | printf("\nAllocation error!\n"); 172 | error++; 173 | } 174 | 175 | FILEIN.clear(); 176 | FILEIN.seekg(0,ios::beg); 177 | 178 | for (cislo=0; cislo < filter_size; cislo++) { 179 | FILEIN >> real >> imaginary; 180 | (*data)[cislo] = real; 181 | } 182 | } 183 | else { 184 | printf("\nFile is void of any content!\n"); 185 | error++; 186 | } 187 | } 188 | else { 189 | cout << "File not found -> " << filename << " <-" << endl; 190 | error++; 191 | } 192 | FILEIN.close(); 193 | return(error); 194 | } 195 | 196 | 197 | int GPU_convolution_OLS_customFFT(float *h_input_signal, float *h_output_plane, float *h_filters, int signal_length, int convolution_length, int filter_length, int past_filter_samples, int nFilters, int nRuns, float h, int offset_modifier, int device, double *execution_time); 198 | 199 | 200 | int main(int argc, char* argv[]) { 201 | int signal_length; 202 | int filter_length; 203 | int past_filter_samples; 204 | int convolution_length; 205 | int nFilters; 206 | int nRuns; 207 | char input_type='0'; 208 | char input_filter_file[255]; 209 | char input_signal_file[255]; 210 | char output_signal_file[255]; 211 | 212 | char * pEnd; 213 | if (argc>2) { 214 | if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);} 215 | input_type=*argv[1]; 216 | } 217 | if (input_type == 'f' && argc==8) { 218 | if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);} 219 | sprintf(input_signal_file,"%s",argv[2]); 220 | if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);} 221 | sprintf(input_filter_file,"%s",argv[3]); 222 | if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);} 223 | sprintf(output_signal_file,"%s",argv[4]); 224 | 225 | convolution_length = strtol(argv[5],&pEnd,10); 226 | nFilters = strtol(argv[6],&pEnd,10); 227 | past_filter_samples = strtol(argv[7],&pEnd,10); 228 | nRuns = 1; 229 | } 230 | else if (input_type == 'r' && argc==8) { 231 | signal_length = strtol(argv[2],&pEnd,10); 232 | filter_length = strtol(argv[3],&pEnd,10); 233 | past_filter_samples = strtol(argv[4],&pEnd,10); 234 | convolution_length = strtol(argv[5],&pEnd,10); 235 | nFilters = strtol(argv[6],&pEnd,10); 236 | 237 | nRuns = strtol(argv[7],&pEnd,10); 238 | } 239 | else { 240 | printf("Parameters error!\n"); 241 | printf(" 1) Input type: 'r' or 'f' \n"); 242 | printf("----------------------------------\n"); 243 | printf("Parameters if input type is 'f' - file input provided by user\n"); 244 | printf(" 2) Input signal file\n"); 245 | printf(" 3) Input filter file\n"); 246 | printf(" 4) Output signal file\n"); 247 | printf(" 5) Convolution length in samples\n"); 248 | printf(" 6) number of filters\n"); 249 | printf(" 7) number of past samples in the filter.\n"); 250 | printf(" for past filter (causal) it is (filter_length - 1)\n"); 251 | printf(" for odd centered filter it is floor(filter_length/2)\n"); 252 | printf(" for future filter it is 0\n"); 253 | printf(" Example: CONV.exe f signal.dat filter.dat output.dat 2048 32 192\n"); 254 | printf("----------------------------------\n"); 255 | printf("Parameters if input type is 'r' - random input generated by the code\n"); 256 | printf(" 2) Signal length in number of time samples\n"); 257 | printf(" 3) Filter length in samples\n"); 258 | printf(" 4) number of past samples in the filter.\n"); 259 | printf(" for past filter (causal) it is (filter_length - 1)\n"); 260 | printf(" for odd centered filter it is floor(filter_length/2)\n"); 261 | printf(" for future filter it is 0\n"); 262 | printf(" 5) Convolution length in samples\n"); 263 | printf(" 6) Number of filters\n"); 264 | printf(" 7) number of GPU kernel runs\n"); 265 | printf(" Example: CONV.exe r 2097152 193 192 2048 32 10\n"); 266 | return 1; 267 | } 268 | 269 | if (DEBUG) { 270 | printf("Parameters:\n"); 271 | printf("Input signal and filters are "); 272 | if (input_type == 'r') { 273 | printf("randomly generated.\n"); 274 | printf("Signal length: %d samples\n", signal_length); 275 | printf("Filter length: %d samples\n", filter_length); 276 | printf("# of past samples: %d samples\n", past_filter_samples); 277 | printf("Convolution length: %d samples\n", convolution_length); 278 | printf("Number of filters: %d\n", nFilters); 279 | printf("nRuns: %d\n", nRuns); 280 | } 281 | if (input_type == 'f') { 282 | printf("read from file.\n"); 283 | printf("Input signal: %s\n", input_signal_file); 284 | printf("Input filter: %s\n", input_filter_file); 285 | printf("Output signal: %s\n", output_signal_file); 286 | printf("Convolution length: %d samples\n", convolution_length); 287 | printf("nFilters: %d\n", nFilters); 288 | printf("# of past samples: %d samples\n", past_filter_samples); 289 | printf("nRuns: %d\n", nRuns); 290 | printf("-----------------\n"); 291 | } 292 | } 293 | 294 | #ifdef POST_PROCESS 295 | float h=20.0; 296 | int offset_modifier = 4; 297 | #else 298 | float h=1.0; 299 | int offset_modifier = 0; 300 | #endif 301 | 302 | float *h_input; 303 | float *h_output; 304 | float *h_filters; // filters in time-domain 305 | float *h_filters_padded; // filters in time-domain padded with zeroes 306 | 307 | if (input_type == 'f') { 308 | int error=0; 309 | error += Load_signal(input_signal_file, &signal_length, &h_input); 310 | error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters); 311 | if( error>0 ){exit(1);} 312 | else if (VERBOSE) printf("File loaded\n"); 313 | } 314 | 315 | //----------------> Results 316 | double execution_time = 0; 317 | Performance_results CONV_cuFFT; 318 | CONV_cuFFT.Assign(signal_length, filter_length, nFilters, nRuns, 0, convolution_length, nFilters, "CONV_R2R_kFFT.dat", "one"); 319 | 320 | int corrected_filter_length; 321 | if( filter_length%2==0 ) corrected_filter_length = filter_length + 1; 322 | else corrected_filter_length = filter_length; 323 | int useful_part_size = convolution_length - (corrected_filter_length + offset_modifier) + 1; 324 | useful_part_size = 2*(useful_part_size>>1); 325 | int nConvolutions = (signal_length + useful_part_size - 1)/useful_part_size; 326 | if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);} 327 | 328 | 329 | if (input_type == 'r') { 330 | h_input = (float *)malloc(signal_length*sizeof(float)); 331 | h_filters = (float *)malloc(filter_length*nFilters*sizeof(float)); 332 | srand(time(NULL)); 333 | Generate_signal(h_input, signal_length); 334 | Generate_random_filter(h_filters, filter_length, nFilters); 335 | if (VERBOSE) printf("Signal and filters generated\n"); 336 | } 337 | 338 | size_t filter_size_padded = nFilters*convolution_length; 339 | h_filters_padded = (float*)malloc(filter_size_padded*sizeof(float)); 340 | Pad_templates(h_filters, h_filters_padded, filter_length, corrected_filter_length, convolution_length, nFilters); 341 | 342 | size_t output_size = nFilters*useful_part_size*nConvolutions; 343 | h_output = (float*)malloc(output_size*sizeof(float)); 344 | 345 | if (VERBOSE) printf("Convolution - kFFT\n"); 346 | 347 | //----------------> GPU kernel 348 | GPU_convolution_OLS_customFFT(h_input, h_output, h_filters_padded, signal_length, convolution_length, corrected_filter_length, past_filter_samples, nFilters, nRuns, h, offset_modifier, DEVICEID, &execution_time); 349 | CONV_cuFFT.GPU_time = execution_time; 350 | if(VERBOSE) printf(" Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time); 351 | if(VERBOSE) {cout << " All parameters: "; CONV_cuFFT.Print();} 352 | if(WRITE) CONV_cuFFT.Save(); 353 | //----------------> GPU kernel 354 | 355 | if(CHECK){ 356 | double total_error, mean_error; 357 | printf("Checking results...\n"); 358 | Full_CONV_check(h_output, h_input, h_filters, signal_length, filter_length, past_filter_samples, useful_part_size, (filter_length>>1), convolution_length, nConvolutions, nFilters, h, &total_error, &mean_error); 359 | //printf("Total error: %e; Mean error: %e\n", total_error, mean_error); 360 | } 361 | 362 | if (input_type == 'f') { 363 | Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file); 364 | } 365 | 366 | free(h_input); 367 | free(h_output); 368 | free(h_filters_padded); 369 | free(h_filters); 370 | 371 | cudaDeviceReset(); 372 | 373 | if (VERBOSE) printf("Finished!\n"); 374 | 375 | return (0); 376 | } 377 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/Makefile: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # CUDA_HOME are supposed to be on default position 3 | # and set it in your PATH .bashrc 4 | ############################################################### 5 | INC := -I${CUDA_HOME}/include 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft -lcuda 7 | 8 | GCC = g++ 9 | NVCC = ${CUDA_HOME}/bin/nvcc 10 | 11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo 12 | 13 | GCC_OPTS =-O3 -Wall -Wextra $(INC) 14 | 15 | ANALYZE = CONV.exe 16 | 17 | 18 | ifdef reglim 19 | NVCCFLAGS += --maxrregcount=$(reglim) 20 | endif 21 | 22 | all: clean onefilter 23 | 24 | onefilter: CONV_SM_OLS_R2R.o CONV-32bit_customFFT.o Makefile 25 | $(NVCC) -o CONV.exe CONV_SM_OLS_R2R.o CONV-32bit_customFFT.o $(LIB) $(NVCCFLAGS) 26 | 27 | CONV-32bit_customFFT.o: timer.h utils_cuda.h 28 | $(NVCC) -c CONV-32bit_customFFT.cu $(NVCCFLAGS) 29 | 30 | CONV_SM_OLS_R2R.o: CONV_SM_OLS_R2R.cpp 31 | $(GCC) -c CONV_SM_OLS_R2R.cpp $(GCC_OPTS) 32 | 33 | clean: 34 | rm -f *.o *.~ CONV.exe 35 | 36 | 37 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/When_cuFFT_wins.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV.exe; 4 | rm *.o; 5 | make reglim=0 > /dev/null 2>&1 6 | for convlength in 256 512 1024 2048 4096; 7 | do 8 | for tempsize in {64..4096..32} 9 | do 10 | ./CONV.exe r 2097152 $tempsize $convlength 32 20 11 | done 12 | done 13 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/benchmark_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm CONV_R2R_kFFT.dat; 4 | 5 | ./benchmark_performance.sh; 6 | mv CONV_R2R_kFFT.dat OLS_SM_R2R_pp_perf.dat; 7 | 8 | ./When_cuFFT_wins.sh 9 | mv CONV_R2R_kFFT.dat OLS_SM_R2R_pp_whencuFFTwins.dat; -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/benchmark_performance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | rm CONV.exe; 5 | rm *.o; 6 | make reglim=$reg > /dev/null 2>&1 7 | for convlength in 512 1024 2048 4096 8 | do 9 | for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073; 10 | do 11 | for templates in 2 4 8 16 32 64 96; 12 | do 13 | ./CONV.exe r 262144 $tempsize $convlength $templates 20 14 | ./CONV.exe r 524288 $tempsize $convlength $templates 20 15 | ./CONV.exe r 1048576 $tempsize $convlength $templates 20 16 | ./CONV.exe r 2097152 $tempsize $convlength $templates 20 17 | ./CONV.exe r 4194304 $tempsize $convlength $templates 20 18 | ./CONV.exe r 8388608 $tempsize $convlength $templates 20 19 | done 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/conv_check_R2R.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double max_error = 1.0e-4; 11 | 12 | 13 | void CPU_time_domain(float *h_input, float *h_CPU_output_timedomain, float *h_filters, int signal_length, int filter_length, int past_filter_samples, int nFilters){ 14 | for(int f=0; f=0 && signal_pos=(nTimesamples-1) ) { 48 | right = h_CPU_output_reduced[f*nTimesamples + nTimesamples - 1]; 49 | } 50 | else { 51 | right = h_CPU_output_reduced[pos+1]; 52 | } 53 | 54 | result = (left - right)/(2.0*h); 55 | h_CPU_postprocessed[pos] = result; 56 | } 57 | } 58 | } 59 | 60 | float get_error(float A, float B){ 61 | float error, div_error=10000, per_error=10000, order=0; 62 | int power; 63 | if(A<0) A = -A; 64 | if(B<0) B = -B; 65 | 66 | if (A>B) { 67 | div_error = A-B; 68 | if(B>10){ 69 | power = (int) log10(B); 70 | order = pow(10,power); 71 | div_error = div_error/order; 72 | } 73 | } 74 | else { 75 | div_error = B-A; 76 | if(A>10){ 77 | power = (int) log10(A); 78 | order = pow(10,power); 79 | div_error = div_error/order; 80 | } 81 | } 82 | 83 | if(div_error max_error ){ 100 | nErrors++; 101 | if(cislo<40){ 102 | printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d\n", error, CPU_result[pos], GPU_result[pos], x, y, (int) (x/useful_part_size)); 103 | cislo++; 104 | } 105 | } 106 | } 107 | } 108 | mean_error_l = total_error_l/(((double) dim_x)*((double) dim_y)); 109 | (*total_error) = total_error_l; 110 | (*mean_error) = mean_error_l; 111 | return(nErrors); 112 | } 113 | 114 | int Compare_data(float *CPU_result, float *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){ 115 | double total_error_l = 0, mean_error_l = 0; 116 | size_t nErrors = 0; 117 | int cislo = 0; 118 | float error; 119 | 120 | for(int y=0; y max_error ){ 132 | nErrors++; 133 | if(cislo<40){ 134 | printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU, GPU, x, y, (int) (x/useful_part_size), x%useful_part_size); 135 | cislo++; 136 | } 137 | } 138 | } 139 | } 140 | mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y)); 141 | (*total_error) = total_error_l; 142 | (*mean_error) = mean_error_l; 143 | return(nErrors); 144 | } 145 | 146 | 147 | void Full_CONV_check(float *GPU_result, float *h_input_real, float *h_filters, int signal_length, int filter_length, int past_filter_samples, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, float h, double *cumulative_error, double *mean_error){ 148 | float GPU_scale, CPU_scale; 149 | int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples; 150 | 151 | 152 | //----------------------- CPU time-domain 153 | size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters; 154 | float *h_CPU_output_timedomain; 155 | float *h_CPU_postprocessed; 156 | h_CPU_output_timedomain = (float *)malloc(output_size_timedomain*sizeof(float)); 157 | h_CPU_postprocessed = (float *)malloc(output_size_timedomain*sizeof(float)); 158 | memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float)); 159 | memset(h_CPU_postprocessed, 0.0, output_size_timedomain*sizeof(float)); 160 | 161 | printf("\n--> Time-domain convolution:"); 162 | CPU_time_domain(h_input_real, h_CPU_output_timedomain, h_filters, signal_length, filter_length, past_filter_samples, nFilters); 163 | 164 | printf("\n--> Post-processing:\n"); 165 | CPU_postprocess(h_CPU_postprocessed, h_CPU_output_timedomain, (signal_length + filter_length - 1), nFilters, h); 166 | 167 | #ifdef POST_PROCESS 168 | 169 | printf("\n--> Comparison to CPU time-domain with post-processing:\n"); 170 | GPU_scale = conv_length/2; 171 | GPU_scale = 1.0; 172 | CPU_scale = 1.0; 173 | GPU_offset = 0; 174 | CPU_offset = 0; 175 | GPU_dim_x = nConvolutions*useful_part_size; 176 | CPU_dim_x = (signal_length + filter_length - 1); 177 | nSamples = signal_length - offset; 178 | Compare_data(h_CPU_postprocessed, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 179 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 180 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 181 | else printf("FAILED\n"); 182 | 183 | #else 184 | 185 | printf("\n--> Comparison to CPU time-domain:\n"); 186 | GPU_scale = conv_length/2; 187 | GPU_scale = 1.0; 188 | CPU_scale = 1.0; 189 | GPU_offset = 0; 190 | CPU_offset = 0; 191 | GPU_dim_x = nConvolutions*useful_part_size; 192 | CPU_dim_x = (signal_length + filter_length - 1); 193 | nSamples = signal_length; 194 | Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error); 195 | //printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error); 196 | if((*mean_error)<1.0e-4) printf("PASSED\n"); 197 | else printf("FAILED\n"); 198 | 199 | #endif 200 | 201 | free(h_CPU_output_timedomain); 202 | free(h_CPU_postprocessed); 203 | //-------------------------------------------------< 204 | 205 | 206 | 207 | 208 | 209 | } 210 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/debug.h: -------------------------------------------------------------------------------- 1 | #define VERBOSE true 2 | #define DEBUG false 3 | #define CHECK true 4 | #define WRITE true 5 | 6 | #define DEVICEID 0 7 | #define POST_PROCESS 8 | 9 | 10 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/params.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAdamek/GPU_Overlap-and-save_convolution/f2ee0e3323a3e3f6f1bb96e864ad7fce5c9234ec/GPU_OLS_R2R_sharedmemory_pp/params.h -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/results.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | class Performance_results{ 9 | public: 10 | double GPU_time; 11 | int nTimesamples; 12 | int template_length; 13 | int nTemplates; 14 | int nRuns; 15 | int reglim; 16 | int OaS_conv_size; 17 | int templates_per_block; 18 | char filename[200]; 19 | char kernel[10]; 20 | 21 | Performance_results() { 22 | GPU_time=0; 23 | } 24 | 25 | void Save(){ 26 | ofstream FILEOUT; 27 | FILEOUT.open (filename, std::ofstream::out | std::ofstream::app); 28 | FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 29 | FILEOUT.close(); 30 | } 31 | 32 | void Print(){ 33 | cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl; 34 | } 35 | 36 | void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){ 37 | nTimesamples = t_nTimesamples; 38 | template_length = t_template_length; 39 | nTemplates = t_nTemplates; 40 | nRuns = t_nRuns; 41 | reglim = t_reglim; 42 | OaS_conv_size = t_OaS_conv_size; 43 | templates_per_block = t_templates_per_block; 44 | sprintf(filename,"%s", t_filename); 45 | sprintf(kernel,"%s",t_kernel); 46 | } 47 | 48 | }; 49 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/run_convolution.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --job-name=AAFFT_R2R_conv 6 | #SBATCH --partition=htc 7 | #SBATCH --gres=gpu:1 --constraint='gpu_sku:P100' 8 | 9 | module load gpu/cuda/10.0.130 10 | 11 | ./benchmark_all.sh 12 | 13 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /GPU_OLS_R2R_sharedmemory_pp/utils_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Karel Adámek 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /OLS_generate_files/Example_files.cpp: -------------------------------------------------------------------------------- 1 | //******************************************************************************************** 2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 3 | //* Copyright (C) 2017 Adámek Karel 4 | //* 5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 6 | //******************************************************************************************** 7 | 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | struct float2 { 19 | float x; 20 | float y; 21 | }; 22 | 23 | void Generate_signal(float2 *h_input, int nTimesamples){ 24 | for(int f=0; fnSamples) boxcar_width=nSamples; 62 | for(int f=0; f=(nSamples/2-boxcar_width/2) && f<( nSamples/2+boxcar_width/2) ){ 64 | h_templates[t*nSamples + f].x=1; 65 | h_templates[t*nSamples + f].y=0; 66 | } 67 | else { 68 | h_templates[t*nSamples + f].x=0; 69 | h_templates[t*nSamples + f].y=0; 70 | } 71 | } 72 | } 73 | } 74 | 75 | 76 | 77 | int GPU_CONV(float2 *h_input_signal, float2 *h_output_plane_reduced, float2 *h_templates, int useful_part_size, int offset, int template_length, int nConvolutions, int nTemplates, int nRuns, double *execution_time); 78 | int GPU_CONV_debug(float2 *h_input_signal, float2 *h_GPU_input_signal_extended, float2 *h_GPU_input_signal_extended_FFT, float2 *h_output_plane, float2 *h_output_plane_IFFT, float2 *h_output_plane_reduced, float2 *h_templates, int useful_part_size, int offset, int template_length, int nConvolutions, int nTemplates, int nRuns, double *execution_time); 79 | 80 | 81 | int main(int argc, char* argv[]) { 82 | int nTimesamples; 83 | int template_length; 84 | int nTemplates; 85 | char filter_file[100]; 86 | char signal_file[100]; 87 | 88 | char * pEnd; 89 | if (argc==6) { 90 | nTimesamples = strtol(argv[1],&pEnd,10); 91 | template_length = strtol(argv[2],&pEnd,10); 92 | nTemplates = strtol(argv[3],&pEnd,10); 93 | if (strlen(argv[4])>100) {printf("Filename of input signal file is too long\n"); exit(2);} 94 | sprintf(signal_file,"%s",argv[4]); 95 | if (strlen(argv[5])>100) {printf("Filename of input filter file is too long\n"); exit(2);} 96 | sprintf(filter_file,"%s",argv[5]); 97 | } 98 | else { 99 | printf("Argument error!\n"); 100 | printf(" 1) Signal length in number of time samples (min 15000 samples)\n"); 101 | printf(" 2) Filter length Example:129\n"); 102 | printf(" 3) Number of filters\n"); 103 | printf(" 4) Name of the file to export signal to\n"); 104 | printf(" 5) Name of the file to export filters to\n"); 105 | return 1; 106 | } 107 | 108 | if (nTimesamples<15000) {printf("Number of samples must be higher then 15000 samples\n"); exit(1);} 109 | 110 | size_t input_size = nTimesamples; 111 | size_t template_size_time = nTemplates*template_length; 112 | 113 | float2 *h_input_signal; 114 | float2 *h_templates; 115 | 116 | h_input_signal = (float2 *)malloc(input_size*sizeof(float2)); 117 | h_templates = (float2 *)malloc(template_size_time*sizeof(float2)); 118 | 119 | memset(h_input_signal, 0.0, input_size*sizeof(float2)); 120 | memset(h_templates, 0.0, template_size_time*sizeof(float2)); 121 | 122 | Generate_signal(h_input_signal, nTimesamples); 123 | Generate_templates(h_templates, template_length, nTemplates); 124 | 125 | 126 | std::ofstream FILEOUT; 127 | FILEOUT.open(signal_file); 128 | for(int ts=0; ts 0){ 38 | best_line <- which.min(slengthdata[[4]]); 39 | bestAAFFTperformance<-rbind(bestAAFFTperformance,slengthdata[best_line,]); 40 | } 41 | } 42 | } 43 | } 44 | #rm(AAFFTdata); 45 | #--------------------------------------------------< 46 | 47 | #------------------------------------------------- 48 | #Process cuFFT performance data 49 | bestcuFFTperformance <- cuFFTdata[1,]; bestcuFFTperformance <- bestcuFFTperformance[-1,]; 50 | 51 | #Find best performing configuration for each case (template width, convolution size, ...) 52 | for (nLenght in template_lengths){ 53 | templengthdata <- cuFFTdata[(cuFFTdata[[2]]==nLenght),]; 54 | for (nTemplates in number_of_templates){ 55 | templatedata <- templengthdata[(templengthdata[[3]]==nTemplates),]; 56 | for (slength in signal_lengths){ 57 | slengthdata<-templatedata[(templatedata[[1]]==slength),]; 58 | if (length(slengthdata[,1]) > 0){ 59 | best_line <- which.min(slengthdata[[4]]); 60 | bestcuFFTperformance<-rbind(bestcuFFTperformance,slengthdata[best_line,]); 61 | } 62 | } 63 | } 64 | } 65 | #rm(cuFFTdata); 66 | #--------------------------------------------------< 67 | 68 | 69 | #writing best performance grouped by filter length 70 | if(export_best_performance==1){ 71 | for (nLenght in template_lengths){ 72 | templengthdata<-bestAAFFTperformance[(bestAAFFTperformance[[2]]==nLenght),]; 73 | for (slength in signal_lengths){ 74 | slengthdata<-templengthdata[(templengthdata[[1]]==slength),]; 75 | filename="Best_kFFT"; 76 | filename<-paste(filename, slength, nLenght, sep="_"); 77 | filename<-paste(filename, output_extension, sep=""); 78 | unlink(filename); 79 | write.table(slengthdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE); 80 | } 81 | } 82 | 83 | #writing out the results 84 | extension=".txt"; 85 | for (nLenght in template_lengths){ 86 | templengthdata<-bestcuFFTperformance[(bestcuFFTperformance[[2]]==nLenght),]; 87 | for (slength in signal_lengths){ 88 | slengthdata<-templengthdata[(templengthdata[[1]]==slength),]; 89 | filename="Best_cuFFT"; 90 | filename<-paste(filename, slength, nLenght, sep="_"); 91 | filename<-paste(filename, output_extension, sep=""); 92 | unlink(filename); 93 | write.table(slengthdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE); 94 | } 95 | } 96 | } 97 | 98 | 99 | #export data grouped by template width 100 | if(export_data_grouped_by_template_width==1){ 101 | for (nLenght in template_lengths){ 102 | AAFFTdata_fixedTemplate <- bestAAFFTperformance[(bestAAFFTperformance[[2]]==nLenght),]; 103 | #set up container 104 | AAFFTdata_temp<-AAFFTdata_fixedTemplate[(AAFFTdata_fixedTemplate[[1]]==signal_lengths[[1]]),]; 105 | resultdata<-cbind(AAFFTdata_temp[[3]]); 106 | #add columns with signal length and time for each signal length 107 | for (slength in signal_lengths){ 108 | AAFFTdata_temp<-templengthdata[(templengthdata[[1]]==slength),]; 109 | if (length(AAFFTdata_temp[,1]) > 0){ 110 | resultdata<-cbind(resultdata, AAFFTdata_temp[[1]], AAFFTdata_temp[[4]]); 111 | } 112 | } 113 | #export data 114 | filename="AAFFT_results"; 115 | filename<-paste(filename, nLenght, sep="_"); 116 | filename<-paste(filename, output_extension, sep=""); 117 | unlink(filename); 118 | write.table(resultdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE); 119 | } 120 | 121 | for (nLenght in template_lengths){ 122 | cuFFTdata_fixedTemplate <- bestcuFFTperformance[(bestcuFFTperformance[[2]]==nLenght),]; 123 | #set up container 124 | cuFFTdata_temp<-cuFFTdata_fixedTemplate[(cuFFTdata_fixedTemplate[[1]]==signal_lengths[[1]]),]; 125 | resultdata<-cbind(cuFFTdata_temp[[3]]); 126 | #add columns with signal length and time for each signal length 127 | for (slength in signal_lengths){ 128 | cuFFTdata_temp<-templengthdata[(templengthdata[[1]]==slength),]; 129 | if (length(cuFFTdata_temp[,1]) > 0){ 130 | resultdata<-cbind(resultdata, cuFFTdata_temp[[1]], cuFFTdata_temp[[4]]); 131 | } 132 | } 133 | #export data 134 | filename="cuFFT_results"; 135 | filename<-paste(filename, nLenght, sep="_"); 136 | filename<-paste(filename, output_extension, sep=""); 137 | unlink(filename); 138 | write.table(resultdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE); 139 | } 140 | } 141 | 142 | 143 | #export data time and speedup vs nTemplates grouped by template width, both AAFFT and cuFFT in one file 144 | if(export_nTemplates_time_speedup_gr_template_width_one_file==1){ 145 | for (nLenght in template_lengths){ 146 | AAFFTdata_fixedTemplate <- bestAAFFTperformance[(bestAAFFTperformance[[2]]==nLenght),]; 147 | cuFFTdata_fixedTemplate <- bestcuFFTperformance[(bestcuFFTperformance[[2]]==nLenght),]; 148 | 149 | #set up container 150 | AAFFTdata_temp <- AAFFTdata_fixedTemplate[(AAFFTdata_fixedTemplate[[1]]==signal_lengths[[1]]),]; 151 | resultdata<-cbind(AAFFTdata_temp[[3]]); 152 | #add columns with signal length and time for each signal length 153 | for (slength in signal_lengths){ 154 | AAFFTdata_temp <- AAFFTdata_fixedTemplate[(AAFFTdata_fixedTemplate[[1]]==slength),]; 155 | cuFFTdata_temp <- cuFFTdata_fixedTemplate[(cuFFTdata_fixedTemplate[[1]]==slength),]; 156 | #Creating dataframe with results 157 | resultdata<-cbind(resultdata, AAFFTdata_temp[[1]], cuFFTdata_temp[[4]], AAFFTdata_temp[[4]], cuFFTdata_temp[[4]]/AAFFTdata_temp[[4]]); 158 | } 159 | #export data 160 | filename="Results_TitanV_cuFFT_callbacks_R2R_speedup"; 161 | filename<-paste(filename,nLenght,sep="_"); 162 | filename<-paste(filename,output_extension,sep=""); 163 | unlink(filename); 164 | write.table(resultdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE); 165 | } 166 | } 167 | rm(resultdata) 168 | 169 | #export data time and speedup vs signal length grouped by number of templates, both AAFFT and cuFFT in one file 170 | if(export_signal_length_time_speedup_gr_nTemplates_one_file==1){ 171 | for (nLenght in template_lengths){ 172 | AAFFTdata_fixedTemplate <- bestAAFFTperformance[(bestAAFFTperformance[[2]]==nLenght),]; 173 | cuFFTdata_fixedTemplate <- bestcuFFTperformance[(bestcuFFTperformance[[2]]==nLenght),]; 174 | 175 | #set up container 176 | AAFFTdata_temp <- AAFFTdata_fixedTemplate[(AAFFTdata_fixedTemplate[[3]]==number_of_templates[[3]]),]; 177 | signal_length <- cbind(AAFFTdata_temp[[1]]); 178 | resultdata <- cbind(AAFFTdata_temp[[1]]); 179 | #add columns with signal length and time for each signal length 180 | for (nTemplates in number_of_templates){ 181 | temporaryresults <- signal_length 182 | AAFFTdata_temp <- AAFFTdata_fixedTemplate[(AAFFTdata_fixedTemplate[[3]]==nTemplates),]; 183 | cuFFTdata_temp <- cuFFTdata_fixedTemplate[(cuFFTdata_fixedTemplate[[3]]==nTemplates),]; 184 | #Creating dataframe with results 185 | temporaryresults <-merge(temporaryresults, AAFFTdata_temp, by.x='V1', by.y='V1', all.x=TRUE, all.y=FALSE) 186 | temporaryresults <-merge(temporaryresults, cuFFTdata_temp, by.x='V1', by.y='V1', all.x=TRUE, all.y=FALSE) 187 | local_results<-cbind(temporaryresults[[3]], temporaryresults[[12]], temporaryresults[[4]], temporaryresults[[12]]/temporaryresults[[4]]); 188 | resultdata<-cbind(resultdata, local_results); 189 | rm(temporaryresults); 190 | } 191 | #export data 192 | filename="Results_TitanV_cuFFT_callbacks_R2R_speedup_signal_length"; 193 | filename<-paste(filename,nLenght,sep="_"); 194 | filename<-paste(filename,output_extension,sep=""); 195 | unlink(filename); 196 | write.table(resultdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE); 197 | } 198 | } 199 | --------------------------------------------------------------------------------