├── GPU_OLS_C2C_cuFFT_callbacks
    ├── CONV-32bit_cuFFT.cu
    ├── CONV_C2C.cpp
    ├── Makefile
    ├── When_cuFFT_wins_cuFFT.sh
    ├── benchmark_all.sh
    ├── benchmark_performance.sh
    ├── conv_check.h
    ├── debug.h
    ├── params.h
    ├── results.h
    ├── timer.h
    └── utils_cuda.h
├── GPU_OLS_C2C_cuFFT_callbacks_pp
    ├── CONV-32bit_cuFFT.cu
    ├── CONV_C2C.cpp
    ├── Makefile
    ├── When_cuFFT_wins_cuFFT.sh
    ├── benchmark_all.sh
    ├── benchmark_performance.sh
    ├── conv_check.h
    ├── debug.h
    ├── params.h
    ├── results.h
    ├── timer.h
    └── utils_cuda.h
├── GPU_OLS_C2C_sharedmemory
    ├── CONV-32bit_customFFT.cu
    ├── CONV_SM_OLS_C2C.cpp
    ├── Makefile
    ├── When_cuFFT_wins.sh
    ├── benchmark_all.sh
    ├── benchmark_performance.sh
    ├── conv_check.h
    ├── debug.h
    ├── params.h
    ├── results.h
    ├── timer.h
    └── utils_cuda.h
├── GPU_OLS_C2C_sharedmemory_pp
    ├── CONV-32bit_customFFT.cu
    ├── CONV_SM_OLS_C2C.cpp
    ├── Makefile
    ├── When_cuFFT_wins.sh
    ├── benchmark_all.sh
    ├── benchmark_performance.sh
    ├── conv_check.h
    ├── debug.h
    ├── params.h
    ├── results.h
    ├── timer.h
    └── utils_cuda.h
├── GPU_OLS_R2R_cuFFT_callbacks
    ├── CONV-32bit_cuFFT.cu
    ├── CONV_R2R.cpp
    ├── Makefile
    ├── When_cuFFT_wins_cuFFT.sh
    ├── benchmark_all.sh
    ├── benchmark_performance.sh
    ├── conv_check_R2R.h
    ├── debug.h
    ├── params.h
    ├── results.h
    ├── timer.h
    └── utils_cuda.h
├── GPU_OLS_R2R_cuFFT_callbacks_pp
    ├── CONV-32bit_cuFFT.cu
    ├── CONV_R2R.cpp
    ├── Makefile
    ├── When_cuFFT_wins_cuFFT.sh
    ├── benchmark_all.sh
    ├── benchmark_performance.sh
    ├── conv_check_R2R.h
    ├── debug.h
    ├── params.h
    ├── results.h
    ├── timer.h
    └── utils_cuda.h
├── GPU_OLS_R2R_sharedmemory
    ├── CONV-32bit_customFFT.cu
    ├── CONV_SM_OLS_R2R.cpp
    ├── Makefile
    ├── When_cuFFT_wins.sh
    ├── benchmark_all.sh
    ├── benchmark_performance.sh
    ├── conv_check_R2R.h
    ├── debug.h
    ├── params.h
    ├── results.h
    ├── timer.h
    └── utils_cuda.h
├── GPU_OLS_R2R_sharedmemory_pp
    ├── CONV-32bit_customFFT.cu
    ├── CONV_SM_OLS_R2R.cpp
    ├── Makefile
    ├── When_cuFFT_wins.sh
    ├── benchmark_all.sh
    ├── benchmark_performance.sh
    ├── conv_check_R2R.h
    ├── debug.h
    ├── params.h
    ├── results.h
    ├── run_convolution.sh
    ├── timer.h
    └── utils_cuda.h
├── LICENSE
├── OLS_generate_files
    ├── Example_files.cpp
    └── Makefile
├── README.md
└── process_AAFFT_results.R


/GPU_OLS_C2C_cuFFT_callbacks/CONV_C2C.cpp:
--------------------------------------------------------------------------------
  1 | //********************************************************************************************
  2 | //* This is a GPU implementation of the overlap-and-save method for calculating convolution.
  3 | //* Copyright (C) 2019  Adámek Karel
  4 | //* 
  5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 
  6 | //********************************************************************************************
  7 | 
  8 | 
  9 | #include "debug.h"
 10 | #include "params.h"
 11 | #include "results.h"
 12 | 
 13 | #include <stdio.h>
 14 | #include <string.h>
 15 | #include <math.h>
 16 | #include <time.h>
 17 | #include <stdlib.h>
 18 | #include <cuda.h>
 19 | #include <cuda_runtime.h>
 20 | #include <cuda_runtime_api.h>
 21 | 
 22 | #include "conv_check.h"
 23 | 
 24 | void Generate_signal(float2 *h_input, int nTimesamples){
 25 | 	for(int f=0; f<nTimesamples; f++){
 26 | 		h_input[f].x=rand() / (float)RAND_MAX;
 27 | 		h_input[f].y=rand() / (float)RAND_MAX;
 28 | 	}
 29 | }
 30 | 
 31 | void Generate_templates(float2 *h_filters, int nSamples, int nFilters){
 32 | 	for(int t=0; t<nFilters; t++){
 33 | 		for(int f=0; f<nSamples; f++){
 34 | 			h_filters[t*nSamples + f].x=rand() / (float)RAND_MAX;
 35 | 			h_filters[t*nSamples + f].y=rand() / (float)RAND_MAX;
 36 | 		}
 37 | 	}
 38 | }
 39 | 
 40 | void Pad_templates(float2 *h_filters_time, float2 *h_filters, int template_size, int convolution_size, int nFilters){
 41 | 	for(int f=0; f<nFilters*convolution_size; f++){
 42 | 		h_filters[f].x=0;
 43 | 		h_filters[f].y=0;
 44 | 	}
 45 | 	
 46 | 	for(int t=0; t<nFilters; t++){
 47 | 		for(int f=0; f<template_size; f++){
 48 | 			// padding for centered filter
 49 | 			if(f>=template_size/2) {
 50 | 				h_filters[t*convolution_size + f - template_size/2].x=h_filters_time[t*template_size + f].x;
 51 | 				h_filters[t*convolution_size + f - template_size/2].y=h_filters_time[t*template_size + f].y;
 52 | 			}
 53 | 			else if(f<template_size/2) {
 54 | 				h_filters[t*convolution_size + f + convolution_size - template_size/2].x = h_filters_time[t*template_size + f].x;
 55 | 				h_filters[t*convolution_size + f + convolution_size - template_size/2].y = h_filters_time[t*template_size + f].y;
 56 | 			}
 57 | 		}
 58 | 	}
 59 | }
 60 | 
 61 | 
 62 | int Write_output(float2 *h_output, int nTimesamples, int nFilters, char *output_signal_file){
 63 | 	int error=0;
 64 | 	ofstream FILEOUT;
 65 | 	FILEOUT.open(output_signal_file);
 66 | 	if (!FILEOUT.fail()){
 67 | 		if (VERBOSE) printf("Writing output\n");
 68 | 		for(int f=0; f<nFilters; f++){
 69 | 			if (VERBOSE) printf("[");
 70 | 			for(int Ts=0;Ts<nTimesamples;Ts++){
 71 | 				if(Ts%100000==0) {
 72 | 					if (VERBOSE) {
 73 | 						printf(".");
 74 | 						fflush(stdout);
 75 | 					}
 76 | 				}
 77 | 				FILEOUT << f << " " << Ts << " " << h_output[f*nTimesamples+Ts].x << " " << h_output[f*nTimesamples+Ts].y << endl;
 78 | 			}
 79 | 			FILEOUT << endl;
 80 | 			if (VERBOSE) printf("] filter=%d\n",f);
 81 | 		}
 82 | 	}
 83 | 	else {
 84 | 		cout << "Write to a file failed!" << endl;
 85 | 		error++;
 86 | 	}
 87 | 	FILEOUT.close();
 88 | 	return(error);
 89 | }
 90 | 
 91 | 
 92 | long int File_size_row_signal(ifstream &FILEIN){
 93 | 	std::size_t count=0;
 94 | 	FILEIN.seekg(0,ios::beg);
 95 | 	for(std::string line; std::getline(FILEIN, line); ++count){}
 96 | 	return((long int)count);
 97 | }
 98 | 
 99 | 
100 | int Load_signal(char *filename, int *nSamples, float2 **data){
101 | 	float real, imaginary;
102 | 	int file_size, cislo, error;
103 | 	error=0;
104 | 
105 | 	ifstream FILEIN;
106 | 	FILEIN.open(filename,ios::in);
107 | 	if (!FILEIN.fail()){
108 | 		error=0;
109 | 		file_size=File_size_row_signal(FILEIN);
110 | 		(*nSamples) = file_size;
111 | 		printf("nSamples:%d;\n", (*nSamples) );
112 | 
113 | 		if(file_size>0){
114 | 			*data = (float2*)malloc(file_size*sizeof(float2));
115 | 			memset( (*data), 0.0, file_size*sizeof(float2));
116 | 			if(*data==NULL){
117 | 				printf("\nAllocation error!\n");
118 | 				error++;
119 | 			}
120 | 		
121 | 			FILEIN.clear();
122 | 			FILEIN.seekg(0,ios::beg);
123 | 			
124 | 			for (cislo=0; cislo < file_size; cislo++) {
125 | 				FILEIN >> real >> imaginary;
126 | 				(*data)[cislo].x = real;
127 | 				(*data)[cislo].y = imaginary;
128 | 			}
129 | 		}
130 | 		else {
131 | 			printf("\nFile is void of any content!\n");
132 | 			error++;
133 | 		}
134 | 	}
135 | 	else {
136 | 		cout << "File not found -> " << filename << " <-" << endl;
137 | 		error++;
138 | 	}
139 | 	FILEIN.close();
140 | 	return(error);
141 | }
142 | 
143 | 
144 | int Load_filters(char *filename, int *nFilters, int *filter_length, float2 **data){
145 | 	float real, imaginary;
146 | 	int file_size, cislo, error, filter_size;
147 | 	error=0;
148 | 
149 | 	ifstream FILEIN;
150 | 	FILEIN.open(filename,ios::in);
151 | 	if (!FILEIN.fail()){
152 | 		error=0;
153 | 		file_size = File_size_row_signal(FILEIN);
154 | 		(*filter_length) = file_size/(*nFilters);
155 | 		filter_size = (*nFilters)*(*filter_length);
156 | 		printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size);
157 | 
158 | 		if(file_size>0){
159 | 			*data = (float2*)malloc( filter_size*sizeof(float2));
160 | 			memset( (*data), 0.0, filter_size*sizeof(float2));
161 | 			
162 | 			if(*data==NULL){
163 | 				printf("\nAllocation error!\n");
164 | 				error++;
165 | 			}
166 | 		
167 | 			FILEIN.clear();
168 | 			FILEIN.seekg(0,ios::beg);
169 | 
170 | 			for (cislo=0; cislo < filter_size; cislo++) {
171 | 				FILEIN >> real >> imaginary;
172 | 				(*data)[cislo].x = real;
173 | 				(*data)[cislo].y = imaginary;
174 | 			}
175 | 		}
176 | 		else {
177 | 			printf("\nFile is void of any content!\n");
178 | 			error++;
179 | 		}
180 | 	}
181 | 	else {
182 | 		cout << "File not found -> " << filename << " <-" << endl;
183 | 		error++;
184 | 	}
185 | 	FILEIN.close();
186 | 	return(error);
187 | }
188 | 
189 | 
190 | int GPU_CONV(float2 *h_input, float2 *h_output, float2 *h_filters, int signal_length, int filter_length, int nFilters, int nRuns, double *execution_time);
191 | 
192 | 
193 | int main(int argc, char* argv[]) {
194 | 	int nTimesamples;		// input signal length
195 | 	int filter_length;		// filter length
196 | 	int nFilters;			// number of filters
197 | 	int nRuns;
198 | 	char input_type='0';
199 | 	char input_filter_file[255];
200 | 	char input_signal_file[255];
201 | 	char output_signal_file[255];
202 | 	
203 | 	char * pEnd;
204 | 	if (argc>2) {
205 | 		if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);}
206 | 		input_type=*argv[1];
207 | 	}
208 | 	if (input_type == 'f' && argc==6) {
209 | 		if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);}
210 | 		sprintf(input_signal_file,"%s",argv[2]);
211 | 		if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);}
212 | 		sprintf(input_filter_file,"%s",argv[3]);
213 | 		if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);}
214 | 		sprintf(output_signal_file,"%s",argv[4]);
215 | 		nFilters = strtol(argv[5],&pEnd,10);
216 | 		nRuns = 1;
217 | 	}
218 | 	else if (input_type == 'r' && argc==6) {	
219 | 		nTimesamples  = strtol(argv[2],&pEnd,10);
220 | 		filter_length = strtol(argv[3],&pEnd,10);
221 | 		nFilters      = strtol(argv[4],&pEnd,10);
222 | 		nRuns         = strtol(argv[5],&pEnd,10);
223 | 	}
224 | 	else {
225 | 		printf("Parameters error!\n");
226 | 		printf(" 1) Input type: 'r' or 'f' \n");
227 | 		printf("----------------------------------\n");
228 | 		printf("'f' - file input provided by user\n");
229 | 		printf(" 2) Input signal file\n");
230 | 		printf(" 3) Input filter file\n");
231 | 		printf(" 4) Output signal file\n");
232 | 		printf(" 5) number of filters\n");
233 | 		printf(" Example: CONV.exe f signal.dat filter.dat output.dat 32\n");
234 | 		printf("----------------------------------\n");
235 | 		printf(" 'r' - random input generated by the code\n");
236 | 		printf(" 2) Signal length in number of time samples\n");
237 | 		printf(" 3) Filter length in samples\n");
238 | 		printf(" 4) Number of templates\n");
239 | 		printf(" 5) number of GPU kernel runs\n");
240 | 		printf(" Example: CONV.exe r 2097152 193 32 10\n");
241 |         return 1;
242 | 	}
243 | 	
244 | 	if (DEBUG) {
245 | 		printf("Parameters:\n");
246 | 		printf("Input signal and templates are ");
247 | 		if (input_type == 'r') {
248 | 			printf("randomly generated.\n");
249 | 			printf("Signal length:     %d samples\n", nTimesamples);
250 | 			printf("Filter length:     %d samples\n", filter_length);
251 | 			printf("Number of filters: %d\n", nFilters);
252 | 			printf("nRuns:             %d\n", nRuns);
253 | 		}
254 | 		if (input_type == 'f') {
255 | 			printf("read from file.\n");
256 | 			printf("Input signal:  %s\n", input_signal_file);
257 | 			printf("Input filter:  %s\n", input_filter_file);
258 | 			printf("Output signal: %s\n", output_signal_file);
259 | 			printf("nFilters:      %d\n", nFilters);
260 | 			printf("nRuns:         %d\n", nRuns);
261 | 			printf("-----------------\n");
262 | 		}
263 | 	}
264 | 
265 | 	float2 *h_input;			// input signal
266 | 	float2 *h_output;			// output plane
267 | 	float2 *h_filters_padded;	// filters in time-domain padded with zeroes
268 | 	float2 *h_filters;		    // filters in time-domain
269 | 	
270 | 	if (input_type == 'f') {
271 | 		int error=0;
272 | 		error += Load_signal(input_signal_file, &nTimesamples, &h_input);
273 | 		error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters);
274 | 		if( error>0 ){exit(1);}
275 | 		else if (VERBOSE) printf("File loaded\n");
276 | 	}
277 | 	
278 | 	if (input_type == 'r') {
279 | 		h_input          = (float2 *)malloc(nTimesamples*sizeof(float2));
280 | 		h_filters	     = (float2 *)malloc(filter_length*nFilters*sizeof(float2));
281 | 		srand(time(NULL));
282 | 		Generate_signal(h_input, nTimesamples);
283 | 		Generate_templates(h_filters, filter_length, nFilters);
284 | 		if (VERBOSE) printf("Signal and filters generated\n");
285 | 	}
286 | 	
287 | 	size_t filter_size_padded = nFilters*CONV_SIZE;
288 | 	h_filters_padded = (float2*)malloc(filter_size_padded*sizeof(float2));
289 | 	Pad_templates(h_filters, h_filters_padded, filter_length, CONV_SIZE, nFilters);
290 | 	
291 | 	//----------------> Results
292 | 	double execution_time = 0;
293 | 	Performance_results CONV_cuFFT;
294 | 	CONV_cuFFT.Assign(nTimesamples, filter_length, nFilters, nRuns, 0, CONV_SIZE, nFilters, "CONV_cuFFT.dat", "cuFFT");
295 | 	
296 | 	int offset           = filter_length/2; // we assume that filter is centered around zero
297 | 	int useful_part_size = CONV_SIZE - filter_length + 1;
298 | 	int nConvolutions    = (nTimesamples + useful_part_size - 1)/useful_part_size;
299 | 	if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);}
300 | 	if(DEBUG) {
301 | 		printf("offset=%d; useful_part_size=%d; nConvolutions=%d;\n", offset, useful_part_size, nConvolutions);
302 | 	}
303 | 	
304 | 	size_t output_size = nFilters*useful_part_size*nConvolutions;
305 | 	h_output = (float2*)malloc(output_size*sizeof(float2));
306 | 	
307 | 	if (VERBOSE) printf("Convolution - cuFFT\n");
308 | 
309 | 	//----------------> GPU kernel
310 | 	int GPU_error = GPU_CONV(h_input, h_output, h_filters_padded, nTimesamples, filter_length, nFilters, nRuns, &execution_time);
311 | 	CONV_cuFFT.GPU_time = execution_time;
312 | 	if(VERBOSE) printf("     Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time);
313 | 	if(VERBOSE) {cout << "     All parameters: "; CONV_cuFFT.Print();}
314 | 	if(WRITE && GPU_error==0) CONV_cuFFT.Save();
315 | 	//----------------> GPU kernel
316 | 	
317 | 	if(CHECK){
318 | 		double total_error, mean_error;
319 | 		printf("Checking results...\n");
320 | 		Full_CONV_check(h_output, h_input, h_filters, nTimesamples, filter_length, useful_part_size, (filter_length>>1), CONV_SIZE, nConvolutions, nFilters, &total_error, &mean_error);
321 | 		//printf("Total error: %e; Mean error: %e\n", total_error, mean_error);
322 | 	}
323 | 	
324 | 	if (input_type == 'f') {
325 | 		Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file);
326 | 	}
327 | 	
328 | 	free(h_input);
329 | 	free(h_output);
330 | 	free(h_filters_padded);
331 | 	free(h_filters);
332 | 
333 | 	cudaDeviceReset();
334 | 
335 | 	if (VERBOSE) printf("Finished!\n");
336 | 
337 | 	return (0);
338 | }
339 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/Makefile:
--------------------------------------------------------------------------------
 1 | ###############################################################
 2 | # CUDA_HOME are supposed to be on default position
 3 | # and set it in your PATH .bashrc
 4 | ###############################################################
 5 | INC := -I${CUDA_HOME}/include
 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft_static -lculibos -lcuda
 7 | 
 8 | GCC = g++
 9 | NVCC = ${CUDA_HOME}/bin/nvcc
10 | 
11 | NVCCFLAGS = -O3  -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo
12 | 
13 | GCC_OPTS =-O3 -Wall -Wextra $(INC)
14 | 
15 | ANALYZE = CONV.exe
16 | 
17 | ifdef reglim
18 | NVCCFLAGS += --maxrregcount=$(reglim)
19 | endif
20 | 
21 | all: clean analyze
22 | 
23 | analyze: CONV_C2C.o CONV-32bit_cuFFT.o Makefile
24 | 	$(NVCC) -o $(ANALYZE) CONV-32bit_cuFFT.o CONV_C2C.o $(LIB) $(NVCCFLAGS) 
25 | 
26 | CONV-32bit_cuFFT.o: timer.h utils_cuda.h
27 | 	$(NVCC) -c CONV-32bit_cuFFT.cu $(NVCCFLAGS) -dc -m64
28 | 
29 | CONV_C2C.o: CONV_C2C.cpp
30 | 	$(GCC) -c CONV_C2C.cpp $(GCC_OPTS)
31 | 
32 | clean:	
33 | 	rm -f *.o *.~ $(ANALYZE)
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/When_cuFFT_wins_cuFFT.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for convsize in 1024 2048 4096 8192 16384;
 4 | do
 5 | 	echo "#define CONV_SIZE $convsize" > params.h
 6 | 	
 7 | 	rm CONV.exe
 8 | 	make
 9 | 	for tempsize in {64..2048..32}
10 | 	do
11 | 		for templates in 32;
12 | 		do
13 | 			./CONV.exe r 2097152 $tempsize $templates 20 0
14 | 		done
15 | 	done
16 | done
17 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/benchmark_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs both benchmark suites. CONV.exe appends its results to CONV_cuFFT.dat,
# so the file is removed up front and renamed after each suite.

# -f: no error when there is no result file from an earlier run
rm -f CONV_cuFFT.dat

# only rename the results when the suite finished successfully
./benchmark_performance.sh && mv CONV_cuFFT.dat OLS_cuFFT_C2C_perf.dat

./When_cuFFT_wins_cuFFT.sh && mv CONV_cuFFT.dat OLS_cuFFT_C2C_whencuFFTwins.dat


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/benchmark_performance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for convsize in 1024 2048 4096 8192 16384;
 4 | do
 5 | 
 6 | echo "#define CONV_SIZE $convsize" > params.h		
 7 | rm CONV.exe
 8 | make
 9 | 	for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073;
10 | 	do	
11 | 		for templates in 2 4 8 11 16 32 51 64 96;
12 | 		do
13 | 			./CONV.exe r 262144 $tempsize $templates 20
14 | 			./CONV.exe r 524288 $tempsize $templates 20
15 | 			./CONV.exe r 1048576 $tempsize $templates 20
16 | 			./CONV.exe r 2097152 $tempsize $templates 20
17 | 			./CONV.exe r 4194304 $tempsize $templates 20
18 | 			./CONV.exe r 8388608 $tempsize $templates 20
19 | 		done
20 | 	done
21 | done
22 | 
23 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/conv_check.h:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <math.h>
  4 | #include <time.h>
  5 | #include <stdlib.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime.h>
  8 | #include <cuda_runtime_api.h>
  9 | 
 10 | double max_error = 1.0e-4;
 11 | 
 12 | 
 13 | void CPU_time_domain(float2 *h_input, float2 *h_CPU_output_timedomain, float2 *h_filters, int signal_length, int filter_length, int nFilters){
 14 | 	for(int f=0; f<nFilters; f++){
 15 | 		printf(".");  fflush(stdout);
 16 | 		for(int s=0; s<signal_length; s++){
 17 | 			float2 ac;
 18 | 			ac.x = 0; ac.y = 0;
 19 | 			for(int i=0; i<filter_length; i++){
 20 | 				int filter_pos = filter_length - 1 - i;
 21 | 				float2 fv, sv;
 22 | 				fv = h_filters[f*filter_length + filter_pos];
 23 | 				int signal_pos = (s + i - (filter_length>>1));
 24 | 				if(signal_pos>=0 && signal_pos<signal_length) sv = h_input[signal_pos];
 25 | 				else {sv.x = 0; sv.y = 0;}
 26 | 				ac.x = ac.x + sv.x*fv.x - sv.y*fv.y;
 27 | 				ac.y = ac.y + sv.x*fv.y + sv.y*fv.x;
 28 | 			}
 29 | 			h_CPU_output_timedomain[f*(signal_length + filter_length - 1) + s] = ac;
 30 | 		}
 31 | 	}
 32 | 	printf("\n");
 33 | }
 34 | 
 35 | 
 36 | float get_error(float2 A_f2, float2 B_f2){
 37 | 	float error, div_error=10000, per_error=10000, order=0;
 38 | 	int power;
 39 | 	float A = max(A_f2.x, A_f2.y);
 40 | 	float B = max(B_f2.x, B_f2.y);
 41 | 	if(A<0) A = -A;
 42 | 	if(B<0) B = -B;
 43 | 	
 44 | 	if (A>B) {
 45 | 		div_error = A-B;
 46 | 		if(B>10){
 47 | 			power = (int) log10(B);
 48 | 			order = pow(10,power);
 49 | 			div_error = div_error/order;
 50 | 		}
 51 | 	}
 52 | 	else {
 53 | 		div_error = B-A;
 54 | 		if(A>10){
 55 | 			power = (int) log10(A);
 56 | 			order = pow(10,power);
 57 | 			div_error = div_error/order;
 58 | 		}
 59 | 	}
 60 | 	
 61 | 	if(div_error<per_error) error = div_error;
 62 | 	else error = per_error;
 63 | 	return(error);
 64 | }
 65 | 
 66 | 
 67 | int Compare_data(float2 *CPU_result, float2 *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){
 68 | 	double total_error_l = 0, mean_error_l = 0;
 69 | 	size_t nErrors = 0;
 70 | 	int cislo = 0;
 71 | 	float error;
 72 | 	
 73 | 	for(int y=0; y<dim_y; y++){
 74 | 		for(int x=0; x<nSamples; x++){
 75 | 			int CPU_pos = y*CPU_dim_x + x + CPU_offset;
 76 | 			int GPU_pos = y*GPU_dim_x + x + GPU_offset;
 77 | 			float2 CPU, GPU;
 78 | 			CPU.x = CPU_result[CPU_pos].x/CPU_scale; CPU.y = CPU_result[CPU_pos].y/CPU_scale;
 79 | 			GPU.x = GPU_result[GPU_pos].x/GPU_scale; GPU.y = GPU_result[GPU_pos].y/GPU_scale;
 80 | 			
 81 | 			
 82 | 			error = get_error(CPU, GPU);
 83 | 			total_error_l = total_error_l + error;
 84 | 			if( error > max_error ){
 85 | 				nErrors++;
 86 | 				if(cislo<40){
 87 | 					printf("Error [%f] CPU [%f;%f] GPU [%f;%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU.x, CPU.y, GPU.x, GPU.y, x, y, (int) (x/useful_part_size), x%useful_part_size);
 88 | 					cislo++;
 89 | 				}
 90 | 			}
 91 | 		}
 92 | 	}
 93 | 	mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y));
 94 | 	(*total_error) = total_error_l;
 95 | 	(*mean_error) = mean_error_l;
 96 | 	return(nErrors);
 97 | }
 98 | 
 99 | 
100 | void Full_CONV_check(float2 *GPU_result, float2 *h_input, float2 *h_filters, int signal_length, int filter_length, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, double *cumulative_error, double *mean_error){
101 | 	size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters;
102 | 	float2 *h_CPU_output_timedomain;
103 | 	h_CPU_output_timedomain = (float2 *)malloc(output_size_timedomain*sizeof(float2));
104 | 	memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float2));
105 | 	
106 | 	printf("\n--> Time-domain convolution:");
107 | 	CPU_time_domain(h_input, h_CPU_output_timedomain, h_filters, signal_length, filter_length, nFilters);
108 | 		
109 | 	float GPU_scale, CPU_scale;
110 | 	int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples;
111 | 	
112 | 	printf("\n--> Comparison to CPU time-domain:\n");
113 | 	GPU_scale = conv_length;
114 | 	CPU_scale = 1.0;
115 | 	GPU_offset = 0;
116 | 	CPU_offset = 0;
117 | 	GPU_dim_x = nConvolutions*useful_part_size;
118 | 	CPU_dim_x = (signal_length + filter_length - 1);
119 | 	nSamples = signal_length - offset;	
120 | 	Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
121 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
122 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
123 | 	else printf("FAILED\n");
124 | 	
125 | 	free(h_CPU_output_timedomain);
126 | }
127 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/debug.h:
--------------------------------------------------------------------------------
1 | #define VERBOSE true
2 | #define DEBUG false
3 | #define CHECK false
4 | #define WRITE true
5 | 
6 | #define DEVICEID 0
7 | 
8 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/params.h:
--------------------------------------------------------------------------------
1 | #define CONV_SIZE 8192
2 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/results.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <iomanip>
 4 | #include <vector>
 5 | 
 6 | using namespace std;
 7 | 
 8 | class Performance_results{
 9 | public:
10 | 	double GPU_time;
11 | 	int nTimesamples;
12 | 	int template_length;
13 | 	int nTemplates;
14 | 	int nRuns;
15 | 	int reglim;
16 | 	int OaS_conv_size;
17 | 	int templates_per_block;
18 | 	char filename[200];
19 | 	char kernel[10];
20 | 	
21 | 	Performance_results() {
22 | 		GPU_time=0;
23 | 	}
24 | 	
25 | 	void Save(){
26 | 		ofstream FILEOUT;
27 | 		FILEOUT.open (filename, std::ofstream::out | std::ofstream::app);
28 | 		FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
29 | 		FILEOUT.close();
30 | 	}
31 | 	
32 | 	void Print(){
33 | 		cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
34 | 	}
35 | 	
36 | 	void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){
37 | 		nTimesamples        = t_nTimesamples;
38 | 		template_length     = t_template_length;
39 | 		nTemplates          = t_nTemplates;
40 | 		nRuns               = t_nRuns;
41 | 		reglim              = t_reglim;
42 | 		OaS_conv_size       = t_OaS_conv_size;
43 | 		templates_per_block = t_templates_per_block;
44 | 		sprintf(filename,"%s", t_filename);
45 | 		sprintf(kernel,"%s",t_kernel);
46 | 	}
47 | 	
48 | };
49 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks/utils_cuda.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/CONV_C2C.cpp:
--------------------------------------------------------------------------------
  1 | //********************************************************************************************
  2 | //* This is a GPU implementation of the overlap-and-save method for calculating convolution.
  3 | //* Copyright (C) 2019  Adámek Karel
  4 | //* 
  5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 
  6 | //********************************************************************************************
  7 | 
  8 | 
  9 | #include "debug.h"
 10 | #include "params.h"
 11 | #include "results.h"
 12 | 
 13 | #include <stdio.h>
 14 | #include <string.h>
 15 | #include <math.h>
 16 | #include <time.h>
 17 | #include <stdlib.h>
 18 | #include <cuda.h>
 19 | #include <cuda_runtime.h>
 20 | #include <cuda_runtime_api.h>
 21 | 
 22 | #include "conv_check.h"
 23 | 
 24 | void Generate_signal(float2 *h_input, int nTimesamples){
 25 | 	for(int f=0; f<nTimesamples; f++){
 26 | 		h_input[f].x=rand() / (float)RAND_MAX;
 27 | 		h_input[f].y=rand() / (float)RAND_MAX;
 28 | 	}
 29 | }
 30 | 
 31 | void Generate_templates(float2 *h_filters, int nSamples, int nFilters){
 32 | 	for(int t=0; t<nFilters; t++){
 33 | 		for(int f=0; f<nSamples; f++){
 34 | 			h_filters[t*nSamples + f].x=rand() / (float)RAND_MAX;
 35 | 			h_filters[t*nSamples + f].y=rand() / (float)RAND_MAX;
 36 | 		}
 37 | 	}
 38 | }
 39 | 
 40 | void Pad_templates(float2 *h_filters_time, float2 *h_filters, int template_size, int convolution_size, int nFilters){
 41 | 	for(int f=0; f<nFilters*convolution_size; f++){
 42 | 		h_filters[f].x=0;
 43 | 		h_filters[f].y=0;
 44 | 	}
 45 | 	
 46 | 	for(int t=0; t<nFilters; t++){
 47 | 		for(int f=0; f<template_size; f++){
 48 | 			// padding for centered filter
 49 | 			if(f>=template_size/2) {
 50 | 				h_filters[t*convolution_size + f - template_size/2].x=h_filters_time[t*template_size + f].x;
 51 | 				h_filters[t*convolution_size + f - template_size/2].y=h_filters_time[t*template_size + f].y;
 52 | 			}
 53 | 			else if(f<template_size/2) {
 54 | 				h_filters[t*convolution_size + f + convolution_size - template_size/2].x = h_filters_time[t*template_size + f].x;
 55 | 				h_filters[t*convolution_size + f + convolution_size - template_size/2].y = h_filters_time[t*template_size + f].y;
 56 | 			}
 57 | 		}
 58 | 	}
 59 | }
 60 | 
 61 | 
 62 | int Write_output(float2 *h_output, int nTimesamples, int nFilters, char *output_signal_file){
 63 | 	int error=0;
 64 | 	ofstream FILEOUT;
 65 | 	FILEOUT.open(output_signal_file);
 66 | 	if (!FILEOUT.fail()){
 67 | 		if (VERBOSE) printf("Writing output\n");
 68 | 		for(int f=0; f<nFilters; f++){
 69 | 			if (VERBOSE) printf("[");
 70 | 			for(int Ts=0;Ts<nTimesamples;Ts++){
 71 | 				if(Ts%100000==0) {
 72 | 					if (VERBOSE) {
 73 | 						printf(".");
 74 | 						fflush(stdout);
 75 | 					}
 76 | 				}
 77 | 				FILEOUT << f << " " << Ts << " " << h_output[f*nTimesamples+Ts].x << " " << h_output[f*nTimesamples+Ts].y << endl;
 78 | 			}
 79 | 			FILEOUT << endl;
 80 | 			if (VERBOSE) printf("] filter=%d\n",f);
 81 | 		}
 82 | 	}
 83 | 	else {
 84 | 		cout << "Write to a file failed!" << endl;
 85 | 		error++;
 86 | 	}
 87 | 	FILEOUT.close();
 88 | 	return(error);
 89 | }
 90 | 
 91 | 
 92 | long int File_size_row_signal(ifstream &FILEIN){
 93 | 	std::size_t count=0;
 94 | 	FILEIN.seekg(0,ios::beg);
 95 | 	for(std::string line; std::getline(FILEIN, line); ++count){}
 96 | 	return((long int)count);
 97 | }
 98 | 
 99 | 
100 | int Load_signal(char *filename, int *nSamples, float2 **data){
101 | 	float real, imaginary;
102 | 	int file_size, cislo, error;
103 | 	error=0;
104 | 
105 | 	ifstream FILEIN;
106 | 	FILEIN.open(filename,ios::in);
107 | 	if (!FILEIN.fail()){
108 | 		error=0;
109 | 		file_size=File_size_row_signal(FILEIN);
110 | 		(*nSamples) = file_size;
111 | 		printf("nSamples:%d;\n", (*nSamples) );
112 | 
113 | 		if(file_size>0){
114 | 			*data = (float2*)malloc(file_size*sizeof(float2));
115 | 			memset( (*data), 0.0, file_size*sizeof(float2));
116 | 			if(*data==NULL){
117 | 				printf("\nAllocation error!\n");
118 | 				error++;
119 | 			}
120 | 		
121 | 			FILEIN.clear();
122 | 			FILEIN.seekg(0,ios::beg);
123 | 			
124 | 			for (cislo = 0; cislo < file_size; cislo++) {
125 | 				FILEIN >> real >> imaginary;
126 | 				(*data)[cislo].x = real;
127 | 				(*data)[cislo].y = imaginary;
128 | 			}
129 | 		}
130 | 		else {
131 | 			printf("\nFile is void of any content!\n");
132 | 			error++;
133 | 		}
134 | 	}
135 | 	else {
136 | 		cout << "File not found -> " << filename << " <-" << endl;
137 | 		error++;
138 | 	}
139 | 	FILEIN.close();
140 | 	return(error);
141 | }
142 | 
143 | 
144 | int Load_filters(char *filename, int *nFilters, int *filter_length, float2 **data){
145 | 	float real, imaginary;
146 | 	int file_size, cislo, error, filter_size;
147 | 	error=0;
148 | 
149 | 	ifstream FILEIN;
150 | 	FILEIN.open(filename,ios::in);
151 | 	if (!FILEIN.fail()){
152 | 		error=0;
153 | 		file_size = File_size_row_signal(FILEIN);
154 | 		(*filter_length) = file_size/(*nFilters);
155 | 		filter_size = (*nFilters)*(*filter_length);
156 | 		printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size);
157 | 
158 | 		if(file_size>0){
159 | 			*data = (float2*)malloc( filter_size*sizeof(float2));
160 | 			memset( (*data), 0.0, filter_size*sizeof(float2));
161 | 			
162 | 			if(*data==NULL){
163 | 				printf("\nAllocation error!\n");
164 | 				error++;
165 | 			}
166 | 		
167 | 			FILEIN.clear();
168 | 			FILEIN.seekg(0,ios::beg);
169 | 
170 | 			for (cislo=0; cislo < filter_size; cislo++) {
171 | 				FILEIN >> real >> imaginary;
172 | 				(*data)[cislo].x = real;
173 | 				(*data)[cislo].y = imaginary;
174 | 			}
175 | 		}
176 | 		else {
177 | 			printf("\nFile is void of any content!\n");
178 | 			error++;
179 | 		}
180 | 	}
181 | 	else {
182 | 		cout << "File not found -> " << filename << " <-" << endl;
183 | 		error++;
184 | 	}
185 | 	FILEIN.close();
186 | 	return(error);
187 | }
188 | 
189 | 
190 | int GPU_CONV(float2 *h_input, float2 *h_output, float2 *h_filters, int signal_length, int filter_length, int nFilters, float h, int nRuns, double *execution_time);
191 | 
192 | 
193 | int main(int argc, char* argv[]) {
194 | 	int nTimesamples;		// input signal length
195 | 	int filter_length;		// filter length
196 | 	int nFilters;			// number of filters
197 | 	int nRuns;
198 | 	char input_type='0';
199 | 	char input_filter_file[255];
200 | 	char input_signal_file[255];
201 | 	char output_signal_file[255];
202 | 	
203 | 	char * pEnd;
204 | 	if (argc>2) {
205 | 		if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);}
206 | 		input_type=*argv[1];
207 | 	}
208 | 	if (input_type == 'f' && argc==6) {
209 | 		if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);}
210 | 		sprintf(input_signal_file,"%s",argv[2]);
211 | 		if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);}
212 | 		sprintf(input_filter_file,"%s",argv[3]);
213 | 		if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);}
214 | 		sprintf(output_signal_file,"%s",argv[4]);
215 | 		nFilters = strtol(argv[5],&pEnd,10);
216 | 		nRuns = 1;
217 | 	}
218 | 	else if (input_type == 'r' && argc==6) {	
219 | 		nTimesamples  = strtol(argv[2],&pEnd,10);
220 | 		filter_length = strtol(argv[3],&pEnd,10);
221 | 		nFilters      = strtol(argv[4],&pEnd,10);
222 | 		nRuns         = strtol(argv[5],&pEnd,10);
223 | 	}
224 | 	else {
225 | 		printf("Parameters error!\n");
226 | 		printf(" 1) Input type: 'r' or 'f' \n");
227 | 		printf("----------------------------------\n");
228 | 		printf("'f' - file input provided by user\n");
229 | 		printf(" 2) Input signal file\n");
230 | 		printf(" 3) Input filter file\n");
231 | 		printf(" 4) Output signal file\n");
232 | 		printf(" 5) number of filters\n");
233 | 		printf(" Example: CONV.exe f signal.dat filter.dat output.dat 32\n");
234 | 		printf("----------------------------------\n");
235 | 		printf(" 'r' - random input generated by the code\n");
236 | 		printf(" 2) Signal length in number of time samples\n");
237 | 		printf(" 3) Filter length in samples\n");
238 | 		printf(" 4) Number of templates\n");
239 | 		printf(" 5) number of GPU kernel runs\n");
240 | 		printf(" Example: CONV.exe r 2097152 193 32 10\n");
241 |         return 1;
242 | 	}
243 | 	
244 | 	if (DEBUG) {
245 | 		printf("Parameters:\n");
246 | 		printf("Input signal and templates are ");
247 | 		if (input_type == 'r') {
248 | 			printf("randomly generated.\n");
249 | 			printf("Signal length:     %d samples\n", nTimesamples);
250 | 			printf("Filter length:     %d samples\n", filter_length);
251 | 			printf("Number of filters: %d\n", nFilters);
252 | 			printf("nRuns:             %d\n", nRuns);
253 | 		}
254 | 		if (input_type == 'f') {
255 | 			printf("read from file.\n");
256 | 			printf("Input signal:  %s\n", input_signal_file);
257 | 			printf("Input filter:  %s\n", input_filter_file);
258 | 			printf("Output signal: %s\n", output_signal_file);
259 | 			printf("nFilters:      %d\n", nFilters);
260 | 			printf("nRuns:         %d\n", nRuns);
261 | 			printf("-----------------\n");
262 | 		}
263 | 	}
264 | 
265 | 	float2 *h_input;			// input signal
266 | 	float2 *h_output;			// output plane
267 | 	float2 *h_filters_padded;	// filters in time-domain padded with zeroes
268 | 	float2 *h_filters;		    // filters in time-domain
269 | 	
270 | 	if (input_type == 'f') {
271 | 		int error=0;
272 | 		error += Load_signal(input_signal_file, &nTimesamples, &h_input);
273 | 		error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters);
274 | 		if( error>0 ){exit(1);}
275 | 		else if (VERBOSE) printf("File loaded\n");
276 | 	}
277 | 	
278 | 	if (input_type == 'r') {
279 | 		h_input          = (float2 *)malloc(nTimesamples*sizeof(float2));
280 | 		h_filters	     = (float2 *)malloc(filter_length*nFilters*sizeof(float2));
281 | 		srand(time(NULL));
282 | 		Generate_signal(h_input, nTimesamples);
283 | 		Generate_templates(h_filters, filter_length, nFilters);
284 | 		if (VERBOSE) printf("Signal and filters generated\n");
285 | 	}
286 | 	
287 | 	size_t filter_size_padded = nFilters*CONV_SIZE;
288 | 	h_filters_padded = (float2*)malloc(filter_size_padded*sizeof(float2));
289 | 	Pad_templates(h_filters, h_filters_padded, filter_length, CONV_SIZE, nFilters);
290 | 	
291 | 	//----------------> Results
292 | 	double execution_time = 0;
293 | 	Performance_results CONV_cuFFT;
294 | 	CONV_cuFFT.Assign(nTimesamples, filter_length, nFilters, nRuns, 0, CONV_SIZE, nFilters, "CONV_cuFFT.dat", "cuFFT");
295 | 	
296 | 	int offset           = filter_length/2; // we assume that filter is centered around zero
297 | 	int useful_part_size = CONV_SIZE - filter_length + 1;
298 | 	int nConvolutions    = (nTimesamples + useful_part_size - 1)/useful_part_size;
299 | 
300 | 	if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);}
301 | 	if(DEBUG) {
302 | 		printf("offset=%d; useful_part_size=%d; nConvolutions=%d;\n", offset, useful_part_size, nConvolutions);
303 | 	}
304 | 	
305 | 	size_t output_size = nFilters*useful_part_size*nConvolutions;
306 | 	h_output = (float2*)malloc(output_size*sizeof(float2));
307 | 	
308 | 	if (VERBOSE) printf("Convolution - cuFFT\n");
309 | 
310 | 	//----------------> GPU kernel
311 | 	float h = 20.0;
312 | 	int GPU_error = GPU_CONV(h_input, h_output, h_filters_padded, nTimesamples, filter_length, nFilters, h, nRuns, &execution_time);
313 | 	CONV_cuFFT.GPU_time = execution_time;
314 | 	if(VERBOSE) printf("     Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time);
315 | 	if(VERBOSE) {cout << "     All parameters: "; CONV_cuFFT.Print();}
316 | 	if(WRITE && GPU_error==0) CONV_cuFFT.Save();
317 | 	//----------------> GPU kernel
318 | 	
319 | 	if(CHECK){
320 | 		double total_error, mean_error;
321 | 		printf("Checking results...\n");
322 | 		Full_CONV_check(h_output, h_input, h_filters, nTimesamples, filter_length, useful_part_size, (filter_length>>1), CONV_SIZE, nConvolutions, nFilters, h, &total_error, &mean_error);
323 | 		//printf("Total error: %e; Mean error: %e\n", total_error, mean_error);
324 | 	}
325 | 	
326 | 	if (input_type == 'f') {
327 | 		Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file);
328 | 	}
329 | 	
330 | 	free(h_input);
331 | 	free(h_output);
332 | 	free(h_filters_padded);
333 | 	free(h_filters);
334 | 
335 | 	cudaDeviceReset();
336 | 
337 | 	if (VERBOSE) printf("Finished!\n");
338 | 
339 | 	return (0);
340 | }
341 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/Makefile:
--------------------------------------------------------------------------------
 1 | ###############################################################
 2 | # CUDA_HOME are supposed to be on default position
 3 | # and set it in your PATH .bashrc
 4 | ###############################################################
 5 | INC := -I${CUDA_HOME}/include
 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft_static -lculibos -lcuda
 7 | 
 8 | GCC = g++
 9 | NVCC = ${CUDA_HOME}/bin/nvcc
10 | 
11 | NVCCFLAGS = -O3  -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo
12 | 
13 | GCC_OPTS =-O3 -Wall -Wextra $(INC)
14 | 
15 | ANALYZE = CONV.exe
16 | 
17 | ifdef reglim
18 | NVCCFLAGS += --maxrregcount=$(reglim)
19 | endif
20 | 
21 | all: clean analyze
22 | 
23 | analyze: CONV_C2C.o CONV-32bit_cuFFT.o Makefile
24 | 	$(NVCC) -o $(ANALYZE) CONV-32bit_cuFFT.o CONV_C2C.o $(LIB) $(NVCCFLAGS) 
25 | 
26 | CONV-32bit_cuFFT.o: timer.h utils_cuda.h
27 | 	$(NVCC) -c CONV-32bit_cuFFT.cu $(NVCCFLAGS) -dc -m64
28 | 
29 | CONV_C2C.o: CONV_C2C.cpp
30 | 	$(GCC) -c CONV_C2C.cpp $(GCC_OPTS)
31 | 
32 | clean:	
33 | 	rm -f *.o *.~ $(ANALYZE)
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/When_cuFFT_wins_cuFFT.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for convsize in 1024 2048 4096 8192 16384;
 4 | do
 5 | 	echo "#define CONV_SIZE $convsize" > params.h
 6 | 	
 7 | 	rm CONV.exe
 8 | 	make
 9 | 	for tempsize in {64..4096..32}
10 | 	do
11 | 		for templates in 32;
12 | 		do
13 | 			./CONV.exe r 2097152 $tempsize $templates 20 0
14 | 		done
15 | 	done
16 | done
17 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/benchmark_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs both benchmark sweeps and stores each result set under its own name.
# CONV.exe appends its timings to CONV_cuFFT.dat, so the file is removed first
# and moved away after each sweep.

# -f: a missing file from a previous run is not an error
rm -f CONV_cuFFT.dat;

./benchmark_performance.sh;
mv CONV_cuFFT.dat OLS_cuFFT_callbacks_pp_perf.dat;

./When_cuFFT_wins_cuFFT.sh
mv CONV_cuFFT.dat OLS_cuFFT_callbacks_pp_whencuFFTwins.dat;


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/benchmark_performance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for convsize in 1024 2048 4096 8192 16384;
 4 | do
 5 | 
 6 | echo "#define CONV_SIZE $convsize" > params.h		
 7 | rm CONV.exe
 8 | make
 9 | 	for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073;
10 | 	do	
11 | 		for templates in 2 4 8 11 16 32 51 64 96;
12 | 		do
13 | 			./CONV.exe r 262144 $tempsize $templates 20
14 | 			./CONV.exe r 524288 $tempsize $templates 20
15 | 			./CONV.exe r 1048576 $tempsize $templates 20
16 | 			./CONV.exe r 2097152 $tempsize $templates 20
17 | 			./CONV.exe r 4194304 $tempsize $templates 20
18 | 			./CONV.exe r 8388608 $tempsize $templates 20
19 | 		done
20 | 	done
21 | done
22 | 
23 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/conv_check.h:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <math.h>
  4 | #include <time.h>
  5 | #include <stdlib.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime.h>
  8 | #include <cuda_runtime_api.h>
  9 | 
 10 | double max_error = 1.0e-4;
 11 | 
 12 | void CPU_time_domain(float2 *h_input, float2 *h_CPU_output_timedomain, float2 *h_filters, int signal_length, int filter_length, int nFilters){
 13 | 	for(int f=0; f<nFilters; f++){
 14 | 		printf(".");  fflush(stdout);
 15 | 		for(int s=0; s<signal_length; s++){
 16 | 			float2 ac;
 17 | 			ac.x = 0; ac.y = 0;
 18 | 			for(int i=0; i<filter_length; i++){
 19 | 				int filter_pos = filter_length - 1 - i;
 20 | 				float2 fv, sv;
 21 | 				fv = h_filters[f*filter_length + filter_pos];
 22 | 				int signal_pos = (s + i - (filter_length>>1));
 23 | 				if(signal_pos>=0 && signal_pos<signal_length) sv = h_input[signal_pos];
 24 | 				else {sv.x = 0; sv.y = 0;}
 25 | 				ac.x = ac.x + sv.x*fv.x - sv.y*fv.y;
 26 | 				ac.y = ac.y + sv.x*fv.y + sv.y*fv.x;
 27 | 			}
 28 | 			h_CPU_output_timedomain[f*(signal_length + filter_length - 1) + s] = ac;
 29 | 		}
 30 | 	}
 31 | 	printf("\n");
 32 | }
 33 | 
 34 | void CPU_postprocess(float2 *h_CPU_postprocessed, float2 *h_CPU_output_reduced, int nTimesamples, int nFilters, float h){
 35 | 	float2 left, right, result;
 36 | 	
 37 | 	for(int f=0; f<nFilters; f++){
 38 | 		for(int s=0; s<nTimesamples-1; s++){
 39 | 			int pos = f*nTimesamples + s;
 40 | 			if( s==0 ) {
 41 | 				left = h_CPU_output_reduced[pos];
 42 | 			}
 43 | 			else {
 44 | 				left = h_CPU_output_reduced[pos-1];
 45 | 			}
 46 | 			
 47 | 			if( s>=(nTimesamples-1) ) {
 48 | 				right = h_CPU_output_reduced[f*nTimesamples + nTimesamples - 1];
 49 | 			}
 50 | 			else {
 51 | 				right = h_CPU_output_reduced[pos+1];
 52 | 			}
 53 | 			
 54 | 			result.x = (left.x - right.x)/(2.0*h);
 55 | 			result.y = (left.y - right.y)/(2.0*h);
 56 | 			h_CPU_postprocessed[pos] = result;
 57 | 		}
 58 | 	}
 59 | }
 60 | 
 61 | 
 62 | float get_error(float2 A_f2, float2 B_f2){
 63 | 	float error, div_error=10000, per_error=10000, order=0;
 64 | 	int power;
 65 | 	float A = max(A_f2.x, A_f2.y);
 66 | 	float B = max(B_f2.x, B_f2.y);
 67 | 	if(A<0) A = -A;
 68 | 	if(B<0) B = -B;
 69 | 	
 70 | 	if (A>B) {
 71 | 		div_error = A-B;
 72 | 		if(B>10){
 73 | 			power = (int) log10(B);
 74 | 			order = pow(10,power);
 75 | 			div_error = div_error/order;
 76 | 		}
 77 | 	}
 78 | 	else {
 79 | 		div_error = B-A;
 80 | 		if(A>10){
 81 | 			power = (int) log10(A);
 82 | 			order = pow(10,power);
 83 | 			div_error = div_error/order;
 84 | 		}
 85 | 	}
 86 | 	
 87 | 	if(div_error<per_error) error = div_error;
 88 | 	else error = per_error;
 89 | 	return(error);
 90 | }
 91 | 
 92 | int Compare_data(float2 *CPU_result, float2 *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){
 93 | 	double total_error_l = 0, mean_error_l = 0;
 94 | 	size_t nErrors = 0;
 95 | 	int cislo = 0;
 96 | 	float error;
 97 | 	
 98 | 	for(int y=0; y<dim_y; y++){
 99 | 		for(int x=0; x<nSamples; x++){
100 | 			int CPU_pos = y*CPU_dim_x + x + CPU_offset;
101 | 			int GPU_pos = y*GPU_dim_x + x + GPU_offset;
102 | 			if((x + CPU_offset)<CPU_dim_x && (x + GPU_offset)<GPU_dim_x){
103 | 				float2 CPU, GPU;
104 | 				CPU.x = CPU_result[CPU_pos].x/CPU_scale; CPU.y = CPU_result[CPU_pos].y/CPU_scale;
105 | 				GPU.x = GPU_result[GPU_pos].x/GPU_scale; GPU.y = GPU_result[GPU_pos].y/GPU_scale;
106 | 				
107 | 				
108 | 				error = get_error(CPU, GPU);
109 | 				total_error_l = total_error_l + error;
110 | 				if( error > max_error ){
111 | 					nErrors++;
112 | 					if(cislo<40){
113 | 						printf("Error [%f] CPU [%f;%f] GPU [%f;%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU.x, CPU.y, GPU.x, GPU.y, x, y, (int) ((x + GPU_offset)/useful_part_size), (x + GPU_offset)%useful_part_size);
114 | 						cislo++;
115 | 					}
116 | 				}
117 | 			}
118 | 		}
119 | 	}
120 | 	mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y));
121 | 	(*total_error) = total_error_l;
122 | 	(*mean_error) = mean_error_l;
123 | 	return(nErrors);
124 | }
125 | 
126 | 
127 | void Full_CONV_check(float2 *GPU_result, float2 *h_input, float2 *h_filters, int signal_length, int filter_length, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, float h, double *cumulative_error, double *mean_error){
128 | 	size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters;
129 | 	float2 *h_CPU_output_timedomain;
130 | 	float2 *h_CPU_postprocessed;
131 | 	h_CPU_output_timedomain = (float2 *)malloc(output_size_timedomain*sizeof(float2));
132 | 	h_CPU_postprocessed     = (float2 *)malloc(output_size_timedomain*sizeof(float2));
133 | 	memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float2));
134 | 	memset(h_CPU_postprocessed, 0.0, output_size_timedomain*sizeof(float2));
135 | 	
136 | 	printf("\n--> Time-domain convolution:");
137 | 	CPU_time_domain(h_input, h_CPU_output_timedomain, h_filters, signal_length, filter_length, nFilters);
138 | 	
139 | 	printf("\n--> Post-processing:\n");
140 | 	CPU_postprocess(h_CPU_postprocessed, h_CPU_output_timedomain, (signal_length + filter_length - 1), nFilters, h);
141 | 		
142 | 	float GPU_scale, CPU_scale;
143 | 	int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples;
144 | 
145 | 	#ifdef POST_PROCESS
146 | 	
147 | 	printf("\n--> Comparison to CPU time-domain with post-processing:\n");
148 | 	GPU_scale = conv_length;
149 | 	CPU_scale = 1.0;
150 | 	GPU_offset = 0;
151 | 	CPU_offset = 0;
152 | 	GPU_dim_x = nConvolutions*useful_part_size;
153 | 	CPU_dim_x = (signal_length + filter_length - 1);
154 | 	nSamples = signal_length - offset;	
155 | 	Compare_data(h_CPU_postprocessed, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
156 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
157 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
158 | 	else printf("FAILED\n");
159 | 	
160 | 	#else
161 | 	
162 | 	printf("\n--> Comparison to CPU time-domain:\n");
163 | 	GPU_scale = conv_length;
164 | 	CPU_scale = 1.0;
165 | 	GPU_offset = 0;
166 | 	CPU_offset = 0;
167 | 	GPU_dim_x = nConvolutions*useful_part_size;
168 | 	CPU_dim_x = (signal_length + filter_length - 1);
169 | 	nSamples = signal_length-offset;
170 | 	Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
171 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
172 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
173 | 	else printf("FAILED\n");
174 | 	
175 | 	#endif
176 | 	
177 | 	free(h_CPU_output_timedomain);
178 | 	free(h_CPU_postprocessed);
179 | }
180 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/debug.h:
--------------------------------------------------------------------------------
// Compile-time switches for the host code.

// Progress and timing messages on stdout.
#define VERBOSE true
// Extra parameter dump (arguments, offset, useful_part_size, nConvolutions).
#define DEBUG false
// Run the CPU reference check (Full_CONV_check) after the GPU run.
#define CHECK false
// Append the timing record to the results file when the GPU run reports no error.
#define WRITE true

// NOTE(review): presumably the CUDA device index; its use is not visible here - confirm.
#define DEVICEID 0
// When defined, Full_CONV_check compares against the post-processed CPU reference.
#define POST_PROCESS
8 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/params.h:
--------------------------------------------------------------------------------
// Length of one convolution segment; main() derives the useful output per
// segment as CONV_SIZE - filter_length + 1. The benchmark scripts overwrite
// this file with a new value before every build.
#define CONV_SIZE 8192
2 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/results.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <iomanip>
 4 | #include <vector>
 5 | 
 6 | using namespace std;
 7 | 
 8 | class Performance_results{
 9 | public:
10 | 	double GPU_time;
11 | 	int nTimesamples;
12 | 	int template_length;
13 | 	int nTemplates;
14 | 	int nRuns;
15 | 	int reglim;
16 | 	int OaS_conv_size;
17 | 	int templates_per_block;
18 | 	char filename[200];
19 | 	char kernel[10];
20 | 	
21 | 	Performance_results() {
22 | 		GPU_time=0;
23 | 	}
24 | 	
25 | 	void Save(){
26 | 		ofstream FILEOUT;
27 | 		FILEOUT.open (filename, std::ofstream::out | std::ofstream::app);
28 | 		FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
29 | 		FILEOUT.close();
30 | 	}
31 | 	
32 | 	void Print(){
33 | 		cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
34 | 	}
35 | 	
36 | 	void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){
37 | 		nTimesamples        = t_nTimesamples;
38 | 		template_length     = t_template_length;
39 | 		nTemplates          = t_nTemplates;
40 | 		nRuns               = t_nRuns;
41 | 		reglim              = t_reglim;
42 | 		OaS_conv_size       = t_OaS_conv_size;
43 | 		templates_per_block = t_templates_per_block;
44 | 		sprintf(filename,"%s", t_filename);
45 | 		sprintf(kernel,"%s",t_kernel);
46 | 	}
47 | 	
48 | };
49 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_cuFFT_callbacks_pp/utils_cuda.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/CONV_SM_OLS_C2C.cpp:
--------------------------------------------------------------------------------
  1 | //********************************************************************************************
  2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 
  3 | //* Copyright (C) 2019  Adámek Karel
  4 | //* 
  5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 
  6 | //********************************************************************************************
  7 | 
  8 | #include "debug.h"
  9 | #include "params.h"
 10 | #include "results.h"
 11 | 
 12 | #include <stdio.h>
 13 | #include <string.h>
 14 | #include <math.h>
 15 | #include <time.h>
 16 | #include <stdlib.h>
 17 | #include <cuda.h>
 18 | #include <cuda_runtime.h>
 19 | #include <cuda_runtime_api.h>
 20 | 
 21 | 
 22 | #include "conv_check.h"
 23 | 
 24 | 
 25 | void Generate_random_signal(float2 *h_input, int signal_length){
 26 | 	for(int f=0; f<signal_length; f++){
 27 | 		h_input[f].y=rand() / (float)RAND_MAX;
 28 | 		h_input[f].x=rand() / (float)RAND_MAX;
 29 | 	}
 30 | }
 31 | 
 32 | void Generate_signal(float2 *h_input, int nTimesamples){
 33 | 	for(int f=0; f<nTimesamples; f++){
 34 | 		h_input[f].y=rand() / (float)RAND_MAX;
 35 | 		h_input[f].x=rand() / (float)RAND_MAX;
 36 | 	}
 37 | 	
 38 | 	if(nTimesamples>15000){
 39 | 		for(int f=15000; f<nTimesamples; f++){
 40 | 			h_input[f].x = f%4096;
 41 | 		}
 42 | 		
 43 | 		for(int f=0; f<192; f++){
 44 | 			h_input[f + 5300].x = 10.0;
 45 | 		}
 46 | 		
 47 | 		for(int f=0; f<128; f++){
 48 | 			h_input[f + 8626].x = 10.0;
 49 | 		}
 50 | 		
 51 | 		for(int f=0; f<36; f++){
 52 | 			h_input[f + 9626].x = 10.0;
 53 | 		}
 54 | 		
 55 | 		for(int f=0; f<83; f++){
 56 | 			h_input[f + 10626].x = 10.0;
 57 | 		}
 58 | 		
 59 | 		for(int f=0; f<138; f++){
 60 | 			h_input[f + 11626].x = 10.0;
 61 | 		}
 62 | 	}
 63 | }
 64 | 
 65 | void Generate_random_filter(float2 *h_filters, int nSamples, int nFilters){
 66 | 	for(int t=0; t<nFilters; t++){
 67 | 		for(int f=0; f<nSamples; f++){
 68 | 			h_filters[t*nSamples + f].y=rand() / (float)RAND_MAX;
 69 | 			h_filters[t*nSamples + f].x=rand() / (float)RAND_MAX;
 70 | 		}
 71 | 	}
 72 | }
 73 | 
 74 | void Generate_boxcar_filter(float2 *h_filters, int nSamples, int nFilters){
 75 | 	int boxcar_width;
 76 | 	for(int t=0; t<nFilters; t++){
 77 | 		boxcar_width = ((t+1)*8);
 78 | 		for(int f=0; f<nSamples; f++){
 79 | 			if( f>=(nSamples/2-boxcar_width/2) && f<( nSamples/2+boxcar_width/2) ){
 80 | 				h_filters[t*nSamples + f].x=1;
 81 | 				h_filters[t*nSamples + f].y=0;
 82 | 			}
 83 | 			else {
 84 | 				h_filters[t*nSamples + f].x=0;
 85 | 				h_filters[t*nSamples + f].y=0;
 86 | 			}
 87 | 		}
 88 | 	}
 89 | }
 90 | 
 91 | void Pad_templates(float2 *h_filters_time, float2 *h_filters_padded, int filter_length, int corrected_filter_length, int convolution_size, int nFilters){
 92 | 	float2 *tmp_filter;
 93 | 	tmp_filter = new float2[corrected_filter_length];
 94 | 	
 95 | 	for(int f=0; f<nFilters*convolution_size; f++){
 96 | 		h_filters_padded[f].x=0;
 97 | 		h_filters_padded[f].y=0;
 98 | 	}
 99 | 	
100 | 	for(int t=0; t<nFilters; t++){
101 | 		// copy to temporary filter
102 | 		if(filter_length!=corrected_filter_length) {
103 | 			tmp_filter[0].x = 0;
104 | 			tmp_filter[0].y = 0;
105 | 			for(int f=0; f<filter_length; f++){
106 | 				tmp_filter[f + 1].x = h_filters_time[t*filter_length + f].x;
107 | 				tmp_filter[f + 1].y = h_filters_time[t*filter_length + f].y;
108 | 			}
109 | 		}
110 | 		else {
111 | 			for(int f=0; f<filter_length; f++){
112 | 				tmp_filter[f].x = h_filters_time[t*filter_length + f].x;
113 | 				tmp_filter[f].y = h_filters_time[t*filter_length + f].y;
114 | 			}
115 | 		}
116 | 		
117 | 		for(int f=0; f<corrected_filter_length; f++){
118 | 			// padding for centered filter
119 | 			if(f>=corrected_filter_length/2) {
120 | 				h_filters_padded[t*convolution_size + f - corrected_filter_length/2].x=tmp_filter[f].x;
121 | 				h_filters_padded[t*convolution_size + f - corrected_filter_length/2].y=tmp_filter[f].y;
122 | 			}
123 | 			else if(f<corrected_filter_length/2) {
124 | 				h_filters_padded[t*convolution_size + f + convolution_size - corrected_filter_length/2].x = tmp_filter[f].x;
125 | 				h_filters_padded[t*convolution_size + f + convolution_size - corrected_filter_length/2].y = tmp_filter[f].y;
126 | 			}
127 | 		}
128 | 	}
129 | 	
130 | 	delete [] tmp_filter;
131 | }
132 | 
133 | 
134 | int Write_output(float2 *h_output, int signal_length, int nFilters, char *output_signal_file){
135 | 	int error=0;
136 | 	ofstream FILEOUT;
137 | 	FILEOUT.open(output_signal_file);
138 | 	if (!FILEOUT.fail()){
139 | 		if (VERBOSE) printf("Writing output\n");
140 | 		for(int f=0; f<nFilters; f++){
141 | 			if (VERBOSE) printf("[");
142 | 			for(int Ts=0;Ts<signal_length;Ts++){
143 | 				if(Ts%100000==0) {
144 | 					if (VERBOSE) {
145 | 						printf(".");
146 | 						fflush(stdout);
147 | 					}
148 | 				}
149 | 				FILEOUT << f << " " << Ts << " " << h_output[f*signal_length+Ts].x << " " << h_output[f*signal_length+Ts].y << endl;
150 | 			}
151 | 			FILEOUT << endl;
152 | 			if (VERBOSE) printf("] filter=%d\n",f);
153 | 		}
154 | 	}
155 | 	else {
156 | 		cout << "Write to a file failed!" << endl;
157 | 		error++;
158 | 	}
159 | 	FILEOUT.close();
160 | 	return(error);
161 | }
162 | 
163 | 
// Rewinds FILEIN to its beginning and returns the number of lines that
// std::getline manages to read. The stream state flags are not reset here,
// so the caller has to clear() the stream before reading from it again.
long int File_size_row_signal(ifstream &FILEIN){
	FILEIN.seekg(0, ios::beg);
	std::size_t nRows = 0;
	std::string row;
	while(std::getline(FILEIN, row)){
		nRows++;
	}
	return static_cast<long int>(nRows);
}
170 | 
171 | 
172 | int Load_signal(char *filename, int *nSamples, float2 **data){
173 | 	float real, imaginary;
174 | 	int file_size, cislo, error;
175 | 	error=0;
176 | 
177 | 	ifstream FILEIN;
178 | 	FILEIN.open(filename,ios::in);
179 | 	if (!FILEIN.fail()){
180 | 		error=0;
181 | 		file_size=File_size_row_signal(FILEIN);
182 | 		(*nSamples) = file_size;
183 | 		printf("nSamples:%d;\n", (*nSamples) );
184 | 
185 | 		if(file_size>0){
186 | 			*data = (float2*)malloc(file_size*sizeof(float2));
187 | 			memset( (*data), 0.0, file_size*sizeof(float2));
188 | 			if(*data==NULL){
189 | 				printf("\nAllocation error!\n");
190 | 				error++;
191 | 			}
192 | 		
193 | 			FILEIN.clear();
194 | 			FILEIN.seekg(0,ios::beg);
195 | 			
196 | 			for (cislo = 0; cislo < file_size; cislo++) {
197 | 				FILEIN >> real >> imaginary;
198 | 				(*data)[cislo].x = real;
199 | 				(*data)[cislo].y = imaginary;
200 | 			}
201 | 		}
202 | 		else {
203 | 			printf("\nFile is void of any content!\n");
204 | 			error++;
205 | 		}
206 | 	}
207 | 	else {
208 | 		cout << "File not found -> " << filename << " <-" << endl;
209 | 		error++;
210 | 	}
211 | 	FILEIN.close();
212 | 	return(error);
213 | }
214 | 
215 | 
216 | int Load_filters(char *filename, int *nFilters, int *filter_length, float2 **data){
217 | 	float real, imaginary;
218 | 	int file_size, cislo, error, filter_size;
219 | 	error=0;
220 | 
221 | 	ifstream FILEIN;
222 | 	FILEIN.open(filename,ios::in);
223 | 	if (!FILEIN.fail()){
224 | 		error=0;
225 | 		file_size = File_size_row_signal(FILEIN);
226 | 		(*filter_length) = file_size/(*nFilters);
227 | 		filter_size = (*nFilters)*(*filter_length);
228 | 		printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size);
229 | 
230 | 		if(file_size>0){
231 | 			*data = (float2*)malloc( filter_size*sizeof(float2));
232 | 			memset( (*data), 0.0, filter_size*sizeof(float2));
233 | 			
234 | 			if(*data==NULL){
235 | 				printf("\nAllocation error!\n");
236 | 				error++;
237 | 			}
238 | 		
239 | 			FILEIN.clear();
240 | 			FILEIN.seekg(0,ios::beg);
241 | 
242 | 			for (cislo=0; cislo < filter_size; cislo++) {
243 | 				FILEIN >> real >> imaginary;
244 | 				(*data)[cislo].x = real;
245 | 				(*data)[cislo].y = imaginary;
246 | 			}
247 | 		}
248 | 		else {
249 | 			printf("\nFile is void of any content!\n");
250 | 			error++;
251 | 		}
252 | 	}
253 | 	else {
254 | 		cout << "File not found -> " << filename << " <-" << endl;
255 | 		error++;
256 | 	}
257 | 	FILEIN.close();
258 | 	return(error);
259 | }
260 | 
261 | 
262 | int GPU_convolution_OLS_customFFT(float2 *h_input_signal, float2 *h_output_plane, float2 *h_filters, int signal_length, int convolution_length, int filter_length, int past_filter_samples, int nFilters, int nRuns, int kernel_type, double *execution_time);
263 | 
264 | 
265 | int main(int argc, char* argv[]) {
266 | 	int signal_length;
267 | 	int filter_length;
268 | 	int past_filter_samples;
269 | 	int convolution_length;
270 | 	int nFilters;
271 | 	int nRuns;
272 | 	char input_type='0';
273 | 	char input_filter_file[255];
274 | 	char input_signal_file[255];
275 | 	char output_signal_file[255];
276 | 	
277 | 	char * pEnd;
278 | 	if (argc>2) {
279 | 		if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);}
280 | 		input_type=*argv[1];
281 | 	}
282 | 	if (input_type == 'f' && argc==8) {
283 | 		if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);}
284 | 		sprintf(input_signal_file,"%s",argv[2]);
285 | 		if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);}
286 | 		sprintf(input_filter_file,"%s",argv[3]);
287 | 		if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);}
288 | 		sprintf(output_signal_file,"%s",argv[4]);
289 | 		
290 | 		convolution_length = strtol(argv[5],&pEnd,10);
291 | 		nFilters = strtol(argv[6],&pEnd,10);
292 | 		past_filter_samples = strtol(argv[7],&pEnd,10);
293 | 		nRuns = 1;
294 | 	}
295 | 	else if (input_type == 'r' && argc==8) {
296 | 		signal_length  = strtol(argv[2],&pEnd,10);
297 | 		filter_length = strtol(argv[3],&pEnd,10);
298 | 		past_filter_samples = strtol(argv[4],&pEnd,10);
299 | 		convolution_length = strtol(argv[5],&pEnd,10);
300 | 		nFilters      = strtol(argv[6],&pEnd,10);
301 | 		
302 | 		nRuns = strtol(argv[7],&pEnd,10);
303 | 	}
304 | 	else {
305 | 		printf("Parameters error!\n");
306 | 		printf(" 1) Input type: 'r' or 'f' \n");
307 | 		printf("----------------------------------\n");
308 | 		printf("Parameters if input type is 'f' - file input provided by user\n");
309 | 		printf(" 2) Input signal file\n");
310 | 		printf(" 3) Input filter file\n");
311 | 		printf(" 4) Output signal file\n");
312 | 		printf(" 5) Convolution length in samples\n");
313 | 		printf(" 6) number of filters\n");
314 | 		printf(" 7) number of past samples in the filter.\n");
315 | 		printf("    for past filter (causal) it is (filter_length - 1)\n");
316 | 		printf("    for odd centered filter it is floor(filter_length/2)\n");
317 | 		printf("    for future filter it is 0\n");
318 | 		printf(" Example: CONV.exe f signal.dat filter.dat output.dat 2048 32 192\n");
319 | 		printf("----------------------------------\n");
320 | 		printf("Parameters if input type is 'r' - random input generated by the code\n");
321 | 		printf(" 2) Signal length in number of time samples\n");
322 | 		printf(" 3) Filter length in samples\n");
323 | 		printf(" 4) number of past samples in the filter.\n");
324 | 		printf("    for past filter (causal) it is (filter_length - 1)\n");
325 | 		printf("    for odd centered filter it is floor(filter_length/2)\n");
326 | 		printf("    for future filter it is 0\n");
327 | 		printf(" 5) Convolution length in samples\n");
328 | 		printf(" 6) Number of filters\n");
329 | 		printf(" 7) number of GPU kernel runs\n");
330 | 		printf(" Example: CONV.exe r 2097152 193 192 2048 32 10\n");
331 |         return 1;
332 | 	}
333 | 	
334 | 	if (DEBUG) {
335 | 		printf("Parameters:\n");
336 | 		printf("Input signal and filters are ");
337 | 		if (input_type == 'r') {
338 | 			printf("randomly generated.\n");
339 | 			printf("Signal length:      %d samples\n", signal_length);
340 | 			printf("Filter length:      %d samples\n", filter_length);
341 | 			printf("# of past samples:  %d samples\n", past_filter_samples);
342 | 			printf("Convolution length: %d samples\n", convolution_length);
343 | 			printf("Number of filters:  %d\n", nFilters);
344 | 			printf("nRuns:              %d\n", nRuns);
345 | 		}
346 | 		if (input_type == 'f') {
347 | 			printf("read from file.\n");
348 | 			printf("Input signal:       %s\n", input_signal_file);
349 | 			printf("Input filter:       %s\n", input_filter_file);
350 | 			printf("Output signal:      %s\n", output_signal_file);
351 | 			printf("Convolution length: %d samples\n", convolution_length);
352 | 			printf("nFilters:           %d\n", nFilters);
353 | 			printf("# of past samples:  %d samples\n", past_filter_samples);
354 | 			printf("nRuns:              %d\n", nRuns);
355 | 			printf("-----------------\n");
356 | 		}
357 | 	}
358 | 
359 | 	float2 *h_input;
360 | 	float2 *h_output;
361 | 	float2 *h_filters;		    // filters in time-domain
362 | 	float2 *h_filters_padded;	// filters in time-domain padded with zeroes
363 | 	
364 | 	if (input_type == 'f') {
365 | 		int error=0;
366 | 		error += Load_signal(input_signal_file, &signal_length, &h_input);
367 | 		error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters);
368 | 		if( error>0 ){exit(1);}
369 | 		else if (VERBOSE) printf("File loaded\n");
370 | 	}
371 | 
372 | 	//----------------> Results
373 | 	double execution_time = 0;
374 | 	Performance_results CONV_cuFFT;
375 | 	CONV_cuFFT.Assign(signal_length, filter_length, nFilters, nRuns, 0, convolution_length, nFilters, "CONV_kFFT.dat", "one");
376 | 	
377 | 	
378 | 	int corrected_filter_length;
379 | 	if( filter_length%2==0 ) corrected_filter_length = filter_length + 1;
380 | 	else corrected_filter_length = filter_length;
381 | 	int useful_part_size = convolution_length - corrected_filter_length + 1;
382 | 	int nConvolutions    = signal_length/useful_part_size;
383 | 	if( (signal_length%useful_part_size)>0 ) nConvolutions++;
384 | 	if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);}
385 | 
386 | 	
387 | 	if (input_type == 'r') {
388 | 		h_input          = (float2 *)malloc(signal_length*sizeof(float2));
389 | 		h_filters	     = (float2 *)malloc(filter_length*nFilters*sizeof(float2));
390 | 		srand(time(NULL));
391 | 		Generate_random_signal(h_input, signal_length);
392 | 		Generate_random_filter(h_filters, filter_length, nFilters);
393 | 		if (VERBOSE) printf("Signal and filters generated\n");
394 | 	}
395 | 	
396 | 	size_t filter_size_padded = nFilters*convolution_length;
397 | 	h_filters_padded = (float2*)malloc(filter_size_padded*sizeof(float2));
398 | 	Pad_templates(h_filters, h_filters_padded, filter_length, corrected_filter_length, convolution_length, nFilters);
399 | 	
400 | 	size_t output_size = nFilters*useful_part_size*nConvolutions;
401 | 	h_output = (float2*)malloc(output_size*sizeof(float2));
402 | 	
403 | 	if (VERBOSE) printf("Convolution - kFFT\n");
404 | 
405 | 	//----------------> GPU kernel
406 | 	int kernel_type=1; //one filter per iteration
407 | 	GPU_convolution_OLS_customFFT(h_input, h_output, h_filters_padded, signal_length, convolution_length, corrected_filter_length, past_filter_samples, nFilters, nRuns, kernel_type, &execution_time);
408 | 	CONV_cuFFT.GPU_time = execution_time;
409 | 	if(VERBOSE) printf("     Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time);
410 | 	if(VERBOSE) {cout << "     All parameters: "; CONV_cuFFT.Print();}
411 | 	if(WRITE) CONV_cuFFT.Save();
412 | 	//----------------> GPU kernel
413 | 
414 | 	if(CHECK){
415 | 		double total_error, mean_error;
416 | 		printf("Checking results...\n");
417 | 		Full_CONV_check(h_output, h_input, h_filters, signal_length, filter_length, past_filter_samples, useful_part_size, convolution_length, nConvolutions, nFilters, &total_error, &mean_error);
418 | 		//printf("Total error: %e; Mean error: %e\n", total_error, mean_error);
419 | 	}
420 | 	
421 | 	if (input_type == 'f') {
422 | 		Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file);
423 | 	}
424 | 	
425 | 	free(h_input);
426 | 	free(h_output);
427 | 	free(h_filters_padded);
428 | 	free(h_filters);
429 | 
430 | 	cudaDeviceReset();
431 | 
432 | 	if (VERBOSE) printf("Finished!\n");
433 | 
434 | 	return (0);
435 | }
436 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/Makefile:
--------------------------------------------------------------------------------
 1 | ###############################################################
 2 | # CUDA_HOME are supposed to be on default position
 3 | # and set it in your PATH .bashrc
 4 | ###############################################################
 5 | INC := -I${CUDA_HOME}/include
 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft -lcuda
 7 | 
 8 | GCC = g++
 9 | NVCC = ${CUDA_HOME}/bin/nvcc
10 | 
11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo
12 | 
13 | GCC_OPTS =-O3 -Wall -Wextra $(INC)
14 | 
15 | ANALYZE = CONV.exe
16 | 
17 | 
18 | ifdef reglim
19 | NVCCFLAGS += --maxrregcount=$(reglim)
20 | endif
21 | 
22 | all: clean onefilter
23 | 
24 | onefilter: CONV_SM_OLS_C2C.o  CONV-32bit_customFFT.o Makefile
25 | 	$(NVCC) -o CONV.exe CONV_SM_OLS_C2C.o CONV-32bit_customFFT.o $(LIB) $(NVCCFLAGS)
26 | 
27 | CONV-32bit_customFFT.o: timer.h utils_cuda.h
28 | 	$(NVCC) -c CONV-32bit_customFFT.cu $(NVCCFLAGS)
29 | 
30 | CONV_SM_OLS_C2C.o: CONV_SM_OLS_C2C.cpp
31 | 	$(GCC) -c CONV_SM_OLS_C2C.cpp $(GCC_OPTS)
32 | 	
33 | clean:	
34 | 	rm -f *.o *.~ CONV_*.exe
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/When_cuFFT_wins.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | rm CONV_1f.exe;
 4 | rm CONV_2f.exe;
 5 | rm *.o;
 6 | make reglim=0 > /dev/null 2>&1
 7 | for convlength in 256 512 1024 2048 4096;
 8 | do 
 9 | 	for tempsize in {64..4096..32}
10 | 	do
11 | 		./CONV.exe r 2097152 $tempsize $convlength 32 20
12 | 	done
13 | done
14 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/benchmark_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs both benchmark scripts one after the other. Each run's results are
# collected in CONV_kFFT.dat and then renamed, so the second benchmark
# starts with a fresh file.

results=CONV_kFFT.dat

# Start from a clean results file.
rm $results

# Performance benchmark
./benchmark_performance.sh
mv $results OLS_SM_C2C_perf.dat

# Filter-length sweep
./When_cuFFT_wins.sh
mv $results OLS_SM_C2C_whencuFFTwins.dat


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/benchmark_performance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | rm CONV.exe;
 5 | rm *.o;
 6 | make reglim=$reg > /dev/null 2>&1
 7 | for convlength in 256 512 1024 2048 4096
 8 | do
 9 | 	for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073;
10 | 	do
11 | 		for templates in 2 4 8 16 32 64 96;
12 | 		do
13 | 			./CONV.exe r 262144 $tempsize $convlength $templates 20
14 | 			./CONV.exe r 524288 $tempsize $convlength $templates 20
15 | 			./CONV.exe r 1048576 $tempsize $convlength $templates 20
16 | 			./CONV.exe r 2097152 $tempsize $convlength $templates 20
17 | 			./CONV.exe r 4194304 $tempsize $convlength $templates 20
18 | 			./CONV.exe r 8388608 $tempsize $convlength $templates 20
19 | 		done
20 | 	done
21 | done
22 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/conv_check.h:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <math.h>
  4 | #include <time.h>
  5 | #include <stdlib.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime.h>
  8 | #include <cuda_runtime_api.h>
  9 | 
 10 | double max_error = 1.0e-4;
 11 | 
 12 | void CPU_time_domain(float2 *h_input, float2 *h_CPU_output_timedomain, float2 *h_filters, int signal_length, int filter_length, int past_filter_samples, int nFilters){
 13 | 	for(int f=0; f<nFilters; f++){
 14 | 		printf(".");  fflush(stdout);
 15 | 		for(int s=0; s<signal_length; s++){
 16 | 			float2 ac;
 17 | 			ac.x = 0; ac.y = 0;
 18 | 			for(int i=0; i<filter_length; i++){
 19 | 				int filter_pos = filter_length - 1 - i;
 20 | 				float2 fv, sv;
 21 | 				fv = h_filters[f*filter_length + filter_pos];
 22 | 				int signal_pos = (s + i - past_filter_samples);
 23 | 				if(signal_pos>=0 && signal_pos<signal_length) sv = h_input[signal_pos];
 24 | 				else {sv.x = 0; sv.y = 0;}
 25 | 				ac.x = ac.x + sv.x*fv.x - sv.y*fv.y;
 26 | 				ac.y = ac.y + sv.x*fv.y + sv.y*fv.x;
 27 | 			}
 28 | 			h_CPU_output_timedomain[f*(signal_length + filter_length - 1) + s] = ac;
 29 | 		}
 30 | 	}
 31 | 	printf("\n");
 32 | }
 33 | 
 34 | 
 35 | float get_error(float2 A_f2, float2 B_f2){
 36 | 	float error, div_error=10000, per_error=10000, order=0;
 37 | 	int power;
 38 | 	float A = max(A_f2.x, A_f2.y);
 39 | 	float B = max(B_f2.x, B_f2.y);
 40 | 	if(A<0) A = -A;
 41 | 	if(B<0) B = -B;
 42 | 	
 43 | 	if (A>B) {
 44 | 		div_error = A-B;
 45 | 		if(B>10){
 46 | 			power = (int) log10(B);
 47 | 			order = pow(10,power);
 48 | 			div_error = div_error/order;
 49 | 		}
 50 | 	}
 51 | 	else {
 52 | 		div_error = B-A;
 53 | 		if(A>10){
 54 | 			power = (int) log10(A);
 55 | 			order = pow(10,power);
 56 | 			div_error = div_error/order;
 57 | 		}
 58 | 	}
 59 | 	
 60 | 	if(div_error<per_error) error = div_error;
 61 | 	else error = per_error;
 62 | 	return(error);
 63 | }
 64 | 
 65 | int Compare_data(float2 *CPU_result, float2 *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){
 66 | 	double total_error_l = 0, mean_error_l = 0;
 67 | 	size_t nErrors = 0;
 68 | 	int cislo = 0;
 69 | 	float error;
 70 | 	
 71 | 	for(int y=0; y<dim_y; y++){
 72 | 		for(int x=0; x<nSamples; x++){
 73 | 			int CPU_pos = y*CPU_dim_x + x + CPU_offset;
 74 | 			int GPU_pos = y*GPU_dim_x + x + GPU_offset;
 75 | 			float2 CPU, GPU;
 76 | 			CPU.x = CPU_result[CPU_pos].x/CPU_scale; CPU.y = CPU_result[CPU_pos].y/CPU_scale;
 77 | 			GPU.x = GPU_result[GPU_pos].x/GPU_scale; GPU.y = GPU_result[GPU_pos].y/GPU_scale;
 78 | 			
 79 | 			
 80 | 			error = get_error(CPU, GPU);
 81 | 			total_error_l = total_error_l + error;
 82 | 			if( error > max_error ){
 83 | 				nErrors++;
 84 | 				if(cislo<40){
 85 | 					printf("Error [%f] CPU [%f;%f] GPU [%f;%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU.x, CPU.y, GPU.x, GPU.y, x, y, (int) (x/useful_part_size), x%useful_part_size);
 86 | 					cislo++;
 87 | 				}
 88 | 			}
 89 | 		}
 90 | 	}
 91 | 	mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y));
 92 | 	(*total_error) = total_error_l;
 93 | 	(*mean_error) = mean_error_l;
 94 | 	return(nErrors);
 95 | }
 96 | 
 97 | 
 98 | void Full_CONV_check(float2 *GPU_result, float2 *h_input, float2 *h_filters, int signal_length, int filter_length, int past_filter_samples, int useful_part_size, int conv_length, int nConvolutions, int nFilters, double *cumulative_error, double *mean_error){
 99 | 	size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters;
100 | 	float2 *h_CPU_output_timedomain;
101 | 	h_CPU_output_timedomain = (float2 *)malloc(output_size_timedomain*sizeof(float2));
102 | 	memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float2));
103 | 	
104 | 	printf("\n--> Time-domain convolution:");
105 | 	CPU_time_domain(h_input, h_CPU_output_timedomain, h_filters, signal_length, filter_length, past_filter_samples, nFilters);
106 | 		
107 | 	float GPU_scale, CPU_scale;
108 | 	int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples;
109 | 	
110 | 	printf("\n--> Comparison to CPU time-domain:\n");
111 | 	GPU_scale = conv_length;
112 | 	GPU_scale = 1.0;
113 | 	CPU_scale = 1.0;
114 | 	GPU_offset = 0;
115 | 	CPU_offset = 0;
116 | 	GPU_dim_x = nConvolutions*useful_part_size;
117 | 	CPU_dim_x = (signal_length + filter_length - 1);
118 | 	nSamples = signal_length;
119 | 	Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
120 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
121 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
122 | 	else printf("FAILED\n");
123 | 	
124 | 	free(h_CPU_output_timedomain);
125 | }
126 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/debug.h:
--------------------------------------------------------------------------------
// Compile-time switches. In this directory's main() they gate the following:
// VERBOSE - progress messages, execution time and the parameter summary
#define VERBOSE true
// DEBUG - dump of the parsed command-line parameters
#define DEBUG false
// WRITE - appending the performance record to the results file
#define WRITE true
// CHECK - comparison of the GPU output against the CPU reference
#define CHECK false

// NOTE(review): presumably the index of the CUDA device to use; its use is
// not visible in this part of the sources - confirm in the .cu file.
#define DEVICEID 0
7 | 
8 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/params.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KAdamek/GPU_Overlap-and-save_convolution/f2ee0e3323a3e3f6f1bb96e864ad7fce5c9234ec/GPU_OLS_C2C_sharedmemory/params.h


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/results.h:
--------------------------------------------------------------------------------
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <vector>
 5 | 
 6 | using namespace std;
 7 | 
 8 | class Performance_results{
 9 | public:
10 | 	double GPU_time;
11 | 	int nTimesamples;
12 | 	int template_length;
13 | 	int nTemplates;
14 | 	int nRuns;
15 | 	int reglim;
16 | 	int OaS_conv_size;
17 | 	int templates_per_block;
18 | 	char filename[200];
19 | 	char kernel[10];
20 | 	
21 | 	Performance_results() {
22 | 		GPU_time=0;
23 | 	}
24 | 	
25 | 	void Save(){
26 | 		ofstream FILEOUT;
27 | 		FILEOUT.open (filename, std::ofstream::out | std::ofstream::app);
28 | 		FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
29 | 		FILEOUT.close();
30 | 	}
31 | 	
32 | 	void Print(){
33 | 		cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
34 | 	}
35 | 	
36 | 	void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){
37 | 		nTimesamples        = t_nTimesamples;
38 | 		template_length     = t_template_length;
39 | 		nTemplates          = t_nTemplates;
40 | 		nRuns               = t_nRuns;
41 | 		reglim              = t_reglim;
42 | 		OaS_conv_size       = t_OaS_conv_size;
43 | 		templates_per_block = t_templates_per_block;
44 | 		sprintf(filename,"%s", t_filename);
45 | 		sprintf(kernel,"%s",t_kernel);
46 | 	}
47 | 	
48 | };
49 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory/utils_cuda.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/Makefile:
--------------------------------------------------------------------------------
 1 | ###############################################################
 2 | # CUDA_HOME are supposed to be on default position
 3 | # and set it in your PATH .bashrc
 4 | ###############################################################
 5 | INC := -I${CUDA_HOME}/include
 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft -lcuda
 7 | 
 8 | GCC = g++
 9 | NVCC = ${CUDA_HOME}/bin/nvcc
10 | 
11 | NVCCFLAGS = -O3 -arch=sm_60 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo
12 | 
13 | GCC_OPTS =-O3 -Wall -Wextra $(INC)
14 | 
15 | ANALYZE = CONV.exe
16 | 
17 | 
18 | ifdef reglim
19 | NVCCFLAGS += --maxrregcount=$(reglim)
20 | endif
21 | 
22 | all: clean onefilter
23 | 
24 | onefilter: CONV_SM_OLS_C2C.o  CONV-32bit_customFFT.o Makefile
25 | 	$(NVCC) -o CONV.exe CONV_SM_OLS_C2C.o CONV-32bit_customFFT.o $(LIB) $(NVCCFLAGS)
26 | 
27 | CONV-32bit_customFFT.o: timer.h utils_cuda.h
28 | 	$(NVCC) -c CONV-32bit_customFFT.cu $(NVCCFLAGS)
29 | 
30 | CONV_SM_OLS_C2C.o: CONV_SM_OLS_C2C.cpp
31 | 	$(GCC) -c CONV_SM_OLS_C2C.cpp $(GCC_OPTS)
32 | 	
33 | clean:	
34 | 	rm -f *.o *.~ CONV_*.exe
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/When_cuFFT_wins.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | rm CONV.exe;
 4 | rm *.o;
 5 | make reglim=0 > /dev/null 2>&1
 6 | for convlength in 256 512 1024 2048 4096;
 7 | do 
 8 | 	for tempsize in {64..4096..32}
 9 | 	do
10 | 		./CONV.exe r 2097152 $tempsize $convlength 32 20
11 | 	done
12 | done
13 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/benchmark_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs both benchmark scripts one after the other. After each run
# CONV_kFFT.dat is renamed, so the second benchmark does not find the
# results of the first one under that name.

results=CONV_kFFT.dat

# Start from a clean results file.
rm $results

# Performance benchmark
./benchmark_performance.sh
mv $results OLS_SM_C2C_pp_perf.dat

# Template-size sweep
./When_cuFFT_wins.sh
mv $results OLS_SM_C2C_pp_whencuFFTwins.dat


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/benchmark_performance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | rm CONV.exe;
 5 | rm *.o;
 6 | make reglim=$reg > /dev/null 2>&1
 7 | for convlength in 256 512 1024 2048 4096
 8 | do
 9 | 	for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073;
10 | 	do
11 | 		for templates in 2 4 8 16 32 64 96;
12 | 		do
13 | 			./CONV.exe r 262144 $tempsize $convlength $templates 20
14 | 			./CONV.exe r 524288 $tempsize $convlength $templates 20
15 | 			./CONV.exe r 1048576 $tempsize $convlength $templates 20
16 | 			./CONV.exe r 2097152 $tempsize $convlength $templates 20
17 | 			./CONV.exe r 4194304 $tempsize $convlength $templates 20
18 | 			./CONV.exe r 8388608 $tempsize $convlength $templates 20
19 | 		done
20 | 	done
21 | done
22 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/conv_check.h:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <math.h>
  4 | #include <time.h>
  5 | #include <stdlib.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime.h>
  8 | #include <cuda_runtime_api.h>
  9 | 
 10 | double max_error = 1.0e-4;
 11 | 
 12 | void CPU_time_domain(float2 *h_input, float2 *h_CPU_output_timedomain, float2 *h_filters, int signal_length, int filter_length, int past_filter_samples, int nFilters){
 13 | 	for(int f=0; f<nFilters; f++){
 14 | 		printf(".");  fflush(stdout);
 15 | 		for(int s=0; s<signal_length; s++){
 16 | 			float2 ac;
 17 | 			ac.x = 0; ac.y = 0;
 18 | 			for(int i=0; i<filter_length; i++){
 19 | 				int filter_pos = filter_length - 1 - i;
 20 | 				float2 fv, sv;
 21 | 				fv = h_filters[f*filter_length + filter_pos];
 22 | 				int signal_pos = (s + i - past_filter_samples);
 23 | 				if(signal_pos>=0 && signal_pos<signal_length) sv = h_input[signal_pos];
 24 | 				else {sv.x = 0; sv.y = 0;}
 25 | 				ac.x = ac.x + sv.x*fv.x - sv.y*fv.y;
 26 | 				ac.y = ac.y + sv.x*fv.y + sv.y*fv.x;
 27 | 			}
 28 | 			h_CPU_output_timedomain[f*(signal_length + filter_length - 1) + s] = ac;
 29 | 		}
 30 | 	}
 31 | 	printf("\n");
 32 | }
 33 | 
 34 | void CPU_postprocess(float2 *h_CPU_postprocessed, float2 *h_CPU_output_reduced, int nTimesamples, int nFilters, float h){
 35 | 	float2 left, right, result;
 36 | 	
 37 | 	for(int f=0; f<nFilters; f++){
 38 | 		for(int s=0; s<nTimesamples-1; s++){
 39 | 			int pos = f*nTimesamples + s;
 40 | 			if( s==0 ) {
 41 | 				left = h_CPU_output_reduced[pos];
 42 | 			}
 43 | 			else {
 44 | 				left = h_CPU_output_reduced[pos-1];
 45 | 			}
 46 | 			
 47 | 			if( s>=(nTimesamples-1) ) {
 48 | 				right = h_CPU_output_reduced[f*nTimesamples + nTimesamples - 1];
 49 | 			}
 50 | 			else {
 51 | 				right = h_CPU_output_reduced[pos+1];
 52 | 			}
 53 | 			
 54 | 			result.x = (left.x - right.x)/(2.0*h);
 55 | 			result.y = (left.y - right.y)/(2.0*h);
 56 | 			h_CPU_postprocessed[pos] = result;
 57 | 		}
 58 | 	}
 59 | }
 60 | 
 61 | 
 62 | float get_error(float2 A_f2, float2 B_f2){
 63 | 	float error, div_error=10000, per_error=10000, order=0;
 64 | 	int power;
 65 | 	float A = max(A_f2.x, A_f2.y);
 66 | 	float B = max(B_f2.x, B_f2.y);
 67 | 	if(A<0) A = -A;
 68 | 	if(B<0) B = -B;
 69 | 	
 70 | 	if (A>B) {
 71 | 		div_error = A-B;
 72 | 		if(B>10){
 73 | 			power = (int) log10(B);
 74 | 			order = pow(10,power);
 75 | 			div_error = div_error/order;
 76 | 		}
 77 | 	}
 78 | 	else {
 79 | 		div_error = B-A;
 80 | 		if(A>10){
 81 | 			power = (int) log10(A);
 82 | 			order = pow(10,power);
 83 | 			div_error = div_error/order;
 84 | 		}
 85 | 	}
 86 | 	
 87 | 	if(div_error<per_error) error = div_error;
 88 | 	else error = per_error;
 89 | 	return(error);
 90 | }
 91 | 
 92 | int Compare_data(float2 *CPU_result, float2 *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){
 93 | 	double total_error_l = 0, mean_error_l = 0;
 94 | 	size_t nErrors = 0;
 95 | 	int cislo = 0;
 96 | 	float error;
 97 | 	
 98 | 	for(int y=0; y<dim_y; y++){
 99 | 		for(int x=0; x<nSamples; x++){
100 | 			int CPU_pos = y*CPU_dim_x + x + CPU_offset;
101 | 			int GPU_pos = y*GPU_dim_x + x + GPU_offset;
102 | 			if((x + CPU_offset)<CPU_dim_x && (x + GPU_offset)<GPU_dim_x){
103 | 				float2 CPU, GPU;
104 | 				CPU.x = CPU_result[CPU_pos].x/CPU_scale; CPU.y = CPU_result[CPU_pos].y/CPU_scale;
105 | 				GPU.x = GPU_result[GPU_pos].x/GPU_scale; GPU.y = GPU_result[GPU_pos].y/GPU_scale;
106 | 				
107 | 				
108 | 				error = get_error(CPU, GPU);
109 | 				total_error_l = total_error_l + error;
110 | 				if( error > max_error ){
111 | 					nErrors++;
112 | 					if(cislo<40){
113 | 						printf("Error [%f] CPU [%f;%f] GPU [%f;%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU.x, CPU.y, GPU.x, GPU.y, x, y, (int) ((x + GPU_offset)/useful_part_size), (x + GPU_offset)%useful_part_size);
114 | 						cislo++;
115 | 					}
116 | 				}
117 | 			}
118 | 		}
119 | 	}
120 | 	mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y));
121 | 	(*total_error) = total_error_l;
122 | 	(*mean_error) = mean_error_l;
123 | 	return(nErrors);
124 | }
125 | 
126 | 
127 | void Full_CONV_check(float2 *GPU_result, float2 *h_input, float2 *h_filters, int signal_length, int filter_length, int past_filter_samples, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, float h, double *cumulative_error, double *mean_error){
128 | 	size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters;
129 | 	float2 *h_CPU_output_timedomain;
130 | 	float2 *h_CPU_postprocessed;
131 | 	h_CPU_output_timedomain = (float2 *)malloc(output_size_timedomain*sizeof(float2));
132 | 	h_CPU_postprocessed     = (float2 *)malloc(output_size_timedomain*sizeof(float2));
133 | 	memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float2));
134 | 	memset(h_CPU_postprocessed, 0.0, output_size_timedomain*sizeof(float2));
135 | 	
136 | 	printf("\n--> Time-domain convolution:");
137 | 	CPU_time_domain(h_input, h_CPU_output_timedomain, h_filters, signal_length, filter_length, past_filter_samples, nFilters);
138 | 	
139 | 	printf("\n--> Post-processing:\n");
140 | 	CPU_postprocess(h_CPU_postprocessed, h_CPU_output_timedomain, (signal_length + filter_length - 1), nFilters, h);
141 | 		
142 | 	float GPU_scale, CPU_scale;
143 | 	int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples;
144 | 
145 | 	#ifdef POST_PROCESS
146 | 	
147 | 	printf("\n--> Comparison to CPU time-domain with post-processing:\n");
148 | 	GPU_scale = 1.0;
149 | 	CPU_scale = 1.0;
150 | 	GPU_offset = 0;
151 | 	CPU_offset = 0;
152 | 	GPU_dim_x = nConvolutions*useful_part_size;
153 | 	CPU_dim_x = (signal_length + filter_length - 1);
154 | 	nSamples = signal_length - offset;	
155 | 	Compare_data(h_CPU_postprocessed, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
156 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
157 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
158 | 	else printf("FAILED\n");	
159 | 	#else
160 | 	
161 | 	printf("\n--> Comparison to CPU time-domain:\n");
162 | 	GPU_scale = conv_length;
163 | 	GPU_scale = 1.0;
164 | 	CPU_scale = 1.0;
165 | 	GPU_offset = 0;
166 | 	CPU_offset = 0;
167 | 	GPU_dim_x = nConvolutions*useful_part_size;
168 | 	CPU_dim_x = (signal_length + filter_length - 1);
169 | 	nSamples = signal_length-offset;
170 | 	Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
171 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
172 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
173 | 	else printf("FAILED\n");
174 | 	
175 | 	#endif
176 | 	
177 | 	free(h_CPU_output_timedomain);
178 | 	free(h_CPU_postprocessed);
179 | }
180 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/debug.h:
--------------------------------------------------------------------------------
// Compile-time switches for this build variant.
// NOTE(review): the sources that consume these macros are not part of this
// header; the meanings below are assumed from the macro names - confirm
// against the .cpp/.cu files of this directory.
#define VERBOSE true   // presumably enables progress messages
#define DEBUG false    // presumably enables extra diagnostic output
#define WRITE true     // presumably enables saving the performance record
#define CHECK true     // presumably enables the CPU reference check

#define DEVICEID 0     // presumably the index of the CUDA device to use - TODO confirm
// Uncomment to compile the post-processing code paths guarded by #ifdef POST_PROCESS.
//#define POST_PROCESS
8 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/params.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KAdamek/GPU_Overlap-and-save_convolution/f2ee0e3323a3e3f6f1bb96e864ad7fce5c9234ec/GPU_OLS_C2C_sharedmemory_pp/params.h


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/results.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <iomanip>
 4 | #include <vector>
 5 | 
 6 | using namespace std;
 7 | 
 8 | class Performance_results{
 9 | public:
10 | 	double GPU_time;
11 | 	int nTimesamples;
12 | 	int template_length;
13 | 	int nTemplates;
14 | 	int nRuns;
15 | 	int reglim;
16 | 	int OaS_conv_size;
17 | 	int templates_per_block;
18 | 	char filename[200];
19 | 	char kernel[10];
20 | 	
21 | 	Performance_results() {
22 | 		GPU_time=0;
23 | 	}
24 | 	
25 | 	void Save(){
26 | 		ofstream FILEOUT;
27 | 		FILEOUT.open (filename, std::ofstream::out | std::ofstream::app);
28 | 		FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
29 | 		FILEOUT.close();
30 | 	}
31 | 	
32 | 	void Print(){
33 | 		cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
34 | 	}
35 | 	
36 | 	void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){
37 | 		nTimesamples        = t_nTimesamples;
38 | 		template_length     = t_template_length;
39 | 		nTemplates          = t_nTemplates;
40 | 		nRuns               = t_nRuns;
41 | 		reglim              = t_reglim;
42 | 		OaS_conv_size       = t_OaS_conv_size;
43 | 		templates_per_block = t_templates_per_block;
44 | 		sprintf(filename,"%s", t_filename);
45 | 		sprintf(kernel,"%s",t_kernel);
46 | 	}
47 | 	
48 | };
49 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/GPU_OLS_C2C_sharedmemory_pp/utils_cuda.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/CONV_R2R.cpp:
--------------------------------------------------------------------------------
  1 | //********************************************************************************************
  2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 
  3 | //* Copyright (C) 2019  Adámek Karel
  4 | //* 
  5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 
  6 | //********************************************************************************************
  7 | 
  8 | 
  9 | #include "debug.h"
 10 | #include "params.h"
 11 | #include "results.h"
 12 | 
 13 | #include <stdio.h>
 14 | #include <string.h>
 15 | #include <math.h>
 16 | #include <time.h>
 17 | #include <stdlib.h>
 18 | #include <cuda.h>
 19 | #include <cuda_runtime.h>
 20 | #include <cuda_runtime_api.h>
 21 | 
 22 | #include "conv_check_R2R.h"
 23 | 
 24 | 
 25 | void Generate_signal(float *h_input, int signal_length){
 26 | 	for(int f=0; f<signal_length; f++){
 27 | 		h_input[f] = rand() / (float)RAND_MAX;
 28 | 	}
 29 | }
 30 | 
 31 | void Generate_templates(float *h_filters, int nSamples, int nFilters){
 32 | 	for(int t=0; t<nFilters; t++){
 33 | 		for(int f=0; f<nSamples; f++){
 34 | 			h_filters[t*nSamples + f] = rand() / (float)RAND_MAX;
 35 | 		}
 36 | 	}
 37 | }
 38 | 
 39 | void Pad_templates(float *h_filters_time, float *h_filters, int template_size, int convolution_size, int nFilters){
 40 | 	for(int f=0; f<nFilters*convolution_size; f++){
 41 | 		h_filters[f] = 0;
 42 | 	}
 43 | 	
 44 | 	for(int t=0; t<nFilters; t++){
 45 | 		for(int f=0; f<template_size; f++){
 46 | 			// padding for centered filter
 47 | 			if(f>=template_size/2) {
 48 | 				h_filters[t*convolution_size + f - template_size/2] = h_filters_time[t*template_size + f];
 49 | 			}
 50 | 			else if(f<template_size/2) {
 51 | 				h_filters[t*convolution_size + f + convolution_size - template_size/2] = h_filters_time[t*template_size + f];
 52 | 			}
 53 | 		}
 54 | 	}
 55 | }
 56 | 
 57 | 
 58 | int Write_output(float *h_output, int signal_length, int nFilters, char *output_signal_file){
 59 | 	int error=0;
 60 | 	ofstream FILEOUT;
 61 | 	FILEOUT.open(output_signal_file);
 62 | 	if (!FILEOUT.fail()){
 63 | 		if (VERBOSE) printf("Writing output\n");
 64 | 		for(int f=0; f<nFilters; f++){
 65 | 			if (VERBOSE) printf("[");
 66 | 			for(int Ts=0;Ts<signal_length;Ts++){
 67 | 				if(Ts%100000==0) {
 68 | 					if (VERBOSE) {
 69 | 						printf(".");
 70 | 						fflush(stdout);
 71 | 					}
 72 | 				}
 73 | 				FILEOUT << f << " " << Ts << " " << h_output[f*signal_length+Ts] << endl;
 74 | 			}
 75 | 			FILEOUT << endl;
 76 | 			if (VERBOSE) printf("] filter=%d\n",f);
 77 | 		}
 78 | 	}
 79 | 	else {
 80 | 		cout << "Write to a file failed!" << endl;
 81 | 		error++;
 82 | 	}
 83 | 	FILEOUT.close();
 84 | 	return(error);
 85 | }
 86 | 
 87 | 
 88 | long int File_size_row_signal(ifstream &FILEIN){
 89 | 	std::size_t count=0;
 90 | 	FILEIN.seekg(0,ios::beg);
 91 | 	for(std::string line; std::getline(FILEIN, line); ++count){}
 92 | 	return((long int)count);
 93 | }
 94 | 
 95 | 
 96 | int Load_signal(char *filename, int *nSamples, float **data){
 97 | 	float real, imaginary;
 98 | 	int file_size, cislo, error;
 99 | 	error=0;
100 | 
101 | 	ifstream FILEIN;
102 | 	FILEIN.open(filename,ios::in);
103 | 	if (!FILEIN.fail()){
104 | 		error=0;
105 | 		file_size=File_size_row_signal(FILEIN);
106 | 		(*nSamples) = file_size;
107 | 		printf("nSamples:%d;\n", (*nSamples) );
108 | 
109 | 		if(file_size>0){
110 | 			*data = (float*)malloc(file_size*sizeof(float));
111 | 			memset( (*data), 0.0, file_size*sizeof(float));
112 | 			if(*data==NULL){
113 | 				printf("\nAllocation error!\n");
114 | 				error++;
115 | 			}
116 | 		
117 | 			FILEIN.clear();
118 | 			FILEIN.seekg(0,ios::beg);
119 | 			
120 | 			for (cislo = 0; cislo < file_size; cislo++) {
121 | 				FILEIN >> real >> imaginary;
122 | 				(*data)[cislo] = sqrt(real*real + imaginary*imaginary);
123 | 			}
124 | 		}
125 | 		else {
126 | 			printf("\nFile is void of any content!\n");
127 | 			error++;
128 | 		}
129 | 	}
130 | 	else {
131 | 		cout << "File not found -> " << filename << " <-" << endl;
132 | 		error++;
133 | 	}
134 | 	FILEIN.close();
135 | 	return(error);
136 | }
137 | 
138 | 
139 | int Load_filters(char *filename, int *nFilters, int *filter_length, float **data){
140 | 	float real, imaginary;
141 | 	int file_size, cislo, error, filter_size;
142 | 	error=0;
143 | 
144 | 	ifstream FILEIN;
145 | 	FILEIN.open(filename,ios::in);
146 | 	if (!FILEIN.fail()){
147 | 		error=0;
148 | 		file_size = File_size_row_signal(FILEIN);
149 | 		(*filter_length) = file_size/(*nFilters);
150 | 		filter_size = (*nFilters)*(*filter_length);
151 | 		printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size);
152 | 
153 | 		if(file_size>0){
154 | 			*data = (float*)malloc( filter_size*sizeof(float));
155 | 			memset( (*data), 0.0, filter_size*sizeof(float));
156 | 			
157 | 			if(*data==NULL){
158 | 				printf("\nAllocation error!\n");
159 | 				error++;
160 | 			}
161 | 		
162 | 			FILEIN.clear();
163 | 			FILEIN.seekg(0,ios::beg);
164 | 
165 | 			for (cislo=0; cislo < filter_size; cislo++) {
166 | 				FILEIN >> real >> imaginary;
167 | 				(*data)[cislo] = real;
168 | 			}
169 | 		}
170 | 		else {
171 | 			printf("\nFile is void of any content!\n");
172 | 			error++;
173 | 		}
174 | 	}
175 | 	else {
176 | 		cout << "File not found -> " << filename << " <-" << endl;
177 | 		error++;
178 | 	}
179 | 	FILEIN.close();
180 | 	return(error);
181 | }
182 | 
183 | 
184 | int GPU_CONV(float *h_input, float *h_output, float *h_filters_timedom, int signal_length, int filter_length, int nFilters, int nRuns, double *execution_time);
185 | 
186 | 
187 | int main(int argc, char* argv[]) {
188 | 	int nTimesamples;		// input signal length
189 | 	int filter_length;		// filter length
190 | 	int nFilters;			// number of filters
191 | 	int nRuns;
192 | 	char input_type='0';
193 | 	char input_filter_file[255];
194 | 	char input_signal_file[255];
195 | 	char output_signal_file[255];
196 | 	
197 | 	char * pEnd;
198 | 	if (argc>2) {
199 | 		if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);}
200 | 		input_type=*argv[1];
201 | 	}
202 | 	if (input_type == 'f' && argc==6) {
203 | 		if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);}
204 | 		sprintf(input_signal_file,"%s",argv[2]);
205 | 		if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);}
206 | 		sprintf(input_filter_file,"%s",argv[3]);
207 | 		if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);}
208 | 		sprintf(output_signal_file,"%s",argv[4]);
209 | 		nFilters = strtol(argv[5],&pEnd,10);
210 | 		nRuns = 1;
211 | 	}
212 | 	else if (input_type == 'r' && argc==6) {	
213 | 		nTimesamples  = strtol(argv[2],&pEnd,10);
214 | 		filter_length = strtol(argv[3],&pEnd,10);
215 | 		nFilters      = strtol(argv[4],&pEnd,10);
216 | 		nRuns         = strtol(argv[5],&pEnd,10);
217 | 	}
218 | 	else {
219 | 		printf("Parameters error!\n");
220 | 		printf(" 1) Input type: 'r' or 'f' \n");
221 | 		printf("----------------------------------\n");
222 | 		printf("'f' - file input provided by user\n");
223 | 		printf(" 2) Input signal file\n");
224 | 		printf(" 3) Input filter file\n");
225 | 		printf(" 4) Output signal file\n");
226 | 		printf(" 5) number of filters\n");
227 | 		printf(" Example: CONV.exe f signal.dat filter.dat output.dat 32\n");
228 | 		printf("----------------------------------\n");
229 | 		printf(" 'r' - random input generated by the code\n");
230 | 		printf(" 2) Signal length in number of time samples\n");
231 | 		printf(" 3) Filter length in samples\n");
232 | 		printf(" 4) Number of templates\n");
233 | 		printf(" 5) number of GPU kernel runs\n");
234 | 		printf(" Example: CONV.exe r 2097152 193 32 10\n");
235 |         return 1;
236 | 	}
237 | 	
238 | 	if (DEBUG) {
239 | 		printf("Parameters:\n");
240 | 		printf("Input signal and templates are ");
241 | 		if (input_type == 'r') {
242 | 			printf("randomly generated.\n");
243 | 			printf("Signal length:     %d samples\n", nTimesamples);
244 | 			printf("Filter length:     %d samples\n", filter_length);
245 | 			printf("Number of filters: %d\n", nFilters);
246 | 			printf("nRuns:             %d\n", nRuns);
247 | 		}
248 | 		if (input_type == 'f') {
249 | 			printf("read from file.\n");
250 | 			printf("Input signal:  %s\n", input_signal_file);
251 | 			printf("Input filter:  %s\n", input_filter_file);
252 | 			printf("Output signal: %s\n", output_signal_file);
253 | 			printf("nFilters:      %d\n", nFilters);
254 | 			printf("nRuns:         %d\n", nRuns);
255 | 			printf("-----------------\n");
256 | 		}
257 | 	}
258 | 
259 | 	float *h_input;			// input signal
260 | 	float *h_output;			// output plane
261 | 	float *h_filters_padded;	// filters in time-domain padded with zeroes
262 | 	float *h_filters;		    // filters in time-domain
263 | 	
264 | 	if (input_type == 'f') {
265 | 		int error=0;
266 | 		error += Load_signal(input_signal_file, &nTimesamples, &h_input);
267 | 		error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters);
268 | 		if( error>0 ){exit(1);}
269 | 		else if (VERBOSE) printf("File loaded\n");
270 | 	}
271 | 	
272 | 	if (input_type == 'r') {
273 | 		h_input          = (float *)malloc(nTimesamples*sizeof(float));
274 | 		h_filters	     = (float *)malloc(filter_length*nFilters*sizeof(float));
275 | 		srand(time(NULL));
276 | 		Generate_signal(h_input, nTimesamples);
277 | 		Generate_templates(h_filters, filter_length, nFilters);
278 | 		if (VERBOSE) printf("Signal and filters generated\n");
279 | 	}
280 | 	
281 | 	size_t filter_size_padded = nFilters*CONV_SIZE;
282 | 	h_filters_padded = (float*)malloc(filter_size_padded*sizeof(float));
283 | 	Pad_templates(h_filters, h_filters_padded, filter_length, CONV_SIZE, nFilters);
284 | 	
285 | 	//----------------> Results
286 | 	double execution_time = 0;
287 | 	Performance_results CONV_cuFFT;
288 | 	CONV_cuFFT.Assign(nTimesamples, filter_length, nFilters, nRuns, 0, CONV_SIZE, nFilters, "CONV_cuFFT.dat", "cuFFT");
289 | 	
290 | 	int offset           = filter_length/2; // we assume that filter is centered around zero
291 | 	int useful_part_size = CONV_SIZE - filter_length + 1;
292 | 	int nConvolutions    = (nTimesamples + useful_part_size - 1)/useful_part_size;
293 | 	if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);}
294 | 	if(DEBUG) {
295 | 		printf("offset=%d; useful_part_size=%d; nConvolutions=%d;\n", offset, useful_part_size, nConvolutions);
296 | 	}
297 | 	
298 | 	size_t output_size = nFilters*useful_part_size*nConvolutions;
299 | 	h_output = (float*)malloc(output_size*sizeof(float));
300 | 	
301 | 	if (VERBOSE) printf("Convolution - cuFFT\n");
302 | 
303 | 	//----------------> GPU kernel
304 | 	int GPU_error = GPU_CONV(h_input, h_output, h_filters_padded, nTimesamples, filter_length, nFilters, nRuns, &execution_time);
305 | 	CONV_cuFFT.GPU_time = execution_time;
306 | 	if(VERBOSE) printf("     Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time);
307 | 	if(VERBOSE) {cout << "     All parameters: "; CONV_cuFFT.Print();}
308 | 	if(WRITE && GPU_error==0) CONV_cuFFT.Save();
309 | 	//----------------> GPU kernel
310 | 	
311 | 	if(CHECK){
312 | 		double total_error, mean_error;
313 | 		printf("Checking results...\n");
314 | 		Full_CONV_check(h_output, h_input, h_filters, nTimesamples, filter_length, useful_part_size, (filter_length>>1), CONV_SIZE, nConvolutions, nFilters, &total_error, &mean_error);
315 | 		//printf("Total error: %e; Mean error: %e\n", total_error, mean_error);
316 | 	}
317 | 	
318 | 	if (input_type == 'f') {
319 | 		Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file);
320 | 	}
321 | 	
322 | 	free(h_input);
323 | 	free(h_output);
324 | 	free(h_filters_padded);
325 | 	free(h_filters);
326 | 
327 | 	cudaDeviceReset();
328 | 
329 | 	if (VERBOSE) printf("Finished!\n");
330 | 
331 | 	return (0);
332 | }
333 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/Makefile:
--------------------------------------------------------------------------------
 1 | ###############################################################
 2 | # CUDA_HOME are supposed to be on default position
 3 | # and set it in your PATH .bashrc
 4 | ###############################################################
 5 | INC := -I${CUDA_HOME}/include
 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft_static -lculibos -lcuda
 7 | 
 8 | GCC = g++
 9 | NVCC = ${CUDA_HOME}/bin/nvcc
10 | 
11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo
12 | 
13 | GCC_OPTS =-O3 -Wall -Wextra $(INC)
14 | 
15 | ANALYZE = CONV.exe
16 | 
17 | ifdef reglim
18 | NVCCFLAGS += --maxrregcount=$(reglim)
19 | endif
20 | 
21 | all: clean analyze
22 | 
23 | analyze: CONV_R2R.o CONV-32bit_cuFFT.o Makefile
24 | 	$(NVCC) -o $(ANALYZE) CONV-32bit_cuFFT.o CONV_R2R.o $(LIB) $(NVCCFLAGS) 
25 | 
26 | CONV-32bit_cuFFT.o: timer.h utils_cuda.h
27 | 	$(NVCC) -c CONV-32bit_cuFFT.cu $(NVCCFLAGS) -dc -m64
28 | 
29 | CONV_R2R.o: CONV_R2R.cpp
30 | 	$(GCC) -c CONV_R2R.cpp $(GCC_OPTS)
31 | 
32 | clean:	
33 | 	rm -f *.o *.~ $(ANALYZE)
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/When_cuFFT_wins_cuFFT.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for convsize in 1024 2048 4096 8192 16384;
 4 | do
 5 | 	echo "#define CONV_SIZE $convsize" > params.h
 6 | 	
 7 | 	rm CONV.exe
 8 | 	make
 9 | 	for tempsize in {64..2048..32}
10 | 	do
11 | 		for templates in 32;
12 | 		do
13 | 			./CONV.exe r 2097152 $tempsize $templates 20 0
14 | 		done
15 | 	done
16 | done
17 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/benchmark_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs both benchmark sweeps. CONV.exe appends its records to CONV_cuFFT.dat,
# so the file is removed before the first sweep and renamed after each sweep.

# -f: no error message when there is no file left from a previous run
rm -f CONV_cuFFT.dat;

./benchmark_performance.sh;
mv CONV_cuFFT.dat OLS_cuFFT_R2R_perf.dat;

./When_cuFFT_wins_cuFFT.sh
mv CONV_cuFFT.dat OLS_cuFFT_R2R_whencuFFTwins.dat;


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/benchmark_performance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for convsize in 1024 2048 4096 8192 16384;
 4 | do
 5 | 
 6 | echo "#define CONV_SIZE $convsize" > params.h		
 7 | rm CONV.exe
 8 | make
 9 | 	for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073;
10 | 	do	
11 | 		for templates in 2 4 8 11 16 32 51 64 96;
12 | 		do
13 | 			./CONV.exe r 262144 $tempsize $templates 20
14 | 			./CONV.exe r 524288 $tempsize $templates 20
15 | 			./CONV.exe r 1048576 $tempsize $templates 20
16 | 			./CONV.exe r 2097152 $tempsize $templates 20
17 | 			./CONV.exe r 4194304 $tempsize $templates 20
18 | 			./CONV.exe r 8388608 $tempsize $templates 20
19 | 		done
20 | 	done
21 | done
22 | 
23 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/conv_check_R2R.h:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <math.h>
  4 | #include <time.h>
  5 | #include <stdlib.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime.h>
  8 | #include <cuda_runtime_api.h>
  9 | 
// Threshold used by Compare_data: a sample whose get_error() value exceeds it
// is counted (and, for the first 40 occurrences, printed) as an error.
double max_error = 1.0e-4;
 11 | 
 12 | void CPU_time_domain(float *h_input, float *h_CPU_output_timedomain, float *h_filters, int signal_length, int filter_length, int nFilters){
 13 | 	for(int f=0; f<nFilters; f++){
 14 | 		printf(".");  fflush(stdout);
 15 | 		for(int s=0; s<signal_length; s++){
 16 | 			float ac;
 17 | 			ac = 0;
 18 | 			for(int i=0; i<filter_length; i++){
 19 | 				int filter_pos = filter_length - 1 - i;
 20 | 				float fv, sv;
 21 | 				fv = h_filters[f*filter_length + filter_pos];
 22 | 				int signal_pos = (s + i - (filter_length>>1));
 23 | 				if(signal_pos>=0 && signal_pos<signal_length) sv = h_input[signal_pos];
 24 | 				else {sv = 0;}
 25 | 				ac = ac + sv*fv;
 26 | 			}
 27 | 			h_CPU_output_timedomain[f*(signal_length + filter_length - 1) + s] = ac;
 28 | 		}
 29 | 	}
 30 | 	printf("\n");
 31 | }
 32 | 
 33 | float get_error(float A, float B){
 34 | 	float error, div_error=10000, per_error=10000, order=0;
 35 | 	int power;
 36 | 	if(A<0) A = -A;
 37 | 	if(B<0) B = -B;
 38 | 	
 39 | 	if (A>B) {
 40 | 		div_error = A-B;
 41 | 		if(B>10){
 42 | 			power = (int) log10(B);
 43 | 			order = pow(10,power);
 44 | 			div_error = div_error/order;
 45 | 		}
 46 | 	}
 47 | 	else {
 48 | 		div_error = B-A;
 49 | 		if(A>10){
 50 | 			power = (int) log10(A);
 51 | 			order = pow(10,power);
 52 | 			div_error = div_error/order;
 53 | 		}
 54 | 	}
 55 | 	
 56 | 	if(div_error<per_error) error = div_error;
 57 | 	else error = per_error;
 58 | 	return(error);
 59 | }
 60 | 
 61 | int Compare_data(float *CPU_result, float *GPU_result, int dim_x, int dim_y, int signal_length, int useful_part_size, double *total_error, double *mean_error){
 62 | 	double total_error_l = 0, mean_error_l = 0;
 63 | 	size_t nErrors = 0;
 64 | 	int cislo = 0;
 65 | 	float error;
 66 | 	
 67 | 	for(int y=0; y<dim_y; y++){
 68 | 		for(int x=0; x<signal_length; x++){
 69 | 			int pos = y*dim_x + x;
 70 | 			error = get_error(CPU_result[pos], GPU_result[pos]);
 71 | 			total_error_l = total_error_l + error;
 72 | 			if( error > max_error ){
 73 | 				nErrors++;
 74 | 				if(cislo<40){
 75 | 					printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d\n", error, CPU_result[pos], GPU_result[pos], x, y, (int) (x/useful_part_size));
 76 | 					cislo++;
 77 | 				}
 78 | 			}
 79 | 		}
 80 | 	}
 81 | 	mean_error_l = total_error_l/(((double) dim_x)*((double) dim_y));
 82 | 	(*total_error) = total_error_l;
 83 | 	(*mean_error) = mean_error_l;
 84 | 	return(nErrors);
 85 | }
 86 | 
 87 | int Compare_data(float *CPU_result, float *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){
 88 | 	double total_error_l = 0, mean_error_l = 0;
 89 | 	size_t nErrors = 0;
 90 | 	int cislo = 0;
 91 | 	float error;
 92 | 	
 93 | 	for(int y=0; y<dim_y; y++){
 94 | 		for(int x=0; x<nSamples; x++){
 95 | 			int CPU_pos = y*CPU_dim_x + x + CPU_offset;
 96 | 			int GPU_pos = y*GPU_dim_x + x + GPU_offset;
 97 | 			float CPU, GPU;
 98 | 			CPU = CPU_result[CPU_pos]/CPU_scale;
 99 | 			GPU = GPU_result[GPU_pos]/GPU_scale;
100 | 			
101 | 			
102 | 			error = get_error(CPU, GPU);
103 | 			total_error_l = total_error_l + error;
104 | 			if( error > max_error ){
105 | 				nErrors++;
106 | 				if(cislo<40){
107 | 					printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU, GPU, x, y, (int) (x/useful_part_size), x%useful_part_size);
108 | 					cislo++;
109 | 				}
110 | 			}
111 | 		}
112 | 	}
113 | 	mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y));
114 | 	(*total_error) = total_error_l;
115 | 	(*mean_error) = mean_error_l;
116 | 	return(nErrors);
117 | }
118 | 
119 | 
120 | void Full_CONV_check(float *GPU_result, float *h_input_real, float *h_filters, int signal_length, int filter_length, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, double *cumulative_error, double *mean_error){
121 | 	float GPU_scale, CPU_scale;
122 | 	int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples;
123 | 	
124 | 	//----------------------- CPU time-domain
125 | 	size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters;
126 | 	float *h_CPU_output_timedomain;
127 | 	h_CPU_output_timedomain = (float *)malloc(output_size_timedomain*sizeof(float));
128 | 	memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float));
129 | 	
130 | 	printf("\n--> Time-domain convolution:");
131 | 	CPU_time_domain(h_input_real, h_CPU_output_timedomain, h_filters, signal_length, filter_length, nFilters);
132 | 	
133 | 	printf("\n--> Comparison to CPU time-domain:\n");
134 | 	GPU_scale = conv_length/2;
135 | 	CPU_scale = 1.0;
136 | 	GPU_offset = 0;
137 | 	CPU_offset = 0;
138 | 	GPU_dim_x = nConvolutions*useful_part_size;
139 | 	CPU_dim_x = (signal_length + filter_length - 1);
140 | 	nSamples = signal_length - offset;	
141 | 	Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
142 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
143 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
144 | 	else printf("FAILED\n");
145 | 	
146 | 	
147 | 	free(h_CPU_output_timedomain);
148 | 	//-------------------------------------------------<	
149 | }
150 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/debug.h:
--------------------------------------------------------------------------------
// Compile-time switches read by CONV_R2R.cpp.
#define VERBOSE true   // print progress messages, the execution time and the parameter record
#define DEBUG false    // print the parsed parameters and the overlap-and-save segmentation
#define WRITE true     // append the performance record to the results file after a successful GPU run
#define CHECK false    // run Full_CONV_check (CPU reference comparison) after the GPU run

#define DEVICEID 0     // presumably the index of the CUDA device to use - TODO confirm in the .cu file
7 | 
8 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/params.h:
--------------------------------------------------------------------------------
// Length of one overlap-and-save segment: the filters are padded to this many
// samples and the filter length must be smaller than it.
// NOTE: this file is overwritten by the benchmark scripts.
#define CONV_SIZE 8192
2 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/results.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <iomanip>
 4 | #include <vector>
 5 | 
 6 | using namespace std;
 7 | 
 8 | class Performance_results{
 9 | public:
10 | 	double GPU_time;
11 | 	int nTimesamples;
12 | 	int template_length;
13 | 	int nTemplates;
14 | 	int nRuns;
15 | 	int reglim;
16 | 	int OaS_conv_size;
17 | 	int templates_per_block;
18 | 	char filename[200];
19 | 	char kernel[10];
20 | 	
21 | 	Performance_results() {
22 | 		GPU_time=0;
23 | 	}
24 | 	
25 | 	void Save(){
26 | 		ofstream FILEOUT;
27 | 		FILEOUT.open (filename, std::ofstream::out | std::ofstream::app);
28 | 		FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
29 | 		FILEOUT.close();
30 | 	}
31 | 	
32 | 	void Print(){
33 | 		cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
34 | 	}
35 | 	
36 | 	void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){
37 | 		nTimesamples        = t_nTimesamples;
38 | 		template_length     = t_template_length;
39 | 		nTemplates          = t_nTemplates;
40 | 		nRuns               = t_nRuns;
41 | 		reglim              = t_reglim;
42 | 		OaS_conv_size       = t_OaS_conv_size;
43 | 		templates_per_block = t_templates_per_block;
44 | 		sprintf(filename,"%s", t_filename);
45 | 		sprintf(kernel,"%s",t_kernel);
46 | 	}
47 | 	
48 | };
49 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks/utils_cuda.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/CONV_R2R.cpp:
--------------------------------------------------------------------------------
  1 | //********************************************************************************************
  2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 
  3 | //* Copyright (C) 2019  Adámek Karel
  4 | //* 
  5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 
  6 | //********************************************************************************************
  7 | 
  8 | #include "debug.h"
  9 | #include "params.h"
 10 | #include "results.h"
 11 | 
 12 | #include <stdio.h>
 13 | #include <string.h>
 14 | #include <math.h>
 15 | #include <time.h>
 16 | #include <stdlib.h>
 17 | #include <cuda.h>
 18 | #include <cuda_runtime.h>
 19 | #include <cuda_runtime_api.h>
 20 | 
 21 | #include "conv_check_R2R.h"
 22 | 
 23 | 
 24 | void Generate_signal(float *h_input, int signal_length){
 25 | 	for(int f=0; f<signal_length; f++){
 26 | 		h_input[f] = rand() / (float)RAND_MAX;
 27 | 	}
 28 | }
 29 | 
 30 | void Generate_templates(float *h_filters, int nSamples, int nFilters){
 31 | 	for(int t=0; t<nFilters; t++){
 32 | 		for(int f=0; f<nSamples; f++){
 33 | 			h_filters[t*nSamples + f] = rand() / (float)RAND_MAX;
 34 | 		}
 35 | 	}
 36 | }
 37 | 
 38 | void Pad_templates(float *h_filters_time, float *h_filters, int template_size, int convolution_size, int nFilters){
 39 | 	for(int f=0; f<nFilters*convolution_size; f++){
 40 | 		h_filters[f] = 0;
 41 | 	}
 42 | 	
 43 | 	for(int t=0; t<nFilters; t++){
 44 | 		for(int f=0; f<template_size; f++){
 45 | 			// padding for centered filter
 46 | 			if(f>=template_size/2) {
 47 | 				h_filters[t*convolution_size + f - template_size/2] = h_filters_time[t*template_size + f];
 48 | 			}
 49 | 			else if(f<template_size/2) {
 50 | 				h_filters[t*convolution_size + f + convolution_size - template_size/2] = h_filters_time[t*template_size + f];
 51 | 			}
 52 | 		}
 53 | 	}
 54 | }
 55 | 
 56 | 
 57 | int Write_output(float *h_output, int signal_length, int nFilters, char *output_signal_file){
 58 | 	int error=0;
 59 | 	ofstream FILEOUT;
 60 | 	FILEOUT.open(output_signal_file);
 61 | 	if (!FILEOUT.fail()){
 62 | 		if (VERBOSE) printf("Writing output\n");
 63 | 		for(int f=0; f<nFilters; f++){
 64 | 			if (VERBOSE) printf("[");
 65 | 			for(int Ts=0;Ts<signal_length;Ts++){
 66 | 				if(Ts%100000==0) {
 67 | 					if (VERBOSE) {
 68 | 						printf(".");
 69 | 						fflush(stdout);
 70 | 					}
 71 | 				}
 72 | 				FILEOUT << f << " " << Ts << " " << h_output[f*signal_length+Ts] << endl;
 73 | 			}
 74 | 			FILEOUT << endl;
 75 | 			if (VERBOSE) printf("] filter=%d\n",f);
 76 | 		}
 77 | 	}
 78 | 	else {
 79 | 		cout << "Write to a file failed!" << endl;
 80 | 		error++;
 81 | 	}
 82 | 	FILEOUT.close();
 83 | 	return(error);
 84 | }
 85 | 
 86 | 
 87 | long int File_size_row_signal(ifstream &FILEIN){
 88 | 	std::size_t count=0;
 89 | 	FILEIN.seekg(0,ios::beg);
 90 | 	for(std::string line; std::getline(FILEIN, line); ++count){}
 91 | 	return((long int)count);
 92 | }
 93 | 
 94 | 
 95 | int Load_signal(char *filename, int *nSamples, float **data){
 96 | 	float real, imaginary;
 97 | 	int file_size, cislo, error;
 98 | 	error=0;
 99 | 
100 | 	ifstream FILEIN;
101 | 	FILEIN.open(filename,ios::in);
102 | 	if (!FILEIN.fail()){
103 | 		error=0;
104 | 		file_size=File_size_row_signal(FILEIN);
105 | 		(*nSamples) = file_size;
106 | 		printf("nSamples:%d;\n", (*nSamples) );
107 | 
108 | 		if(file_size>0){
109 | 			*data = (float*)malloc(file_size*sizeof(float));
110 | 			memset( (*data), 0.0, file_size*sizeof(float));
111 | 			if(*data==NULL){
112 | 				printf("\nAllocation error!\n");
113 | 				error++;
114 | 			}
115 | 		
116 | 			FILEIN.clear();
117 | 			FILEIN.seekg(0,ios::beg);
118 | 			
119 | 			for (cislo = 0; cislo < file_size; cislo++) {
120 | 				FILEIN >> real >> imaginary;
121 | 				(*data)[cislo] = sqrt(real*real + imaginary*imaginary);
122 | 			}
123 | 		}
124 | 		else {
125 | 			printf("\nFile is void of any content!\n");
126 | 			error++;
127 | 		}
128 | 	}
129 | 	else {
130 | 		cout << "File not found -> " << filename << " <-" << endl;
131 | 		error++;
132 | 	}
133 | 	FILEIN.close();
134 | 	return(error);
135 | }
136 | 
137 | 
138 | int Load_filters(char *filename, int *nFilters, int *filter_length, float **data){
139 | 	float real, imaginary;
140 | 	int file_size, cislo, error, filter_size;
141 | 	error=0;
142 | 
143 | 	ifstream FILEIN;
144 | 	FILEIN.open(filename,ios::in);
145 | 	if (!FILEIN.fail()){
146 | 		error=0;
147 | 		file_size = File_size_row_signal(FILEIN);
148 | 		(*filter_length) = file_size/(*nFilters);
149 | 		filter_size = (*nFilters)*(*filter_length);
150 | 		printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size);
151 | 
152 | 		if(file_size>0){
153 | 			*data = (float*)malloc( filter_size*sizeof(float));
154 | 			memset( (*data), 0.0, filter_size*sizeof(float));
155 | 			
156 | 			if(*data==NULL){
157 | 				printf("\nAllocation error!\n");
158 | 				error++;
159 | 			}
160 | 		
161 | 			FILEIN.clear();
162 | 			FILEIN.seekg(0,ios::beg);
163 | 
164 | 			for (cislo=0; cislo < filter_size; cislo++) {
165 | 				FILEIN >> real >> imaginary;
166 | 				(*data)[cislo] = real;
167 | 			}
168 | 		}
169 | 		else {
170 | 			printf("\nFile is void of any content!\n");
171 | 			error++;
172 | 		}
173 | 	}
174 | 	else {
175 | 		cout << "File not found -> " << filename << " <-" << endl;
176 | 		error++;
177 | 	}
178 | 	FILEIN.close();
179 | 	return(error);
180 | }
181 | 
182 | 
// Forward declaration of the GPU convolution entry point; the definition is not in this file
// (presumably in CONV-32bit_cuFFT.cu, which the Makefile links with this object -- confirm).
// main() passes the zero-padded filters as h_filters_timedom, treats a return value of 0 as
// success, and reports *execution_time as the GPU time in milliseconds.
int GPU_CONV(float *h_input, float *h_output, float *h_filters_timedom, int signal_length, int filter_length, int nFilters, int nRuns, float h, double *execution_time);
184 | 
185 | 
186 | int main(int argc, char* argv[]) {
187 | 	int nTimesamples;		// input signal length
188 | 	int filter_length;		// filter length
189 | 	int nFilters;			// number of filters
190 | 	int nRuns;
191 | 	char input_type='0';
192 | 	char input_filter_file[255];
193 | 	char input_signal_file[255];
194 | 	char output_signal_file[255];
195 | 	
196 | 	char * pEnd;
197 | 	if (argc>2) {
198 | 		if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);}
199 | 		input_type=*argv[1];
200 | 	}
201 | 	if (input_type == 'f' && argc==6) {
202 | 		if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);}
203 | 		sprintf(input_signal_file,"%s",argv[2]);
204 | 		if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);}
205 | 		sprintf(input_filter_file,"%s",argv[3]);
206 | 		if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);}
207 | 		sprintf(output_signal_file,"%s",argv[4]);
208 | 		nFilters = strtol(argv[5],&pEnd,10);
209 | 		nRuns = 1;
210 | 	}
211 | 	else if (input_type == 'r' && argc==6) {	
212 | 		nTimesamples  = strtol(argv[2],&pEnd,10);
213 | 		filter_length = strtol(argv[3],&pEnd,10);
214 | 		nFilters      = strtol(argv[4],&pEnd,10);
215 | 		nRuns         = strtol(argv[5],&pEnd,10);
216 | 	}
217 | 	else {
218 | 		printf("Parameters error!\n");
219 | 		printf(" 1) Input type: 'r' or 'f' \n");
220 | 		printf("----------------------------------\n");
221 | 		printf("'f' - file input provided by user\n");
222 | 		printf(" 2) Input signal file\n");
223 | 		printf(" 3) Input filter file\n");
224 | 		printf(" 4) Output signal file\n");
225 | 		printf(" 5) number of filters\n");
226 | 		printf(" Example: CONV.exe f signal.dat filter.dat output.dat 32\n");
227 | 		printf("----------------------------------\n");
228 | 		printf(" 'r' - random input generated by the code\n");
229 | 		printf(" 2) Signal length in number of time samples\n");
230 | 		printf(" 3) Filter length in samples\n");
231 | 		printf(" 4) Number of templates\n");
232 | 		printf(" 5) number of GPU kernel runs\n");
233 | 		printf(" Example: CONV.exe r 2097152 193 32 10\n");
234 |         return 1;
235 | 	}
236 | 	
237 | 	if (DEBUG) {
238 | 		printf("Parameters:\n");
239 | 		printf("Input signal and templates are ");
240 | 		if (input_type == 'r') {
241 | 			printf("randomly generated.\n");
242 | 			printf("Signal length:     %d samples\n", nTimesamples);
243 | 			printf("Filter length:     %d samples\n", filter_length);
244 | 			printf("Number of filters: %d\n", nFilters);
245 | 			printf("nRuns:             %d\n", nRuns);
246 | 		}
247 | 		if (input_type == 'f') {
248 | 			printf("read from file.\n");
249 | 			printf("Input signal:  %s\n", input_signal_file);
250 | 			printf("Input filter:  %s\n", input_filter_file);
251 | 			printf("Output signal: %s\n", output_signal_file);
252 | 			printf("nFilters:      %d\n", nFilters);
253 | 			printf("nRuns:         %d\n", nRuns);
254 | 			printf("-----------------\n");
255 | 		}
256 | 	}
257 | 
258 | 	float *h_input;			// input signal
259 | 	float *h_output;			// output plane
260 | 	float *h_filters_padded;	// filters in time-domain padded with zeroes
261 | 	float *h_filters;		    // filters in time-domain
262 | 	
263 | 	if (input_type == 'f') {
264 | 		int error=0;
265 | 		error += Load_signal(input_signal_file, &nTimesamples, &h_input);
266 | 		error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters);
267 | 		if( error>0 ){exit(1);}
268 | 		else if (VERBOSE) printf("File loaded\n");
269 | 	}
270 | 	
271 | 	if (input_type == 'r') {
272 | 		h_input          = (float *)malloc(nTimesamples*sizeof(float));
273 | 		h_filters	     = (float *)malloc(filter_length*nFilters*sizeof(float));
274 | 		srand(time(NULL));
275 | 		Generate_signal(h_input, nTimesamples);
276 | 		Generate_templates(h_filters, filter_length, nFilters);
277 | 		if (VERBOSE) printf("Signal and filters generated\n");
278 | 	}
279 | 	
280 | 	size_t filter_size_padded = nFilters*CONV_SIZE;
281 | 	h_filters_padded = (float*)malloc(filter_size_padded*sizeof(float));
282 | 	Pad_templates(h_filters, h_filters_padded, filter_length, CONV_SIZE, nFilters);
283 | 	
284 | 	//----------------> Results
285 | 	double execution_time = 0;
286 | 	Performance_results CONV_cuFFT;
287 | 	CONV_cuFFT.Assign(nTimesamples, filter_length, nFilters, nRuns, 0, CONV_SIZE, nFilters, "CONV_cuFFT.dat", "cuFFT");
288 | 	
289 | 	int offset           = filter_length/2; // we assume that filter is centered around zero
290 | 	int useful_part_size = CONV_SIZE - filter_length + 1;
291 | 	int nConvolutions    = (nTimesamples + useful_part_size - 1)/useful_part_size;
292 | 
293 | 	if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);}
294 | 	if(DEBUG) {
295 | 		printf("offset=%d; useful_part_size=%d; nConvolutions=%d;\n", offset, useful_part_size, nConvolutions);
296 | 	}
297 | 	
298 | 	size_t output_size = nFilters*useful_part_size*nConvolutions;
299 | 	h_output = (float*)malloc(output_size*sizeof(float));
300 | 	
301 | 	if (VERBOSE) printf("Convolution - cuFFT\n");
302 | 
303 | 	//----------------> GPU kernel
304 | 	float h = 20.0;
305 | 	int GPU_error = GPU_CONV(h_input, h_output, h_filters_padded, nTimesamples, filter_length, nFilters, nRuns, h, &execution_time);
306 | 	CONV_cuFFT.GPU_time = execution_time;
307 | 	if(VERBOSE) printf("     Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time);
308 | 	if(VERBOSE) {cout << "     All parameters: "; CONV_cuFFT.Print();}
309 | 	if(WRITE && GPU_error==0) CONV_cuFFT.Save();
310 | 	//----------------> GPU kernel
311 | 	
312 | 	if(CHECK){
313 | 		double total_error, mean_error;
314 | 		printf("Checking results...\n");
315 | 		Full_CONV_check(h_output, h_input, h_filters,  nTimesamples, filter_length, useful_part_size, (filter_length>>1), CONV_SIZE, nConvolutions, nFilters, h, &total_error, &mean_error);
316 | 		//printf("Total error: %e; Mean error: %e\n", total_error, mean_error);
317 | 	}
318 | 	
319 | 	if (input_type == 'f') {
320 | 		Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file);
321 | 	}
322 | 	
323 | 	free(h_input);
324 | 	free(h_output);
325 | 	free(h_filters_padded);
326 | 	free(h_filters);
327 | 
328 | 	cudaDeviceReset();
329 | 
330 | 	if (VERBOSE) printf("Finished!\n");
331 | 
332 | 	return (0);
333 | }
334 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/Makefile:
--------------------------------------------------------------------------------
 1 | ###############################################################
 2 | # CUDA_HOME are supposed to be on default position
 3 | # and set it in your PATH .bashrc
 4 | ###############################################################
 5 | INC := -I${CUDA_HOME}/include
 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft_static -lculibos -lcuda
 7 | 
 8 | GCC = g++
 9 | NVCC = ${CUDA_HOME}/bin/nvcc
10 | 
11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo
12 | 
13 | GCC_OPTS =-O3 -Wall -Wextra $(INC)
14 | 
15 | ANALYZE = CONV.exe
16 | 
17 | ifdef reglim
18 | NVCCFLAGS += --maxrregcount=$(reglim)
19 | endif
20 | 
21 | all: clean analyze
22 | 
23 | analyze: CONV_R2R.o CONV-32bit_cuFFT.o Makefile
24 | 	$(NVCC) -o $(ANALYZE) CONV-32bit_cuFFT.o CONV_R2R.o $(LIB) $(NVCCFLAGS) 
25 | 
26 | CONV-32bit_cuFFT.o: timer.h utils_cuda.h
27 | 	$(NVCC) -c CONV-32bit_cuFFT.cu $(NVCCFLAGS) -dc -m64
28 | 
29 | CONV_R2R.o: CONV_R2R.cpp
30 | 	$(GCC) -c CONV_R2R.cpp $(GCC_OPTS)
31 | 
32 | clean:	
33 | 	rm -f *.o *.~ $(ANALYZE)
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/When_cuFFT_wins_cuFFT.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for convsize in 1024 2048 4096 8192 16384;
 4 | do
 5 | 	echo "#define CONV_SIZE $convsize" > params.h
 6 | 	
 7 | 	rm CONV.exe
 8 | 	make
 9 | 	for tempsize in {64..4096..32}
10 | 	do
11 | 		for templates in 32;
12 | 		do
13 | 			./CONV.exe r 2097152 $tempsize $templates 20 0
14 | 		done
15 | 	done
16 | done
17 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/benchmark_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs both benchmark sweeps. CONV.exe appends its timings to CONV_cuFFT.dat,
# so the file is removed first and renamed after each sweep.

# -f: no error on the first run, when there is no old results file
rm -f CONV_cuFFT.dat;

./benchmark_performance.sh;
mv CONV_cuFFT.dat OLS_cuFFT_R2R_pp_perf.dat;

./When_cuFFT_wins_cuFFT.sh
mv CONV_cuFFT.dat OLS_cuFFT_R2R_pp_whencuFFTwins.dat;


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/benchmark_performance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for convsize in 1024 2048 4096 8192 16384;
 4 | do
 5 | 
 6 | echo "#define CONV_SIZE $convsize" > params.h		
 7 | rm CONV.exe
 8 | make
 9 | 	for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073;
10 | 	do	
11 | 		for templates in 2 4 8 11 16 32 51 64 96;
12 | 		do
13 | 			./CONV.exe r 262144 $tempsize $templates 20
14 | 			./CONV.exe r 524288 $tempsize $templates 20
15 | 			./CONV.exe r 1048576 $tempsize $templates 20
16 | 			./CONV.exe r 2097152 $tempsize $templates 20
17 | 			./CONV.exe r 4194304 $tempsize $templates 20
18 | 			./CONV.exe r 8388608 $tempsize $templates 20
19 | 		done
20 | 	done
21 | done
22 | 
23 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/conv_check_R2R.h:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <math.h>
  4 | #include <time.h>
  5 | #include <stdlib.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime.h>
  8 | #include <cuda_runtime_api.h>
  9 | 
// Per-sample threshold used by both Compare_data overloads: a sample whose
// get_error() value exceeds it is counted (and, for the first 40, printed) as an error.
double max_error = 1.0e-4;
 11 | 
 12 | void CPU_time_domain(float *h_input, float *h_CPU_output_timedomain, float *h_filters, int signal_length, int filter_length, int nFilters){
 13 | 	for(int f=0; f<nFilters; f++){
 14 | 		printf(".");  fflush(stdout);
 15 | 		for(int s=0; s<signal_length; s++){
 16 | 			float ac;
 17 | 			ac = 0;
 18 | 			for(int i=0; i<filter_length; i++){
 19 | 				int filter_pos = filter_length - 1 - i;
 20 | 				float fv, sv;
 21 | 				fv = h_filters[f*filter_length + filter_pos];
 22 | 				int signal_pos = (s + i - (filter_length>>1));
 23 | 				if(signal_pos>=0 && signal_pos<signal_length) sv = h_input[signal_pos];
 24 | 				else {sv = 0;}
 25 | 				ac = ac + sv*fv;
 26 | 			}
 27 | 			h_CPU_output_timedomain[f*(signal_length + filter_length - 1) + s] = ac;
 28 | 		}
 29 | 	}
 30 | 	printf("\n");
 31 | }
 32 | 
 33 | 
 34 | void CPU_postprocess(float *h_CPU_postprocessed, float *h_CPU_output_reduced, int nTimesamples, int nFilters, float h){
 35 | 	float left, right, result;
 36 | 	
 37 | 	for(int f=0; f<nFilters; f++){
 38 | 		for(int s=0; s<nTimesamples; s++){
 39 | 			int pos = f*nTimesamples + s;
 40 | 			if( s==0 ) {
 41 | 				left = h_CPU_output_reduced[pos];
 42 | 			}
 43 | 			else {
 44 | 				left = h_CPU_output_reduced[pos-1];
 45 | 			}
 46 | 			
 47 | 			if( s==(nTimesamples-1) ) {
 48 | 				right = h_CPU_output_reduced[pos];
 49 | 			}
 50 | 			else {
 51 | 				right = h_CPU_output_reduced[pos+1];
 52 | 			}
 53 | 			
 54 | 			result = (left - right)/(2.0*h);
 55 | 			h_CPU_postprocessed[pos] = result;
 56 | 		}
 57 | 	}
 58 | }
 59 | 
 60 | 
 61 | float get_error(float A, float B){
 62 | 	float error, div_error=10000, per_error=10000, order=0;
 63 | 	int power;
 64 | 	if(A<0) A = -A;
 65 | 	if(B<0) B = -B;
 66 | 	
 67 | 	if (A>B) {
 68 | 		div_error = A-B;
 69 | 		if(B>10){
 70 | 			power = (int) log10(B);
 71 | 			order = pow(10,power);
 72 | 			div_error = div_error/order;
 73 | 		}
 74 | 	}
 75 | 	else {
 76 | 		div_error = B-A;
 77 | 		if(A>10){
 78 | 			power = (int) log10(A);
 79 | 			order = pow(10,power);
 80 | 			div_error = div_error/order;
 81 | 		}
 82 | 	}
 83 | 	
 84 | 	if(div_error<per_error) error = div_error;
 85 | 	else error = per_error;
 86 | 	return(error);
 87 | }
 88 | 
 89 | int Compare_data(float *CPU_result, float *GPU_result, int dim_x, int dim_y, int signal_length, int useful_part_size, double *total_error, double *mean_error){
 90 | 	double total_error_l = 0, mean_error_l = 0;
 91 | 	size_t nErrors = 0;
 92 | 	int cislo = 0;
 93 | 	float error;
 94 | 	
 95 | 	for(int y=0; y<dim_y; y++){
 96 | 		for(int x=0; x<signal_length; x++){
 97 | 			int pos = y*dim_x + x;
 98 | 			error = get_error(CPU_result[pos], GPU_result[pos]);
 99 | 			total_error_l = total_error_l + error;
100 | 			if( error > max_error ){
101 | 				nErrors++;
102 | 				if(cislo<40){
103 | 					printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; ratio=%f;\n", error, CPU_result[pos], GPU_result[pos], x, y, (int) (x/useful_part_size), CPU_result[pos]/GPU_result[pos]);
104 | 					cislo++;
105 | 				}
106 | 			}
107 | 		}
108 | 	}
109 | 	mean_error_l = total_error_l/(((double) dim_x)*((double) dim_y));
110 | 	(*total_error) = total_error_l;
111 | 	(*mean_error) = mean_error_l;
112 | 	return(nErrors);
113 | }
114 | 
115 | 
116 | int Compare_data(float *CPU_result, float *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){
117 | 	double total_error_l = 0, mean_error_l = 0;
118 | 	size_t nErrors = 0;
119 | 	int cislo = 0;
120 | 	float error;
121 | 	
122 | 	for(int y=0; y<dim_y; y++){
123 | 		for(int x=0; x<nSamples; x++){
124 | 			int CPU_pos = y*CPU_dim_x + x + CPU_offset;
125 | 			int GPU_pos = y*GPU_dim_x + x + GPU_offset;
126 | 			float CPU, GPU;
127 | 			CPU = CPU_result[CPU_pos]/CPU_scale;
128 | 			GPU = GPU_result[GPU_pos]/GPU_scale;
129 | 			
130 | 			
131 | 			error = get_error(CPU, GPU);
132 | 			total_error_l = total_error_l + error;
133 | 			if( error > max_error ){
134 | 				nErrors++;
135 | 				if(cislo<40){
136 | 					printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU, GPU, x, y, (int) (x/useful_part_size), x%useful_part_size);
137 | 					cislo++;
138 | 				}
139 | 			}
140 | 		}
141 | 	}
142 | 	mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y));
143 | 	(*total_error) = total_error_l;
144 | 	(*mean_error) = mean_error_l;
145 | 	return(nErrors);
146 | }
147 | 
148 | 
149 | 
150 | void Full_CONV_check(float *GPU_result, float *h_input_real, float *h_filters, int signal_length, int filter_length, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, float h, double *cumulative_error, double *mean_error){
151 | 	
152 | 	size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters;
153 | 	float *h_CPU_output_timedomain;
154 | 	float *h_CPU_postprocessed;
155 | 	h_CPU_output_timedomain = (float *)malloc(output_size_timedomain*sizeof(float));
156 | 	h_CPU_postprocessed = (float *)malloc(output_size_timedomain*sizeof(float));
157 | 	memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float));
158 | 	memset(h_CPU_postprocessed, 0.0, output_size_timedomain*sizeof(float));
159 | 	
160 | 	printf("\n--> Time-domain convolution:");
161 | 	CPU_time_domain(h_input_real, h_CPU_output_timedomain, h_filters, signal_length, filter_length, nFilters);
162 | 	CPU_postprocess(h_CPU_postprocessed, h_CPU_output_timedomain, (signal_length + filter_length - 1), nFilters, h);
163 | 	
164 | 	float GPU_scale, CPU_scale;
165 | 	int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples;
166 | 	#ifdef POST_PROCESS
167 | 	
168 | 	printf("\n--> Comparison to CPU time-domain with post-processing:\n");
169 | 	GPU_scale = conv_length/2;
170 | 	CPU_scale = 1.0;
171 | 	GPU_offset = 0;
172 | 	CPU_offset = 0;
173 | 	GPU_dim_x = nConvolutions*useful_part_size;
174 | 	CPU_dim_x = (signal_length + filter_length - 1);
175 | 	nSamples = signal_length - offset;	
176 | 	Compare_data(h_CPU_postprocessed, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
177 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
178 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
179 | 	else printf("FAILED\n");
180 | 	
181 | 	#else
182 | 	
183 | 	printf("\n--> Comparison to CPU time-domain:\n");
184 | 	GPU_scale = conv_length/2;
185 | 	CPU_scale = 1.0;
186 | 	GPU_offset = 0;
187 | 	CPU_offset = 0;
188 | 	GPU_dim_x = nConvolutions*useful_part_size;
189 | 	CPU_dim_x = (signal_length + filter_length - 1);
190 | 	nSamples = signal_length-offset;
191 | 	Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
192 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
193 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
194 | 	else printf("FAILED\n");
195 | 	
196 | 	#endif
197 | 	
198 | 	free(h_CPU_postprocessed);
199 | 	free(h_CPU_output_timedomain);
200 | }
201 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/debug.h:
--------------------------------------------------------------------------------
// Compile-time switches of the convolution benchmark.

// VERBOSE: print progress messages, the execution time and the result record.
#define VERBOSE true
// DEBUG: additionally print the parsed parameters and the derived segment sizes.
#define DEBUG false
// WRITE: append the performance record to the results file after a successful GPU run.
#define WRITE true
// CHECK: verify the GPU output against the CPU time-domain reference (Full_CONV_check).
#define CHECK false

// DEVICEID: not referenced by the host code shown here; presumably the CUDA device
// index selected by the GPU code -- TODO confirm.
#define DEVICEID 0
// POST_PROCESS: when defined, Full_CONV_check compares against the post-processed CPU reference.
#define POST_PROCESS
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/params.h:
--------------------------------------------------------------------------------
// Length of one overlap-and-save segment: filters are zero-padded to CONV_SIZE and each
// segment yields CONV_SIZE - filter_length + 1 useful samples.
// NOTE: the benchmark scripts overwrite this whole file before every rebuild.
#define CONV_SIZE 8192
2 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/results.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <iomanip>
 4 | #include <vector>
 5 | 
 6 | using namespace std;
 7 | 
 8 | class Performance_results{
 9 | public:
10 | 	double GPU_time;
11 | 	int nTimesamples;
12 | 	int template_length;
13 | 	int nTemplates;
14 | 	int nRuns;
15 | 	int reglim;
16 | 	int OaS_conv_size;
17 | 	int templates_per_block;
18 | 	char filename[200];
19 | 	char kernel[10];
20 | 	
21 | 	Performance_results() {
22 | 		GPU_time=0;
23 | 	}
24 | 	
25 | 	void Save(){
26 | 		ofstream FILEOUT;
27 | 		FILEOUT.open (filename, std::ofstream::out | std::ofstream::app);
28 | 		FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
29 | 		FILEOUT.close();
30 | 	}
31 | 	
32 | 	void Print(){
33 | 		cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
34 | 	}
35 | 	
36 | 	void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){
37 | 		nTimesamples        = t_nTimesamples;
38 | 		template_length     = t_template_length;
39 | 		nTemplates          = t_nTemplates;
40 | 		nRuns               = t_nRuns;
41 | 		reglim              = t_reglim;
42 | 		OaS_conv_size       = t_OaS_conv_size;
43 | 		templates_per_block = t_templates_per_block;
44 | 		sprintf(filename,"%s", t_filename);
45 | 		sprintf(kernel,"%s",t_kernel);
46 | 	}
47 | 	
48 | };
49 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_cuFFT_callbacks_pp/utils_cuda.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/CONV_SM_OLS_R2R.cpp:
--------------------------------------------------------------------------------
  1 | //********************************************************************************************
  2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 
  3 | //* Copyright (C) 2019  Adámek Karel
  4 | //* 
  5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 
  6 | //********************************************************************************************
  7 | 
  8 | #include "debug.h"
  9 | #include "params.h"
 10 | #include "results.h"
 11 | 
 12 | #include <stdio.h>
 13 | #include <string.h>
 14 | #include <math.h>
 15 | #include <time.h>
 16 | #include <stdlib.h>
 17 | #include <cuda.h>
 18 | #include <cuda_runtime.h>
 19 | #include <cuda_runtime_api.h>
 20 | 
 21 | #include "conv_check_R2R.h"
 22 | 
 23 | void Generate_signal(float *h_input, int signal_length){
 24 | 	for(int f=0; f<signal_length; f++){
 25 | 		h_input[f] = rand() / (float)RAND_MAX;
 26 | 	}
 27 | }
 28 | 
 29 | void Generate_random_filter(float *h_filters, int nSamples, int nFilters){
 30 | 	for(int t=0; t<nFilters; t++){
 31 | 		for(int f=0; f<nSamples; f++){
 32 | 			h_filters[t*nSamples + f] = rand() / (float)RAND_MAX;
 33 | 		}
 34 | 	}
 35 | }
 36 | 
 37 | void Pad_templates(float *h_filters_time, float *h_filters_padded, int filter_length, int corrected_filter_length, int convolution_size, int nFilters){
 38 | 	float *tmp_filter;
 39 | 	tmp_filter = new float[corrected_filter_length];
 40 | 	for(int f=0; f<nFilters*convolution_size; f++){
 41 | 		h_filters_padded[f] = 0;
 42 | 	}
 43 | 	
 44 | 	for(int t=0; t<nFilters; t++){
 45 | 		// copy to temporary filter
 46 | 		if(filter_length!=corrected_filter_length) {
 47 | 			tmp_filter[0] = 0;
 48 | 			for(int f=0; f<filter_length; f++){
 49 | 				tmp_filter[f + 1] = h_filters_time[t*filter_length + f];
 50 | 			}
 51 | 		}
 52 | 		else {
 53 | 			for(int f=0; f<filter_length; f++){
 54 | 				tmp_filter[f] = h_filters_time[t*filter_length + f];
 55 | 			}
 56 | 		}
 57 | 		
 58 | 		for(int f=0; f<corrected_filter_length; f++){
 59 | 			// padding for centered filter
 60 | 			if(f>=corrected_filter_length/2) {
 61 | 				h_filters_padded[t*convolution_size + f - corrected_filter_length/2] = tmp_filter[f];
 62 | 			}
 63 | 			else if(f<corrected_filter_length/2) {
 64 | 				h_filters_padded[t*convolution_size + f + convolution_size - corrected_filter_length/2] = tmp_filter[f];
 65 | 			}
 66 | 		}
 67 | 	}
 68 | }
 69 | 
 70 | 
 71 | int Write_output(float *h_output, int signal_length, int nFilters, char *output_signal_file){
 72 | 	int error=0;
 73 | 	ofstream FILEOUT;
 74 | 	FILEOUT.open(output_signal_file);
 75 | 	if (!FILEOUT.fail()){
 76 | 		if (VERBOSE) printf("Writing output\n");
 77 | 		for(int f=0; f<nFilters; f++){
 78 | 			if (VERBOSE) printf("[");
 79 | 			for(int Ts=0;Ts<signal_length;Ts++){
 80 | 				if(Ts%100000==0) {
 81 | 					if (VERBOSE) {
 82 | 						printf(".");
 83 | 						fflush(stdout);
 84 | 					}
 85 | 				}
 86 | 				FILEOUT << f << " " << Ts << " " << h_output[f*signal_length+Ts] << endl;
 87 | 			}
 88 | 			FILEOUT << endl;
 89 | 			if (VERBOSE) printf("] filter=%d\n",f);
 90 | 		}
 91 | 	}
 92 | 	else {
 93 | 		cout << "Write to a file failed!" << endl;
 94 | 		error++;
 95 | 	}
 96 | 	FILEOUT.close();
 97 | 	return(error);
 98 | }
 99 | 
100 | 
// Counts the text lines of FILEIN, starting from the beginning of the stream.
// The stream is left at end-of-file; callers clear() and rewind it before reading.
long int File_size_row_signal(ifstream &FILEIN){
	FILEIN.seekg(0,ios::beg);

	std::size_t nRows = 0;
	std::string row;
	while(std::getline(FILEIN, row)){
		nRows++;
	}
	return((long int)nRows);
}
107 | 
108 | 
109 | int Load_signal(char *filename, int *nSamples, float **data){
110 | 	float real, imaginary;
111 | 	int file_size, cislo, error;
112 | 	error=0;
113 | 
114 | 	ifstream FILEIN;
115 | 	FILEIN.open(filename,ios::in);
116 | 	if (!FILEIN.fail()){
117 | 		error=0;
118 | 		file_size=File_size_row_signal(FILEIN);
119 | 		(*nSamples) = file_size;
120 | 		printf("nSamples:%d;\n", (*nSamples) );
121 | 
122 | 		if(file_size>0){
123 | 			*data = (float*)malloc(file_size*sizeof(float));
124 | 			memset( (*data), 0.0, file_size*sizeof(float));
125 | 			if(*data==NULL){
126 | 				printf("\nAllocation error!\n");
127 | 				error++;
128 | 			}
129 | 		
130 | 			FILEIN.clear();
131 | 			FILEIN.seekg(0,ios::beg);
132 | 			
133 | 			for (cislo = 0; cislo < file_size; cislo++) {
134 | 				FILEIN >> real >> imaginary;
135 | 				(*data)[cislo] = sqrt(real*real + imaginary*imaginary);
136 | 			}
137 | 		}
138 | 		else {
139 | 			printf("\nFile is void of any content!\n");
140 | 			error++;
141 | 		}
142 | 	}
143 | 	else {
144 | 		cout << "File not found -> " << filename << " <-" << endl;
145 | 		error++;
146 | 	}
147 | 	FILEIN.close();
148 | 	return(error);
149 | }
150 | 
151 | 
152 | int Load_filters(char *filename, int *nFilters, int *filter_length, float **data){
153 | 	float real, imaginary;
154 | 	int file_size, cislo, error, filter_size;
155 | 	error=0;
156 | 
157 | 	ifstream FILEIN;
158 | 	FILEIN.open(filename,ios::in);
159 | 	if (!FILEIN.fail()){
160 | 		error=0;
161 | 		file_size = File_size_row_signal(FILEIN);
162 | 		(*filter_length) = file_size/(*nFilters);
163 | 		filter_size = (*nFilters)*(*filter_length);
164 | 		printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size);
165 | 
166 | 		if(file_size>0){
167 | 			*data = (float*)malloc( filter_size*sizeof(float));
168 | 			memset( (*data), 0.0, filter_size*sizeof(float));
169 | 			
170 | 			if(*data==NULL){
171 | 				printf("\nAllocation error!\n");
172 | 				error++;
173 | 			}
174 | 		
175 | 			FILEIN.clear();
176 | 			FILEIN.seekg(0,ios::beg);
177 | 
178 | 			for (cislo=0; cislo < filter_size; cislo++) {
179 | 				FILEIN >> real >> imaginary;
180 | 				(*data)[cislo] = real;
181 | 			}
182 | 		}
183 | 		else {
184 | 			printf("\nFile is void of any content!\n");
185 | 			error++;
186 | 		}
187 | 	}
188 | 	else {
189 | 		cout << "File not found -> " << filename << " <-" << endl;
190 | 		error++;
191 | 	}
192 | 	FILEIN.close();
193 | 	return(error);
194 | }
195 | 
196 | 
197 | int GPU_convolution_OLS_customFFT(float *h_input_signal, float *h_output_plane, float *h_filters, int signal_length, int convolution_length, int filter_length, int past_filter_samples, int nFilters, int nRuns, int device, double *execution_time);
198 | 
199 | 
200 | int main(int argc, char* argv[]) {
201 | 	int signal_length;
202 | 	int filter_length;
203 | 	int past_filter_samples;
204 | 	int convolution_length;
205 | 	int nFilters;
206 | 	int nRuns;
207 | 	char input_type='0';
208 | 	char input_filter_file[255];
209 | 	char input_signal_file[255];
210 | 	char output_signal_file[255];
211 | 	
212 | 	char * pEnd;
213 | 	if (argc>2) {
214 | 		if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);}
215 | 		input_type=*argv[1];
216 | 	}
217 | 	if (input_type == 'f' && argc==8) {
218 | 		if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);}
219 | 		sprintf(input_signal_file,"%s",argv[2]);
220 | 		if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);}
221 | 		sprintf(input_filter_file,"%s",argv[3]);
222 | 		if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);}
223 | 		sprintf(output_signal_file,"%s",argv[4]);
224 | 		
225 | 		convolution_length = strtol(argv[5],&pEnd,10);
226 | 		nFilters = strtol(argv[6],&pEnd,10);
227 | 		past_filter_samples = strtol(argv[7],&pEnd,10);
228 | 		nRuns = 1;
229 | 	}
230 | 	else if (input_type == 'r' && argc==8) {
231 | 		signal_length  = strtol(argv[2],&pEnd,10);
232 | 		filter_length = strtol(argv[3],&pEnd,10);
233 | 		past_filter_samples = strtol(argv[4],&pEnd,10);
234 | 		convolution_length = strtol(argv[5],&pEnd,10);
235 | 		nFilters      = strtol(argv[6],&pEnd,10);
236 | 		
237 | 		nRuns = strtol(argv[7],&pEnd,10);
238 | 	}
239 | 	else {
240 | 		printf("Parameters error!\n");
241 | 		printf(" 1) Input type: 'r' or 'f' \n");
242 | 		printf("----------------------------------\n");
243 | 		printf("Parameters if input type is 'f' - file input provided by user\n");
244 | 		printf(" 2) Input signal file\n");
245 | 		printf(" 3) Input filter file\n");
246 | 		printf(" 4) Output signal file\n");
247 | 		printf(" 5) Convolution length in samples\n");
248 | 		printf(" 6) number of filters\n");
249 | 		printf(" 7) number of past samples in the filter.\n");
250 | 		printf("    for past filter (causal) it is (filter_length - 1)\n");
251 | 		printf("    for odd centered filter it is floor(filter_length/2)\n");
252 | 		printf("    for future filter it is 0\n");
253 | 		printf(" Example: CONV.exe f signal.dat filter.dat output.dat 2048 32 192\n");
254 | 		printf("----------------------------------\n");
255 | 		printf("Parameters if input type is 'r' - random input generated by the code\n");
256 | 		printf(" 2) Signal length in number of time samples\n");
257 | 		printf(" 3) Filter length in samples\n");
258 | 		printf(" 4) number of past samples in the filter.\n");
259 | 		printf("    for past filter (causal) it is (filter_length - 1)\n");
260 | 		printf("    for odd centered filter it is floor(filter_length/2)\n");
261 | 		printf("    for future filter it is 0\n");
262 | 		printf(" 5) Convolution length in samples\n");
263 | 		printf(" 6) Number of filters\n");
264 | 		printf(" 7) number of GPU kernel runs\n");
265 | 		printf(" Example: CONV.exe r 2097152 193 192 2048 32 10\n");
266 |         return 1;
267 | 	}
268 | 	
269 | 	if (DEBUG) {
270 | 		printf("Parameters:\n");
271 | 		printf("Input signal and filters are ");
272 | 		if (input_type == 'r') {
273 | 			printf("randomly generated.\n");
274 | 			printf("Signal length:      %d samples\n", signal_length);
275 | 			printf("Filter length:      %d samples\n", filter_length);
276 | 			printf("# of past samples:  %d samples\n", past_filter_samples);
277 | 			printf("Convolution length: %d samples\n", convolution_length);
278 | 			printf("Number of filters:  %d\n", nFilters);
279 | 			printf("nRuns:              %d\n", nRuns);
280 | 		}
281 | 		if (input_type == 'f') {
282 | 			printf("read from file.\n");
283 | 			printf("Input signal:       %s\n", input_signal_file);
284 | 			printf("Input filter:       %s\n", input_filter_file);
285 | 			printf("Output signal:      %s\n", output_signal_file);
286 | 			printf("Convolution length: %d samples\n", convolution_length);
287 | 			printf("nFilters:           %d\n", nFilters);
288 | 			printf("# of past samples:  %d samples\n", past_filter_samples);
289 | 			printf("nRuns:              %d\n", nRuns);
290 | 			printf("-----------------\n");
291 | 		}
292 | 	}
293 | 
294 | 	float *h_input;
295 | 	float *h_output;
296 | 	float *h_filters;		    // filters in time-domain
297 | 	float *h_filters_padded;	// filters in time-domain padded with zeroes
298 | 	
299 | 	if (input_type == 'f') {
300 | 		int error=0;
301 | 		error += Load_signal(input_signal_file, &signal_length, &h_input);
302 | 		error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters);
303 | 		if( error>0 ){exit(1);}
304 | 		else if (VERBOSE) printf("File loaded\n");
305 | 	}
306 | 
307 | 	//----------------> Results
308 | 	double execution_time = 0;
309 | 	Performance_results CONV_cuFFT;
310 | 	CONV_cuFFT.Assign(signal_length, filter_length, nFilters, nRuns, 0, convolution_length, nFilters, "CONV_R2R_kFFT.dat", "one");
311 | 	
312 | 	int corrected_filter_length;
313 | 	if( filter_length%2==0 ) corrected_filter_length = filter_length + 1;
314 | 	else corrected_filter_length = filter_length;
315 | 	int useful_part_size = convolution_length - corrected_filter_length + 1;
316 | 	useful_part_size = 2*(useful_part_size>>1);
317 | 	int nConvolutions    = (signal_length + useful_part_size - 1)/useful_part_size;
318 | 	if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);}
319 | 
320 | 	
321 | 	if (input_type == 'r') {
322 | 		h_input          = (float *)malloc(signal_length*sizeof(float));
323 | 		h_filters	     = (float *)malloc(filter_length*nFilters*sizeof(float));
324 | 		srand(time(NULL));
325 | 		Generate_signal(h_input, signal_length);
326 | 		Generate_random_filter(h_filters, filter_length, nFilters);
327 | 		if (VERBOSE) printf("Signal and filters generated\n");
328 | 	}
329 | 	
330 | 	size_t filter_size_padded = nFilters*convolution_length;
331 | 	h_filters_padded = (float*)malloc(filter_size_padded*sizeof(float));
332 | 	Pad_templates(h_filters, h_filters_padded, filter_length, corrected_filter_length,  convolution_length, nFilters);
333 | 	
334 | 	size_t output_size = nFilters*useful_part_size*nConvolutions;
335 | 	h_output = (float*)malloc(output_size*sizeof(float));
336 | 	
337 | 	if (VERBOSE) printf("Convolution - kFFT\n");
338 | 
339 | 	//----------------> GPU kernel
340 | 	GPU_convolution_OLS_customFFT(h_input, h_output, h_filters_padded, signal_length, convolution_length, corrected_filter_length, past_filter_samples, nFilters, nRuns, DEVICEID, &execution_time);
341 | 	CONV_cuFFT.GPU_time = execution_time;
342 | 	if(VERBOSE) printf("     Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time);
343 | 	if(VERBOSE) {cout << "     All parameters: "; CONV_cuFFT.Print();}
344 | 	if(WRITE) CONV_cuFFT.Save();
345 | 	//----------------> GPU kernel
346 | 	
347 | 	if(CHECK){
348 | 		double total_error, mean_error;
349 | 		printf("Checking results...\n");
350 | 		Full_CONV_check(h_output, h_input, h_filters, signal_length, filter_length, past_filter_samples, useful_part_size, (filter_length>>1), convolution_length, nConvolutions, nFilters, &total_error, &mean_error);
351 | 		//printf("Total error: %e; Mean error: %e\n", total_error, mean_error);
352 | 	}
353 | 	
354 | 	if (input_type == 'f') {
355 | 		Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file);
356 | 	}
357 | 	
358 | 	free(h_input);
359 | 	free(h_output);
360 | 	free(h_filters_padded);
361 | 	free(h_filters);
362 | 
363 | 	cudaDeviceReset();
364 | 
365 | 	if (VERBOSE) printf("Finished!\n");
366 | 
367 | 	return (0);
368 | }
369 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/Makefile:
--------------------------------------------------------------------------------
 1 | ###############################################################
 2 | # CUDA_HOME are supposed to be on default position
 3 | # and set it in your PATH .bashrc
 4 | ###############################################################
 5 | INC := -I${CUDA_HOME}/include
 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft -lcuda
 7 | 
 8 | GCC = g++
 9 | NVCC = ${CUDA_HOME}/bin/nvcc
10 | 
11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo
12 | 
13 | GCC_OPTS =-O3 -Wall -Wextra $(INC)
14 | 
15 | ANALYZE = CONV.exe
16 | 
17 | 
18 | ifdef reglim
19 | NVCCFLAGS += --maxrregcount=$(reglim)
20 | endif
21 | 
22 | all: clean onefilter
23 | 
24 | onefilter: CONV_SM_OLS_R2R.o  CONV-32bit_customFFT.o Makefile
25 | 	$(NVCC) -o CONV.exe CONV_SM_OLS_R2R.o CONV-32bit_customFFT.o $(LIB) $(NVCCFLAGS)
26 | 
27 | CONV-32bit_customFFT.o: timer.h utils_cuda.h
28 | 	$(NVCC) -c CONV-32bit_customFFT.cu $(NVCCFLAGS)
29 | 
30 | CONV_SM_OLS_R2R.o: CONV_SM_OLS_R2R.cpp
31 | 	$(GCC) -c CONV_SM_OLS_R2R.cpp $(GCC_OPTS)
32 | 	
33 | clean:	
34 | 	rm -f *.o *.~ CONV.exe
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/When_cuFFT_wins.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | rm CONV_1f.exe;
 4 | rm CONV_2f.exe;
 5 | rm *.o;
 6 | make reglim=0 > /dev/null 2>&1
 7 | for convlength in 256 512 1024 2048 4096;
 8 | do 
 9 | 	for tempsize in {64..4096..32}
10 | 	do
11 | 		./CONV.exe r 2097152 $tempsize $convlength 32 20
12 | 	done
13 | done
14 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/benchmark_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs both benchmark sweeps and stores each result table under its own name.
# CONV.exe appends its measurements to CONV_R2R_kFFT.dat, so the file is
# removed first and renamed after each sweep.

# -f: do not complain when no previous result file exists.
rm -f CONV_R2R_kFFT.dat;

./benchmark_performance.sh;
mv CONV_R2R_kFFT.dat OLS_SM_R2R_perf.dat;

./When_cuFFT_wins.sh
mv CONV_R2R_kFFT.dat OLS_SM_R2R_whencuFFTwins.dat;


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/benchmark_performance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | rm CONV.exe;
 5 | rm *.o;
 6 | make reglim=$reg > /dev/null 2>&1
 7 | for convlength in 512 1024 2048 4096
 8 | do
 9 | 	for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073;
10 | 	do
11 | 		for templates in 2 4 8 16 32 64 96;
12 | 		do
13 | 			./CONV.exe r 262144 $tempsize $convlength $templates 20
14 | 			./CONV.exe r 524288 $tempsize $convlength $templates 20
15 | 			./CONV.exe r 1048576 $tempsize $convlength $templates 20
16 | 			./CONV.exe r 2097152 $tempsize $convlength $templates 20
17 | 			./CONV.exe r 4194304 $tempsize $convlength $templates 20
18 | 			./CONV.exe r 8388608 $tempsize $convlength $templates 20
19 | 		done
20 | 	done
21 | done
22 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/conv_check_R2R.h:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <math.h>
  4 | #include <time.h>
  5 | #include <stdlib.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime.h>
  8 | #include <cuda_runtime_api.h>
  9 | 
 10 | double max_error = 1.0e-4;
 11 | 
 12 | void CPU_time_domain(float *h_input, float *h_CPU_output_timedomain, float *h_filters, int signal_length, int filter_length, int past_filter_samples, int nFilters){
 13 | 	for(int f=0; f<nFilters; f++){
 14 | 		printf(".");  fflush(stdout);
 15 | 		for(int s=0; s<signal_length; s++){
 16 | 			float ac;
 17 | 			ac = 0;
 18 | 			for(int i=0; i<filter_length; i++){
 19 | 				int filter_pos = filter_length - 1 - i;
 20 | 				float fv, sv;
 21 | 				fv = h_filters[f*filter_length + filter_pos];
 22 | 				int signal_pos = (s + i - past_filter_samples);
 23 | 				if(signal_pos>=0 && signal_pos<signal_length) sv = h_input[signal_pos];
 24 | 				else {sv = 0;}
 25 | 				ac = ac + sv*fv;
 26 | 			}
 27 | 			h_CPU_output_timedomain[f*(signal_length + filter_length - 1) + s] = ac;
 28 | 		}
 29 | 	}
 30 | 	printf("\n");
 31 | }
 32 | 
 33 | float get_error(float A, float B){
 34 | 	float error, div_error=10000, per_error=10000, order=0;
 35 | 	int power;
 36 | 	if(A<0) A = -A;
 37 | 	if(B<0) B = -B;
 38 | 	
 39 | 	if (A>B) {
 40 | 		div_error = A-B;
 41 | 		if(B>10){
 42 | 			power = (int) log10(B);
 43 | 			order = pow(10,power);
 44 | 			div_error = div_error/order;
 45 | 		}
 46 | 	}
 47 | 	else {
 48 | 		div_error = B-A;
 49 | 		if(A>10){
 50 | 			power = (int) log10(A);
 51 | 			order = pow(10,power);
 52 | 			div_error = div_error/order;
 53 | 		}
 54 | 	}
 55 | 	
 56 | 	if(div_error<per_error) error = div_error;
 57 | 	else error = per_error;
 58 | 	return(error);
 59 | }
 60 | 
 61 | int Compare_data(float *CPU_result, float *GPU_result, int dim_x, int dim_y, int signal_length, int useful_part_size, double *total_error, double *mean_error){
 62 | 	double total_error_l = 0, mean_error_l = 0;
 63 | 	size_t nErrors = 0;
 64 | 	int cislo = 0;
 65 | 	float error;
 66 | 	
 67 | 	for(int y=0; y<dim_y; y++){
 68 | 		for(int x=0; x<signal_length; x++){
 69 | 			int pos = y*dim_x + x;
 70 | 			error = get_error(CPU_result[pos], GPU_result[pos]);
 71 | 			total_error_l = total_error_l + error;
 72 | 			if( error > max_error ){
 73 | 				nErrors++;
 74 | 				if(cislo<40){
 75 | 					printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d\n", error, CPU_result[pos], GPU_result[pos], x, y, (int) (x/useful_part_size));
 76 | 					cislo++;
 77 | 				}
 78 | 			}
 79 | 		}
 80 | 	}
 81 | 	mean_error_l = total_error_l/(((double) dim_x)*((double) dim_y));
 82 | 	(*total_error) = total_error_l;
 83 | 	(*mean_error) = mean_error_l;
 84 | 	return(nErrors);
 85 | }
 86 | 
 87 | int Compare_data(float *CPU_result, float *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){
 88 | 	double total_error_l = 0, mean_error_l = 0;
 89 | 	size_t nErrors = 0;
 90 | 	int cislo = 0;
 91 | 	float error;
 92 | 	
 93 | 	for(int y=0; y<dim_y; y++){
 94 | 		for(int x=0; x<nSamples; x++){
 95 | 			int CPU_pos = y*CPU_dim_x + x + CPU_offset;
 96 | 			int GPU_pos = y*GPU_dim_x + x + GPU_offset;
 97 | 			float CPU, GPU;
 98 | 			CPU = CPU_result[CPU_pos]/CPU_scale;
 99 | 			GPU = GPU_result[GPU_pos]/GPU_scale;
100 | 			
101 | 			
102 | 			error = get_error(CPU, GPU);
103 | 			total_error_l = total_error_l + error;
104 | 			if( error > max_error ){
105 | 				nErrors++;
106 | 				if(cislo<40){
107 | 					printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU, GPU, x, y, (int) (x/useful_part_size), x%useful_part_size);
108 | 					cislo++;
109 | 				}
110 | 			}
111 | 		}
112 | 	}
113 | 	mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y));
114 | 	(*total_error) = total_error_l;
115 | 	(*mean_error) = mean_error_l;
116 | 	return(nErrors);
117 | }
118 | 
119 | 
120 | void Full_CONV_check(float *GPU_result, float *h_input_real, float *h_filters, int signal_length, int filter_length, int past_filter_samples, int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, double *cumulative_error, double *mean_error){
121 | 	float GPU_scale, CPU_scale;
122 | 	int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples;
123 | 	
124 | 	
125 | 	//----------------------- CPU time-domain
126 | 	size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters;
127 | 	float *h_CPU_output_timedomain;
128 | 	h_CPU_output_timedomain = (float *)malloc(output_size_timedomain*sizeof(float));
129 | 	memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float));
130 | 	
131 | 	CPU_time_domain(h_input_real, h_CPU_output_timedomain, h_filters, signal_length, filter_length, past_filter_samples, nFilters);
132 | 	
133 | 	printf("\n--> Comparison to CPU time-domain:\n");
134 | 	GPU_scale = conv_length/2;
135 | 	GPU_scale = 1.0;
136 | 	CPU_scale = 1.0;
137 | 	GPU_offset = 0;
138 | 	CPU_offset = 0;
139 | 	GPU_dim_x = nConvolutions*useful_part_size;
140 | 	CPU_dim_x = (signal_length + filter_length - 1);
141 | 	nSamples = signal_length - offset;	
142 | 	Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
143 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
144 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
145 | 	else printf("FAILED\n");
146 | 	
147 | 	
148 | 	free(h_CPU_output_timedomain);
149 | 	//-------------------------------------------------<
150 | }
151 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/debug.h:
--------------------------------------------------------------------------------
1 | #define VERBOSE true
2 | #define DEBUG false
3 | #define CHECK true
4 | #define WRITE true
5 | 
6 | #define DEVICEID 0
7 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/params.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KAdamek/GPU_Overlap-and-save_convolution/f2ee0e3323a3e3f6f1bb96e864ad7fce5c9234ec/GPU_OLS_R2R_sharedmemory/params.h


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/results.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <iomanip>
 4 | #include <vector>
 5 | 
 6 | using namespace std;
 7 | 
 8 | class Performance_results{
 9 | public:
10 | 	double GPU_time;
11 | 	int nTimesamples;
12 | 	int template_length;
13 | 	int nTemplates;
14 | 	int nRuns;
15 | 	int reglim;
16 | 	int OaS_conv_size;
17 | 	int templates_per_block;
18 | 	char filename[200];
19 | 	char kernel[10];
20 | 	
21 | 	Performance_results() {
22 | 		GPU_time=0;
23 | 	}
24 | 	
25 | 	void Save(){
26 | 		ofstream FILEOUT;
27 | 		FILEOUT.open (filename, std::ofstream::out | std::ofstream::app);
28 | 		FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
29 | 		FILEOUT.close();
30 | 	}
31 | 	
32 | 	void Print(){
33 | 		cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
34 | 	}
35 | 	
36 | 	void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){
37 | 		nTimesamples        = t_nTimesamples;
38 | 		template_length     = t_template_length;
39 | 		nTemplates          = t_nTemplates;
40 | 		nRuns               = t_nRuns;
41 | 		reglim              = t_reglim;
42 | 		OaS_conv_size       = t_OaS_conv_size;
43 | 		templates_per_block = t_templates_per_block;
44 | 		sprintf(filename,"%s", t_filename);
45 | 		sprintf(kernel,"%s",t_kernel);
46 | 	}
47 | 	
48 | };
49 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory/utils_cuda.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/CONV_SM_OLS_R2R.cpp:
--------------------------------------------------------------------------------
  1 | //********************************************************************************************
  2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 
  3 | //* Copyright (C) 2019  Adámek Karel
  4 | //* 
  5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 
  6 | //********************************************************************************************
  7 | 
  8 | #include "debug.h"
  9 | #include "params.h"
 10 | #include "results.h"
 11 | 
 12 | #include <stdio.h>
 13 | #include <string.h>
 14 | #include <math.h>
 15 | #include <time.h>
 16 | #include <stdlib.h>
 17 | #include <cuda.h>
 18 | #include <cuda_runtime.h>
 19 | #include <cuda_runtime_api.h>
 20 | 
 21 | #include "conv_check_R2R.h"
 22 | 
 23 | void Generate_signal(float *h_input, int signal_length){
 24 | 	for(int f=0; f<signal_length; f++){
 25 | 		h_input[f] = rand() / (float)RAND_MAX;
 26 | 	}
 27 | }
 28 | 
 29 | void Generate_random_filter(float *h_filters, int nSamples, int nFilters){
 30 | 	for(int t=0; t<nFilters; t++){
 31 | 		for(int f=0; f<nSamples; f++){
 32 | 			h_filters[t*nSamples + f] = rand() / (float)RAND_MAX;
 33 | 		}
 34 | 	}
 35 | }
 36 | 
 37 | void Pad_templates(float *h_filters_time, float *h_filters_padded, int filter_length, int corrected_filter_length, int convolution_size, int nFilters){
 38 | 	float *tmp_filter;
 39 | 	tmp_filter = new float[corrected_filter_length];
 40 | 	for(int f=0; f<nFilters*convolution_size; f++){
 41 | 		h_filters_padded[f] = 0;
 42 | 	}
 43 | 	
 44 | 	for(int t=0; t<nFilters; t++){
 45 | 		// copy to temporary filter
 46 | 		if(filter_length!=corrected_filter_length) {
 47 | 			tmp_filter[0] = 0;
 48 | 			for(int f=0; f<filter_length; f++){
 49 | 				tmp_filter[f + 1] = h_filters_time[t*filter_length + f];
 50 | 			}
 51 | 		}
 52 | 		else {
 53 | 			for(int f=0; f<filter_length; f++){
 54 | 				tmp_filter[f] = h_filters_time[t*filter_length + f];
 55 | 			}
 56 | 		}
 57 | 		
 58 | 		for(int f=0; f<corrected_filter_length; f++){
 59 | 			// padding for centered filter
 60 | 			if(f>=corrected_filter_length/2) {
 61 | 				h_filters_padded[t*convolution_size + f - corrected_filter_length/2] = tmp_filter[f];
 62 | 			}
 63 | 			else if(f<corrected_filter_length/2) {
 64 | 				h_filters_padded[t*convolution_size + f + convolution_size - corrected_filter_length/2] = tmp_filter[f];
 65 | 			}
 66 | 		}
 67 | 	}
 68 | }
 69 | 
 70 | 
 71 | int Write_output(float *h_output, int signal_length, int nFilters, char *output_signal_file){
 72 | 	int error=0;
 73 | 	ofstream FILEOUT;
 74 | 	FILEOUT.open(output_signal_file);
 75 | 	if (!FILEOUT.fail()){
 76 | 		if (VERBOSE) printf("Writing output\n");
 77 | 		for(int f=0; f<nFilters; f++){
 78 | 			if (VERBOSE) printf("[");
 79 | 			for(int Ts=0;Ts<signal_length;Ts++){
 80 | 				if(Ts%100000==0) {
 81 | 					if (VERBOSE) {
 82 | 						printf(".");
 83 | 						fflush(stdout);
 84 | 					}
 85 | 				}
 86 | 				FILEOUT << f << " " << Ts << " " << h_output[f*signal_length+Ts] << endl;
 87 | 			}
 88 | 			FILEOUT << endl;
 89 | 			if (VERBOSE) printf("] filter=%d\n",f);
 90 | 		}
 91 | 	}
 92 | 	else {
 93 | 		cout << "Write to a file failed!" << endl;
 94 | 		error++;
 95 | 	}
 96 | 	FILEOUT.close();
 97 | 	return(error);
 98 | }
 99 | 
100 | 
// Counts the number of text lines available in an already opened input stream.
// The stream is rewound to its beginning before counting. Counting reads until
// getline fails, so the caller has to clear() the stream state and seek again
// before re-reading the data.
long int File_size_row_signal(ifstream &FILEIN){
	FILEIN.seekg(0,ios::beg);
	std::size_t nRows = 0;
	std::string row;
	while( std::getline(FILEIN, row) ){
		nRows++;
	}
	return((long int)nRows);
}
107 | 
108 | 
109 | int Load_signal(char *filename, int *nSamples, float **data){
110 | 	float real, imaginary;
111 | 	int file_size, cislo, error;
112 | 	error=0;
113 | 
114 | 	ifstream FILEIN;
115 | 	FILEIN.open(filename,ios::in);
116 | 	if (!FILEIN.fail()){
117 | 		error=0;
118 | 		file_size=File_size_row_signal(FILEIN);
119 | 		(*nSamples) = file_size;
120 | 		printf("nSamples:%d;\n", (*nSamples) );
121 | 
122 | 		if(file_size>0){
123 | 			*data = (float*)malloc(file_size*sizeof(float));
124 | 			memset( (*data), 0.0, file_size*sizeof(float));
125 | 			if(*data==NULL){
126 | 				printf("\nAllocation error!\n");
127 | 				error++;
128 | 			}
129 | 		
130 | 			FILEIN.clear();
131 | 			FILEIN.seekg(0,ios::beg);
132 | 			
133 | 			for (cislo = 0; cislo < file_size; cislo++) {
134 | 				FILEIN >> real >> imaginary;
135 | 				(*data)[cislo] = sqrt(real*real + imaginary*imaginary);
136 | 			}
137 | 		}
138 | 		else {
139 | 			printf("\nFile is void of any content!\n");
140 | 			error++;
141 | 		}
142 | 	}
143 | 	else {
144 | 		cout << "File not found -> " << filename << " <-" << endl;
145 | 		error++;
146 | 	}
147 | 	FILEIN.close();
148 | 	return(error);
149 | }
150 | 
151 | 
152 | int Load_filters(char *filename, int *nFilters, int *filter_length, float **data){
153 | 	float real, imaginary;
154 | 	int file_size, cislo, error, filter_size;
155 | 	error=0;
156 | 
157 | 	ifstream FILEIN;
158 | 	FILEIN.open(filename,ios::in);
159 | 	if (!FILEIN.fail()){
160 | 		error=0;
161 | 		file_size = File_size_row_signal(FILEIN);
162 | 		(*filter_length) = file_size/(*nFilters);
163 | 		filter_size = (*nFilters)*(*filter_length);
164 | 		printf("filter_length:%d; file_size:%d; filter_size:%d;\n", (*filter_length), file_size, filter_size);
165 | 
166 | 		if(file_size>0){
167 | 			*data = (float*)malloc( filter_size*sizeof(float));
168 | 			memset( (*data), 0.0, filter_size*sizeof(float));
169 | 			
170 | 			if(*data==NULL){
171 | 				printf("\nAllocation error!\n");
172 | 				error++;
173 | 			}
174 | 		
175 | 			FILEIN.clear();
176 | 			FILEIN.seekg(0,ios::beg);
177 | 
178 | 			for (cislo=0; cislo < filter_size; cislo++) {
179 | 				FILEIN >> real >> imaginary;
180 | 				(*data)[cislo] = real;
181 | 			}
182 | 		}
183 | 		else {
184 | 			printf("\nFile is void of any content!\n");
185 | 			error++;
186 | 		}
187 | 	}
188 | 	else {
189 | 		cout << "File not found -> " << filename << " <-" << endl;
190 | 		error++;
191 | 	}
192 | 	FILEIN.close();
193 | 	return(error);
194 | }
195 | 
196 | 
197 | int GPU_convolution_OLS_customFFT(float *h_input_signal, float *h_output_plane, float *h_filters, int signal_length, int convolution_length, int filter_length, int past_filter_samples, int nFilters, int nRuns, float h, int offset_modifier, int device, double *execution_time);
198 | 
199 | 
200 | int main(int argc, char* argv[]) {
201 | 	int signal_length;
202 | 	int filter_length;
203 | 	int past_filter_samples;
204 | 	int convolution_length;
205 | 	int nFilters;
206 | 	int nRuns;
207 | 	char input_type='0';
208 | 	char input_filter_file[255];
209 | 	char input_signal_file[255];
210 | 	char output_signal_file[255];
211 | 	
212 | 	char * pEnd;
213 | 	if (argc>2) {
214 | 		if (strlen(argv[1])!=1) {printf("Specify input: \n'r' - random input generated by the code\n 'f' - file input provided by user\n"); exit(2);}
215 | 		input_type=*argv[1];
216 | 	}
217 | 	if (input_type == 'f' && argc==8) {
218 | 		if (strlen(argv[2])>255) {printf("Filename of input signal file is too long\n"); exit(2);}
219 | 		sprintf(input_signal_file,"%s",argv[2]);
220 | 		if (strlen(argv[3])>255) {printf("Filename of input filter file is too long\n"); exit(2);}
221 | 		sprintf(input_filter_file,"%s",argv[3]);
222 | 		if (strlen(argv[4])>255) {printf("Filename of output signal file is too long\n"); exit(2);}
223 | 		sprintf(output_signal_file,"%s",argv[4]);
224 | 		
225 | 		convolution_length = strtol(argv[5],&pEnd,10);
226 | 		nFilters = strtol(argv[6],&pEnd,10);
227 | 		past_filter_samples = strtol(argv[7],&pEnd,10);
228 | 		nRuns = 1;
229 | 	}
230 | 	else if (input_type == 'r' && argc==8) {
231 | 		signal_length  = strtol(argv[2],&pEnd,10);
232 | 		filter_length = strtol(argv[3],&pEnd,10);
233 | 		past_filter_samples = strtol(argv[4],&pEnd,10);
234 | 		convolution_length = strtol(argv[5],&pEnd,10);
235 | 		nFilters      = strtol(argv[6],&pEnd,10);
236 | 		
237 | 		nRuns = strtol(argv[7],&pEnd,10);
238 | 	}
239 | 	else {
240 | 		printf("Parameters error!\n");
241 | 		printf(" 1) Input type: 'r' or 'f' \n");
242 | 		printf("----------------------------------\n");
243 | 		printf("Parameters if input type is 'f' - file input provided by user\n");
244 | 		printf(" 2) Input signal file\n");
245 | 		printf(" 3) Input filter file\n");
246 | 		printf(" 4) Output signal file\n");
247 | 		printf(" 5) Convolution length in samples\n");
248 | 		printf(" 6) number of filters\n");
249 | 		printf(" 7) number of past samples in the filter.\n");
250 | 		printf("    for past filter (causal) it is (filter_length - 1)\n");
251 | 		printf("    for odd centered filter it is floor(filter_length/2)\n");
252 | 		printf("    for future filter it is 0\n");
253 | 		printf(" Example: CONV.exe f signal.dat filter.dat output.dat 2048 32 192\n");
254 | 		printf("----------------------------------\n");
255 | 		printf("Parameters if input type is 'r' - random input generated by the code\n");
256 | 		printf(" 2) Signal length in number of time samples\n");
257 | 		printf(" 3) Filter length in samples\n");
258 | 		printf(" 4) number of past samples in the filter.\n");
259 | 		printf("    for past filter (causal) it is (filter_length - 1)\n");
260 | 		printf("    for odd centered filter it is floor(filter_length/2)\n");
261 | 		printf("    for future filter it is 0\n");
262 | 		printf(" 5) Convolution length in samples\n");
263 | 		printf(" 6) Number of filters\n");
264 | 		printf(" 7) number of GPU kernel runs\n");
265 | 		printf(" Example: CONV.exe r 2097152 193 192 2048 32 10\n");
266 |         return 1;
267 | 	}
268 | 	
269 | 	if (DEBUG) {
270 | 		printf("Parameters:\n");
271 | 		printf("Input signal and filters are ");
272 | 		if (input_type == 'r') {
273 | 			printf("randomly generated.\n");
274 | 			printf("Signal length:      %d samples\n", signal_length);
275 | 			printf("Filter length:      %d samples\n", filter_length);
276 | 			printf("# of past samples:  %d samples\n", past_filter_samples);
277 | 			printf("Convolution length: %d samples\n", convolution_length);
278 | 			printf("Number of filters:  %d\n", nFilters);
279 | 			printf("nRuns:              %d\n", nRuns);
280 | 		}
281 | 		if (input_type == 'f') {
282 | 			printf("read from file.\n");
283 | 			printf("Input signal:       %s\n", input_signal_file);
284 | 			printf("Input filter:       %s\n", input_filter_file);
285 | 			printf("Output signal:      %s\n", output_signal_file);
286 | 			printf("Convolution length: %d samples\n", convolution_length);
287 | 			printf("nFilters:           %d\n", nFilters);
288 | 			printf("# of past samples:  %d samples\n", past_filter_samples);
289 | 			printf("nRuns:              %d\n", nRuns);
290 | 			printf("-----------------\n");
291 | 		}
292 | 	}
293 | 	
294 | 	#ifdef POST_PROCESS
295 | 	float h=20.0;
296 | 	int offset_modifier = 4;
297 | 	#else
298 | 	float h=1.0;
299 | 	int offset_modifier = 0;
300 | 	#endif
301 | 
302 | 	float *h_input;
303 | 	float *h_output;
304 | 	float *h_filters;		    // filters in time-domain
305 | 	float *h_filters_padded;	// filters in time-domain padded with zeroes
306 | 	
307 | 	if (input_type == 'f') {
308 | 		int error=0;
309 | 		error += Load_signal(input_signal_file, &signal_length, &h_input);
310 | 		error += Load_filters(input_filter_file, &nFilters, &filter_length, &h_filters);
311 | 		if( error>0 ){exit(1);}
312 | 		else if (VERBOSE) printf("File loaded\n");
313 | 	}
314 | 
315 | 	//----------------> Results
316 | 	double execution_time = 0;
317 | 	Performance_results CONV_cuFFT;
318 | 	CONV_cuFFT.Assign(signal_length, filter_length, nFilters, nRuns, 0, convolution_length, nFilters, "CONV_R2R_kFFT.dat", "one");
319 | 	
320 | 	int corrected_filter_length;
321 | 	if( filter_length%2==0 ) corrected_filter_length = filter_length + 1;
322 | 	else corrected_filter_length = filter_length;
323 | 	int useful_part_size = convolution_length - (corrected_filter_length + offset_modifier) + 1;
324 | 	useful_part_size = 2*(useful_part_size>>1);
325 | 	int nConvolutions    = (signal_length + useful_part_size - 1)/useful_part_size;
326 | 	if( useful_part_size<=1) {printf("Filter length is too long. Increase FFT length.\n");exit(1);}
327 | 
328 | 	
329 | 	if (input_type == 'r') {
330 | 		h_input          = (float *)malloc(signal_length*sizeof(float));
331 | 		h_filters	     = (float *)malloc(filter_length*nFilters*sizeof(float));
332 | 		srand(time(NULL));
333 | 		Generate_signal(h_input, signal_length);
334 | 		Generate_random_filter(h_filters, filter_length, nFilters);
335 | 		if (VERBOSE) printf("Signal and filters generated\n");
336 | 	}
337 | 	
338 | 	size_t filter_size_padded = nFilters*convolution_length;
339 | 	h_filters_padded = (float*)malloc(filter_size_padded*sizeof(float));
340 | 	Pad_templates(h_filters, h_filters_padded, filter_length, corrected_filter_length, convolution_length, nFilters);
341 | 	
342 | 	size_t output_size = nFilters*useful_part_size*nConvolutions;
343 | 	h_output = (float*)malloc(output_size*sizeof(float));
344 | 	
345 | 	if (VERBOSE) printf("Convolution - kFFT\n");
346 | 
347 | 	//----------------> GPU kernel
348 | 	GPU_convolution_OLS_customFFT(h_input, h_output, h_filters_padded, signal_length, convolution_length, corrected_filter_length, past_filter_samples, nFilters, nRuns, h, offset_modifier, DEVICEID, &execution_time);
349 | 	CONV_cuFFT.GPU_time = execution_time;
350 | 	if(VERBOSE) printf("     Execution time:\033[32m%0.3f\033[0mms\n", CONV_cuFFT.GPU_time);
351 | 	if(VERBOSE) {cout << "     All parameters: "; CONV_cuFFT.Print();}
352 | 	if(WRITE) CONV_cuFFT.Save();
353 | 	//----------------> GPU kernel
354 | 	
355 | 	if(CHECK){
356 | 		double total_error, mean_error;
357 | 		printf("Checking results...\n");
358 | 		Full_CONV_check(h_output, h_input, h_filters, signal_length, filter_length, past_filter_samples, useful_part_size, (filter_length>>1), convolution_length, nConvolutions, nFilters, h, &total_error, &mean_error);
359 | 		//printf("Total error: %e; Mean error: %e\n", total_error, mean_error);
360 | 	}
361 | 	
362 | 	if (input_type == 'f') {
363 | 		Write_output(h_output, useful_part_size*nConvolutions, nFilters, output_signal_file);
364 | 	}
365 | 	
366 | 	free(h_input);
367 | 	free(h_output);
368 | 	free(h_filters_padded);
369 | 	free(h_filters);
370 | 
371 | 	cudaDeviceReset();
372 | 
373 | 	if (VERBOSE) printf("Finished!\n");
374 | 
375 | 	return (0);
376 | }
377 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/Makefile:
--------------------------------------------------------------------------------
 1 | ###############################################################
 2 | # CUDA_HOME are supposed to be on default position
 3 | # and set it in your PATH .bashrc
 4 | ###############################################################
 5 | INC := -I${CUDA_HOME}/include
 6 | LIB := -L${CUDA_HOME}/lib64 -lcudart -lcufft -lcuda
 7 | 
 8 | GCC = g++
 9 | NVCC = ${CUDA_HOME}/bin/nvcc
10 | 
11 | NVCCFLAGS = -O3 -arch=sm_70 --ptxas-options=-v --use_fast_math -Xcompiler -Wextra -lineinfo
12 | 
13 | GCC_OPTS =-O3 -Wall -Wextra $(INC)
14 | 
15 | ANALYZE = CONV.exe
16 | 
17 | 
18 | ifdef reglim
19 | NVCCFLAGS += --maxrregcount=$(reglim)
20 | endif
21 | 
22 | all: clean onefilter
23 | 
24 | onefilter: CONV_SM_OLS_R2R.o  CONV-32bit_customFFT.o Makefile
25 | 	$(NVCC) -o CONV.exe CONV_SM_OLS_R2R.o CONV-32bit_customFFT.o $(LIB) $(NVCCFLAGS)
26 | 
27 | CONV-32bit_customFFT.o: timer.h utils_cuda.h
28 | 	$(NVCC) -c CONV-32bit_customFFT.cu $(NVCCFLAGS)
29 | 
30 | CONV_SM_OLS_R2R.o: CONV_SM_OLS_R2R.cpp
31 | 	$(GCC) -c CONV_SM_OLS_R2R.cpp $(GCC_OPTS)
32 | 	
33 | clean:	
34 | 	rm -f *.o *.~ CONV.exe
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/When_cuFFT_wins.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | rm CONV.exe;
 4 | rm *.o;
 5 | make reglim=0 > /dev/null 2>&1
 6 | for convlength in 256 512 1024 2048 4096;
 7 | do 
 8 | 	for tempsize in {64..4096..32}
 9 | 	do
10 | 		./CONV.exe r 2097152 $tempsize $convlength 32 20
11 | 	done
12 | done
13 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/benchmark_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Runs both benchmark sweeps. CONV.exe appends its results to
# CONV_R2R_kFFT.dat, so the file is removed before the first sweep and
# renamed after each sweep to keep the two result sets apart.

# -f: the results file does not exist on a fresh checkout
rm -f CONV_R2R_kFFT.dat

./benchmark_performance.sh
mv CONV_R2R_kFFT.dat OLS_SM_R2R_pp_perf.dat

./When_cuFFT_wins.sh
mv CONV_R2R_kFFT.dat OLS_SM_R2R_pp_whencuFFTwins.dat


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/benchmark_performance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | rm CONV.exe;
 5 | rm *.o;
 6 | make reglim=$reg > /dev/null 2>&1
 7 | for convlength in 512 1024 2048 4096
 8 | do
 9 | 	for tempsize in 65 97 129 193 257 385 513 769 1025 2049 3073;
10 | 	do
11 | 		for templates in 2 4 8 16 32 64 96;
12 | 		do
13 | 			./CONV.exe r 262144 $tempsize $convlength $templates 20
14 | 			./CONV.exe r 524288 $tempsize $convlength $templates 20
15 | 			./CONV.exe r 1048576 $tempsize $convlength $templates 20
16 | 			./CONV.exe r 2097152 $tempsize $convlength $templates 20
17 | 			./CONV.exe r 4194304 $tempsize $convlength $templates 20
18 | 			./CONV.exe r 8388608 $tempsize $convlength $templates 20
19 | 		done
20 | 	done
21 | done
22 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/conv_check_R2R.h:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <math.h>
  4 | #include <time.h>
  5 | #include <stdlib.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime.h>
  8 | #include <cuda_runtime_api.h>
  9 | 
 10 | double max_error = 1.0e-4;
 11 | 
 12 | 
 13 | void CPU_time_domain(float *h_input, float *h_CPU_output_timedomain, float *h_filters, int signal_length, int filter_length, int past_filter_samples, int nFilters){
 14 | 	for(int f=0; f<nFilters; f++){
 15 | 		printf(".");  fflush(stdout);
 16 | 		for(int s=0; s<signal_length; s++){
 17 | 			float ac;
 18 | 			ac = 0;
 19 | 			for(int i=0; i<filter_length; i++){
 20 | 				int filter_pos = filter_length - 1 - i;
 21 | 				float fv, sv;
 22 | 				fv = h_filters[f*filter_length + filter_pos];
 23 | 				int signal_pos = (s + i - past_filter_samples);
 24 | 				if(signal_pos>=0 && signal_pos<signal_length) sv = h_input[signal_pos];
 25 | 				else {sv = 0;}
 26 | 				ac = ac + sv*fv;
 27 | 			}
 28 | 			h_CPU_output_timedomain[f*(signal_length + filter_length - 1) + s] = ac;
 29 | 		}
 30 | 	}
 31 | 	printf("\n");
 32 | }
 33 | 
 34 | void CPU_postprocess(float *h_CPU_postprocessed, float *h_CPU_output_reduced, int nTimesamples, int nFilters, float h){
 35 | 	float left, right, result;
 36 | 	
 37 | 	for(int f=0; f<nFilters; f++){
 38 | 		for(int s=0; s<nTimesamples-1; s++){
 39 | 			int pos = f*nTimesamples + s;
 40 | 			if( s==0 ) {
 41 | 				left = h_CPU_output_reduced[pos];
 42 | 			}
 43 | 			else {
 44 | 				left = h_CPU_output_reduced[pos-1];
 45 | 			}
 46 | 			
 47 | 			if( s>=(nTimesamples-1) ) {
 48 | 				right = h_CPU_output_reduced[f*nTimesamples + nTimesamples - 1];
 49 | 			}
 50 | 			else {
 51 | 				right = h_CPU_output_reduced[pos+1];
 52 | 			}
 53 | 			
 54 | 			result = (left - right)/(2.0*h);
 55 | 			h_CPU_postprocessed[pos] = result;
 56 | 		}
 57 | 	}
 58 | }
 59 | 
 60 | float get_error(float A, float B){
 61 | 	float error, div_error=10000, per_error=10000, order=0;
 62 | 	int power;
 63 | 	if(A<0) A = -A;
 64 | 	if(B<0) B = -B;
 65 | 	
 66 | 	if (A>B) {
 67 | 		div_error = A-B;
 68 | 		if(B>10){
 69 | 			power = (int) log10(B);
 70 | 			order = pow(10,power);
 71 | 			div_error = div_error/order;
 72 | 		}
 73 | 	}
 74 | 	else {
 75 | 		div_error = B-A;
 76 | 		if(A>10){
 77 | 			power = (int) log10(A);
 78 | 			order = pow(10,power);
 79 | 			div_error = div_error/order;
 80 | 		}
 81 | 	}
 82 | 	
 83 | 	if(div_error<per_error) error = div_error;
 84 | 	else error = per_error;
 85 | 	return(error);
 86 | }
 87 | 
 88 | int Compare_data(float *CPU_result, float *GPU_result, int dim_x, int dim_y, int signal_length, int useful_part_size, double *total_error, double *mean_error){
 89 | 	double total_error_l = 0, mean_error_l = 0;
 90 | 	size_t nErrors = 0;
 91 | 	int cislo = 0;
 92 | 	float error;
 93 | 	
 94 | 	for(int y=0; y<dim_y; y++){
 95 | 		for(int x=0; x<signal_length; x++){
 96 | 			int pos = y*dim_x + x;
 97 | 			error = get_error(CPU_result[pos], GPU_result[pos]);
 98 | 			total_error_l = total_error_l + error;
 99 | 			if( error > max_error ){
100 | 				nErrors++;
101 | 				if(cislo<40){
102 | 					printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d\n", error, CPU_result[pos], GPU_result[pos], x, y, (int) (x/useful_part_size));
103 | 					cislo++;
104 | 				}
105 | 			}
106 | 		}
107 | 	}
108 | 	mean_error_l = total_error_l/(((double) dim_x)*((double) dim_y));
109 | 	(*total_error) = total_error_l;
110 | 	(*mean_error) = mean_error_l;
111 | 	return(nErrors);
112 | }
113 | 
114 | int Compare_data(float *CPU_result, float *GPU_result, float CPU_scale, float GPU_scale, int CPU_offset, int GPU_offset, int CPU_dim_x, int GPU_dim_x, int dim_y, int nSamples, int useful_part_size, double *total_error, double *mean_error){
115 | 	double total_error_l = 0, mean_error_l = 0;
116 | 	size_t nErrors = 0;
117 | 	int cislo = 0;
118 | 	float error;
119 | 	
120 | 	for(int y=0; y<dim_y; y++){
121 | 		for(int x=0; x<nSamples; x++){
122 | 			int CPU_pos = y*CPU_dim_x + x + CPU_offset;
123 | 			int GPU_pos = y*GPU_dim_x + x + GPU_offset;
124 | 			float CPU, GPU;
125 | 			CPU = CPU_result[CPU_pos]/CPU_scale;
126 | 			GPU = GPU_result[GPU_pos]/GPU_scale;
127 | 			
128 | 			
129 | 			error = get_error(CPU, GPU);
130 | 			total_error_l = total_error_l + error;
131 | 			if( error > max_error ){
132 | 				nErrors++;
133 | 				if(cislo<40){
134 | 					printf("Error [%f] CPU [%f] GPU [%f] x=%d; y=%d segment=%d; s.x=%d\n", error, CPU, GPU, x, y, (int) (x/useful_part_size), x%useful_part_size);
135 | 					cislo++;
136 | 				}
137 | 			}
138 | 		}
139 | 	}
140 | 	mean_error_l = total_error_l/(((double) nSamples)*((double) dim_y));
141 | 	(*total_error) = total_error_l;
142 | 	(*mean_error) = mean_error_l;
143 | 	return(nErrors);
144 | }
145 | 
146 | 
147 | void Full_CONV_check(float *GPU_result, float *h_input_real, float *h_filters, int signal_length, int filter_length, int past_filter_samples,  int useful_part_size, int offset, int conv_length, int nConvolutions, int nFilters, float h, double *cumulative_error, double *mean_error){
148 | 	float GPU_scale, CPU_scale;
149 | 	int CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nSamples;
150 | 	
151 | 	
152 | 	//----------------------- CPU time-domain
153 | 	size_t output_size_timedomain = (signal_length + filter_length - 1)*nFilters;
154 | 	float *h_CPU_output_timedomain;
155 | 	float *h_CPU_postprocessed;
156 | 	h_CPU_output_timedomain = (float *)malloc(output_size_timedomain*sizeof(float));
157 | 	h_CPU_postprocessed     = (float *)malloc(output_size_timedomain*sizeof(float));
158 | 	memset(h_CPU_output_timedomain, 0.0, output_size_timedomain*sizeof(float));
159 | 	memset(h_CPU_postprocessed, 0.0, output_size_timedomain*sizeof(float));
160 | 	
161 | 	printf("\n--> Time-domain convolution:");
162 | 	CPU_time_domain(h_input_real, h_CPU_output_timedomain, h_filters, signal_length, filter_length, past_filter_samples, nFilters);
163 | 	
164 | 	printf("\n--> Post-processing:\n");
165 | 	CPU_postprocess(h_CPU_postprocessed, h_CPU_output_timedomain, (signal_length + filter_length - 1), nFilters, h);
166 | 
167 | 	#ifdef POST_PROCESS
168 | 	
169 | 	printf("\n--> Comparison to CPU time-domain with post-processing:\n");
170 | 	GPU_scale = conv_length/2;
171 | 	GPU_scale = 1.0;
172 | 	CPU_scale = 1.0;
173 | 	GPU_offset = 0;
174 | 	CPU_offset = 0;
175 | 	GPU_dim_x = nConvolutions*useful_part_size;
176 | 	CPU_dim_x = (signal_length + filter_length - 1);
177 | 	nSamples = signal_length - offset;	
178 | 	Compare_data(h_CPU_postprocessed, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
179 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
180 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
181 | 	else printf("FAILED\n");
182 | 	
183 | 	#else
184 | 		
185 | 	printf("\n--> Comparison to CPU time-domain:\n");
186 | 	GPU_scale = conv_length/2;
187 | 	GPU_scale = 1.0;
188 | 	CPU_scale = 1.0;
189 | 	GPU_offset = 0;
190 | 	CPU_offset = 0;
191 | 	GPU_dim_x = nConvolutions*useful_part_size;
192 | 	CPU_dim_x = (signal_length + filter_length - 1);
193 | 	nSamples = signal_length;	
194 | 	Compare_data(h_CPU_output_timedomain, GPU_result, CPU_scale, GPU_scale, CPU_offset, GPU_offset, CPU_dim_x, GPU_dim_x, nFilters, nSamples, useful_part_size, cumulative_error, mean_error);
195 | 	//printf("----> Total error: %e; Mean error: %e\n", (double) *cumulative_error, (double) *mean_error);
196 | 	if((*mean_error)<1.0e-4) printf("PASSED\n");
197 | 	else printf("FAILED\n");
198 | 	
199 | 	#endif
200 | 	
201 | 	free(h_CPU_output_timedomain);
202 | 	free(h_CPU_postprocessed);
203 | 	//-------------------------------------------------<
204 | 	
205 | 	
206 | 	
207 | 	
208 | 	
209 | }
210 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/debug.h:
--------------------------------------------------------------------------------
 1 | #define VERBOSE true
 2 | #define DEBUG false
 3 | #define CHECK true
 4 | #define WRITE true
 5 | 
 6 | #define DEVICEID 0
 7 | #define POST_PROCESS
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/params.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KAdamek/GPU_Overlap-and-save_convolution/f2ee0e3323a3e3f6f1bb96e864ad7fce5c9234ec/GPU_OLS_R2R_sharedmemory_pp/params.h


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/results.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <iomanip>
 4 | #include <vector>
 5 | 
 6 | using namespace std;
 7 | 
 8 | class Performance_results{
 9 | public:
10 | 	double GPU_time;
11 | 	int nTimesamples;
12 | 	int template_length;
13 | 	int nTemplates;
14 | 	int nRuns;
15 | 	int reglim;
16 | 	int OaS_conv_size;
17 | 	int templates_per_block;
18 | 	char filename[200];
19 | 	char kernel[10];
20 | 	
21 | 	Performance_results() {
22 | 		GPU_time=0;
23 | 	}
24 | 	
25 | 	void Save(){
26 | 		ofstream FILEOUT;
27 | 		FILEOUT.open (filename, std::ofstream::out | std::ofstream::app);
28 | 		FILEOUT << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
29 | 		FILEOUT.close();
30 | 	}
31 | 	
32 | 	void Print(){
33 | 		cout << std::fixed << std::setprecision(8) << nTimesamples << " " << template_length << " " << nTemplates << " " << GPU_time << " " << nRuns << " " << reglim << " " << OaS_conv_size << " " << templates_per_block << " " << kernel << endl;
34 | 	}
35 | 	
36 | 	void Assign(int t_nTimesamples, int t_template_length, int t_nTemplates, int t_nRuns, int t_reglim, int t_OaS_conv_size, int t_templates_per_block, char const *t_filename, char const *t_kernel){
37 | 		nTimesamples        = t_nTimesamples;
38 | 		template_length     = t_template_length;
39 | 		nTemplates          = t_nTemplates;
40 | 		nRuns               = t_nRuns;
41 | 		reglim              = t_reglim;
42 | 		OaS_conv_size       = t_OaS_conv_size;
43 | 		templates_per_block = t_templates_per_block;
44 | 		sprintf(filename,"%s", t_filename);
45 | 		sprintf(kernel,"%s",t_kernel);
46 | 	}
47 | 	
48 | };
49 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/run_convolution.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH --nodes=1
 4 | #SBATCH --ntasks-per-node=1
 5 | #SBATCH --job-name=AAFFT_R2R_conv
 6 | #SBATCH --partition=htc
 7 | #SBATCH --gres=gpu:1 --constraint='gpu_sku:P100'
 8 | 
 9 | module load gpu/cuda/10.0.130
10 | 
11 | ./benchmark_all.sh
12 | 
13 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/GPU_OLS_R2R_sharedmemory_pp/utils_cuda.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Karel Adámek
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/OLS_generate_files/Example_files.cpp:
--------------------------------------------------------------------------------
  1 | //********************************************************************************************
  2 | //* This is GPU implementation of a Overlap-and-save method for calculating convolution. 
  3 | //* Copyright (C) 2017  Adámek Karel
  4 | //* 
  5 | //* Authors: Karel Adamek ( ORCID:0000-0003-2797-0595; https://github.com/KAdamek ), Wesley Armour ( ORCID:0000-0003-1756-3064 ), Sofia Dimoudi 
  6 | //********************************************************************************************
  7 | 
  8 | 
  9 | #include <stdio.h>
 10 | #include <string.h>
 11 | #include <math.h>
 12 | #include <time.h>
 13 | #include <stdlib.h>
 14 | #include <iostream>
 15 | #include <fstream>
 16 | #include <iomanip> 
 17 | 
 18 | struct float2 {
 19 | 	float x;
 20 | 	float y;
 21 | };
 22 | 
 23 | void Generate_signal(float2 *h_input, int nTimesamples){
 24 | 	for(int f=0; f<nTimesamples; f++){
 25 | 		h_input[f].y=rand() / (float)RAND_MAX;
 26 | 		h_input[f].x=rand() / (float)RAND_MAX;
 27 | 	}
 28 | 	
 29 | 	for(int f=15000; f<nTimesamples; f++){
 30 | 		h_input[f].x = (f%4096)/500.0;
 31 | 	}
 32 | 	
 33 | 	for(int f=0; f<192; f++){
 34 | 		h_input[f + 5300].x = 10.0;
 35 | 	}
 36 | 	
 37 | 	for(int f=0; f<128; f++){
 38 | 		h_input[f + 8626].x = 10.0;
 39 | 	}
 40 | 	
 41 | 	for(int f=0; f<36; f++){
 42 | 		h_input[f + 9626].x = 10.0;
 43 | 	}
 44 | 	
 45 | 	for(int f=0; f<83; f++){
 46 | 		h_input[f + 10626].x = 10.0;
 47 | 	}
 48 | 	
 49 | 	for(int f=0; f<138; f++){
 50 | 		h_input[f + 11626].x = 10.0;
 51 | 	}
 52 | 	
 53 | }
 54 | 
 55 | void Generate_templates(float2 *h_templates, int nSamples, int nTemplates){
 56 | 	int boxcar_width, itemp;
 57 | 	itemp = (((float) nSamples)*0.8)/((float) nTemplates);
 58 | 	if (itemp==0) itemp++;
 59 | 	for(int t=0; t<nTemplates; t++){
 60 | 		boxcar_width = ((t+1)*itemp);
 61 | 		if(boxcar_width>nSamples) boxcar_width=nSamples;
 62 | 		for(int f=0; f<nSamples; f++){
 63 | 			if( f>=(nSamples/2-boxcar_width/2) && f<( nSamples/2+boxcar_width/2) ){
 64 | 				h_templates[t*nSamples + f].x=1;
 65 | 				h_templates[t*nSamples + f].y=0;
 66 | 			}
 67 | 			else {
 68 | 				h_templates[t*nSamples + f].x=0;
 69 | 				h_templates[t*nSamples + f].y=0;
 70 | 			}
 71 | 		}
 72 | 	}
 73 | }
 74 | 
 75 | 
 76 | 
 77 | int GPU_CONV(float2 *h_input_signal, float2 *h_output_plane_reduced, float2 *h_templates, int useful_part_size, int offset, int template_length, int nConvolutions, int nTemplates, int nRuns, double *execution_time);
 78 | int GPU_CONV_debug(float2 *h_input_signal, float2 *h_GPU_input_signal_extended, float2 *h_GPU_input_signal_extended_FFT, float2 *h_output_plane, float2 *h_output_plane_IFFT, float2 *h_output_plane_reduced, float2 *h_templates, int useful_part_size, int offset, int template_length, int nConvolutions, int nTemplates, int nRuns, double *execution_time);
 79 | 
 80 | 
 81 | int main(int argc, char* argv[]) {
 82 | 	int nTimesamples;
 83 | 	int template_length;
 84 | 	int nTemplates;
 85 | 	char filter_file[100];
 86 | 	char signal_file[100];
 87 | 
 88 | 	char * pEnd;
 89 | 	if (argc==6) {
 90 | 		nTimesamples    = strtol(argv[1],&pEnd,10);
 91 | 		template_length = strtol(argv[2],&pEnd,10);
 92 | 		nTemplates      = strtol(argv[3],&pEnd,10);
 93 | 		if (strlen(argv[4])>100) {printf("Filename of input signal file is too long\n"); exit(2);}
 94 | 		sprintf(signal_file,"%s",argv[4]);
 95 | 		if (strlen(argv[5])>100) {printf("Filename of input filter file is too long\n"); exit(2);}
 96 | 		sprintf(filter_file,"%s",argv[5]);
 97 | 	}
 98 | 	else {
 99 | 		printf("Argument error!\n");
100 | 		printf(" 1) Signal length in number of time samples (min 15000 samples)\n");
101 | 		printf(" 2) Filter length Example:129\n");
102 | 		printf(" 3) Number of filters\n");
103 | 		printf(" 4) Name of the file to export signal to\n");
104 | 		printf(" 5) Name of the file to export filters to\n");
105 |         return 1;
106 | 	}
107 | 	
108 | 	if (nTimesamples<15000) {printf("Number of samples must be higher then 15000 samples\n"); exit(1);}
109 | 	
110 | 	size_t input_size            = nTimesamples;
111 | 	size_t template_size_time    = nTemplates*template_length;
112 | 
113 | 	float2 *h_input_signal;
114 | 	float2 *h_templates;
115 | 
116 | 	h_input_signal = (float2 *)malloc(input_size*sizeof(float2));
117 | 	h_templates    = (float2 *)malloc(template_size_time*sizeof(float2));
118 | 
119 | 	memset(h_input_signal, 0.0, input_size*sizeof(float2));
120 | 	memset(h_templates, 0.0, template_size_time*sizeof(float2));
121 | 
122 | 	Generate_signal(h_input_signal, nTimesamples);
123 | 	Generate_templates(h_templates, template_length, nTemplates);
124 | 	
125 | 	
126 | 	std::ofstream FILEOUT;
127 | 	FILEOUT.open(signal_file);
128 | 	for(int ts=0; ts<nTimesamples; ts++){
129 | 		FILEOUT << h_input_signal[ts].x << " " << h_input_signal[ts].y << std::endl;
130 | 	}
131 | 	FILEOUT.close();
132 | 	
133 | 	FILEOUT.open(filter_file);
134 | 	for(int f=0; f<nTemplates; f++){
135 | 		for(int ts=0; ts<template_length; ts++){
136 | 			FILEOUT << h_templates[f*template_length + ts].x << " " << h_templates[f*template_length + ts].y << std::endl;
137 | 		}
138 | 	}
139 | 	FILEOUT.close();
140 | 	
141 | 	free(h_input_signal);
142 | 	free(h_templates);
143 | 
144 | 	return (0);
145 | }
146 | 


--------------------------------------------------------------------------------
/OLS_generate_files/Makefile:
--------------------------------------------------------------------------------
 1 | GCC = g++
 2 | 
 3 | GCC_OPTS =-O3 -Wall -Wextra $(INC)
 4 | 
 5 | ANALYZE = Example_files.exe
 6 | 
 7 | all: clean analyze
 8 | 
 9 | analyze: Makefile
10 | 	$(GCC) -o $(ANALYZE) Example_files.cpp $(GCC_OPTS)
11 | 
12 | clean:	
13 | 	rm -f *.o *.~ $(ANALYZE)
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | README v0.1 / 1 OCTOBER 2019
  2 | 27 JULY 2020 - convolutions with shared memory FFT can now process both odd and even length filters with arbitrary number of past and present samples not only odd time centred filters as before
  3 | 25 AUGUST 2020 - convolved signal is now multiplied by the normalization constant equal to the size of the FFT
  4 | 
  5 | ### Publication
  6 | If you find these codes useful please cite:
  7 | https://dl.acm.org/doi/10.1145/3394116
  8 |   
  9 | # Convolution using shared memory overlap-and-save method on NVIDIA GPUs
  10 | Overlap-and-save method for calculating linear one-dimensional convolution on NVIDIA GPUs using shared memory. Our implementation of the overlap-and-save method uses a shared memory implementation of the FFT algorithm to increase the performance of one-dimensional complex-to-complex or real-to-real convolutions. The speed-up achieved depends on the filter length: up to 2.5x for a filter length of 257 samples for complex-to-complex (C2C) and up to 4x for real-to-real (R2R) convolution.
 11 | 
  12 | Note: This implementation of OLS convolution uses a modified version of the GPU shared memory FFT which does not reorder the elements of the output. For the GPU shared memory FFT with the reordering step, please go to the [SMFFT repository](https://github.com/KAdamek/SMFFT)
 13 | 
 14 | ## Introduction
  15 | Convolution is a standard tool in signal processing. It is a linear operation where a signal s is modified by a filter (response function) r. There are two fundamental ways to calculate a convolution. We can calculate the convolution in the time-domain using the formula for discrete convolution (more in any signal processing literature, for example R. G. Lyons, Understanding digital signal processing 3rd ed, Prentice Hall, 2011). We can also invoke the convolution theorem and calculate the convolution in the frequency domain. To do this we need to perform a discrete Fourier transformation first, then perform the convolution in the frequency-domain and finally use the inverse discrete Fourier transformation to return to the time-domain. The overlap-and-save (OLS) method (more in Press et al., Numerical Recipes) of calculating convolution is designed for the special case where we have a very long input signal, but a relatively short filter. The disadvantage of the Fourier-domain convolution method in cases such as this is that the convolution theorem demands that both the signal and the filter in the Fourier-domain must be of the same length. This can be performance prohibitive if we are dealing with very long signals and multiple short filters. The overlap-and-save method separates the input signal into smaller segments which are then independently processed, which makes this method ideal for parallel processing, for example on GPUs. At the end the overlap-and-save method combines the useful parts of these segments in such a way as to produce the linear convolution.
 16 | 
 17 | We provide two implementations of the overlap-and-save method. The first uses the vendor-provided FFT library, the NVIDIA cuFFT library (cuFFT-OLS), for calculating the necessary FFTs; the second uses our shared memory implementation of the FFT algorithm and performs the overlap-and-save method in shared memory (SM-OLS) without accessing the device memory. The advantage of having a shared memory FFT algorithm is that we can perform all necessary steps of the overlap-and-save method inside one CUDA kernel, thus saving costly device memory transactions and providing a significant speedup over the cuFFT implementation. 
 18 | 
 19 | This technique was used in GPU implementation of the Fourier domain acceleration search for time-domain radio astronomy and it is a part of AstroAccelerate (https://github.com/AstroAccelerateOrg/astro-accelerate).
 20 | 
 21 | Current version of the code aims to demonstrate performance gain of the shared memory implementation and for performance testing. 
 22 | 
 23 | ## Usage
 24 | In total there are eight different implementations of the overlap-and-save (OLS) method. There is an OLS which uses the NVIDIA cuFFT library (cuFFT-OLS) and a shared memory implementation of the OLS method (SM-OLS) which uses a shared memory implementation of the FFT algorithm. Both of these are for one-dimensional complex-to-complex or real-to-real convolutions. Each implementation also has a version with non-local post-processing in the form of numerical differentiation (distinguished by "_pp" in the directory name). 
 25 | There are two modes of operation, one for performance testing (first argument 'r') which does not require user to provide the input data and one for processing user data (first argument 'f'). The command line arguments are slightly different between cuFFT-OLS and SM-OLS.
 26 | 
 27 | ### Convolutions using NVIDIA cuFFT library
 28 | The arguments expected by the cuFFT-OLS depend on the chosen mode of operation. 
 29 |  1) Input type: 'r' or 'f'
 30 | 
 31 | Parameters if input type is 'f' - file input provided by user
 32 |  2) Input signal file
 33 |  3) Input filter file
 34 |  4) Output signal file
 35 |  5) number of filters
 36 |  Example: CONV.exe f signal.dat filter.dat output.dat 32
 37 | 
 38 | Parameters if input type is 'r' - random input generated by the code
 39 |  2) Signal length in number of time samples
 40 |  3) Filter length in samples
 41 |  4) Number of templates
 42 |  5) number of GPU kernel runs
 43 |  Example: CONV.exe r 2097152 193 32 10
 44 |  
 45 | The cuFFT-OLS implementation expects that CONV_SIZE is #defined in 'params.h'. This constant determines the size of the segment which will be processed; the best performing value is in most cases 8192. 
 46 | 
 47 | ### Convolutions using shared memory FFT
 48 | The command line arguments are slightly different for SM-OLS. In addition to the same parameters from cuFFT-OLS it also requires the user to give the segment size (FFT size):
 49 |  1) Input type: 'r' or 'f'
 50 | ----------------------------------
 51 | Parameters if input type is 'f' - file input provided by user
 52 |  2) Input signal file
 53 |  3) Input filter file
 54 |  4) Output signal file
 55 |  5) Convolution length in samples
 56 |  6) number of filters
 57 |  7) number of past samples in the filter.
 58 |     for past filter (causal) it is (filter_length - 1)
 59 |     for odd centred filter it is floor(filter_length/2)
 60 |     for future filter it is 0
 61 |  Example: CONV.exe f signal.dat filter.dat output.dat 2048 32 192
 62 | ----------------------------------
 63 | Parameters if input type is 'r' - random input generated by the code
 64 |  2) Signal length in number of time samples
 65 |  3) Filter length in samples
 66 |  4) number of past samples in the filter.
 67 |     for past filter (causal) it is (filter_length - 1)
 68 |     for odd centred filter it is floor(filter_length/2)
 69 |     for future filter it is 0
 70 |  5) Convolution length in samples
 71 |  6) Number of filters
 72 |  7) number of GPU kernel runs
 73 |  Example: CONV.exe r 2097152 193 192 2048 32 10
 74 | 
 75 | 		
 76 | In case of SM-FFT implementation there is no universal segment size. The maximum segment size is 4096. 
 77 | 
 78 | Beware when using very long signals, for example a signal with 2^23 time samples with 100 filters will take 6.4GB of memory.
 79 | 
 80 | If enabled in debug.h the code will write out timing and parameters of given run for analysis of the results. The columns of the output file are:
 81 | 1) number of time samples
 82 | 2) filter length 
 83 | 3) number of filters
 84 | 4) average execution time of convolution kernel (no transfer time from/to host are included)
 85 | 5) number of executions from which the execution time is calculated
 86 | 6) limitation on number of registers if set
 87 | 7) CONV_SIZE
 88 | 8) number of filters processed per CUDA thread block
 89 | 9) type of the kernel used
 90 | 
 91 | 
 92 | 
 93 | ## Generating example files
 94 | The example files could be generated by using 'GPU_OaS_generate_files'. The code for generating example files expects following arguments:
 95 | 1) Signal length in number of time samples (min 15000 samples)
 96 | 2) Filter length, for example 129 (odd, because we have assumed a centered filter)
 97 | 3) Number of filters
 98 | 4) Name of the file to export signal to
 99 | 5) Name of the file to export filters to
100 | For example:
101 | Example_files.exe 50000 129 32 signal.dat filter.dat
102 | 
103 | Structure of input and output files
104 | In the input file (signal or filter), each complex time sample is written on an individual line where the real part is in the first column and the imaginary part is in the second column. Filters are written one after another without any additional lines. Input for the real-to-real convolution is the same, with the difference that for the input signal the power sqrt(real^2+imaginary^2) is used and for the filter the imaginary part is ignored.
105 | 
106 | The structure of the output file is divided into blocks by filters and they are separated by empty lines. The columns in the output file are as follows:
107 | 1) filter index 0<=i<(number of filters)
108 | 2) time sample
109 | 3) real part of convolved signal
110 | 4) imaginary part of convolved signal (only for complex-to-complex convolutions)
111 | 
112 | in gnuplot one can display powers of complex-to-complex convolved signal as: splot 'output.dat' using 1:2:($3*$3+$4*$4) palette 
113 | for three-dimensional plot. 
114 | Results for one particular filter can be displayed using
115 | plot 'output.dat' using ($1==X?$2:1/0):($3*$3+$4*$4) w lines
116 | where X is the number of the filter starting with zero.
117 | 
118 | Be advised that the code might produce very big files for long signals and large number of filters.
119 | 
120 | For real-to-real the output has only real component thus one can use "using ...:3" instead of "using ...:($3*$3+$4*$4)".
121 | 
122 | 
123 | 
124 | 
125 | ## Installation
126 | 
127 | ### Requirements
128 | 
129 | NVIDIA GPU and CUDA Toolkit
130 | 
131 | ### Installation
132 | We have provided a make file which should take care of the compilation step. The make file assumes that environmental variable 'CUDA_HOME' is set and that it points to a folder containing installation of CUDA Toolkit. The compute capability also needs to be set in the make file. You can change the architecture by using a flag -arch=X where X is compute capability. For TitanV from Volta generation that is -arch=sm_70.
133 | 
134 | The code does not require any other dependencies.
135 | 
136 | 
137 | ### Configuration
138 | 
139 | The code behaviour can be changed by editing debug.h, which is located in each of the directories. The debug.h contains the following options:
140 | VERBOSE enables more verbose output to the console
141 | DEBUG displays debugging information
142 | CHECK enables a test which checks the output of the overlap-and-save convolution against a time-domain convolution
143 | WRITE enables writing of the execution time and other parameters into a file
144 | 
145 | DEVICEID 0 with this you can set the id of the device which should be used during code execution
146 | POST_PROCESS a flag which enables the non-local post-processing used for the article. If commented out, the code will perform normal convolution without post-processing.
147 | 
148 | ### Additional files
149 | Together with the code we also provide scripts for benchmarking and R script for processing resulting data.
150 | 
151 | ## Future work
152 | 	We would like to improve the code and create proper library which could be used directly without modification.
153 | 
154 | ## Contributors
155 | 	Karel Adamek
156 | 	Sofia Dimoudi
157 | 	Wes Armour
158 | 	Mike Giles
159 | 
160 | ## Contact
161 | 	You can contact me using my email karel.adamek@gmail.com
162 | 
163 | ## License
164 | 
165 | This project is licensed under [insert license]. The license should be in a separate file called LICENSE, so don't explain it in detail within your documentation. Also, don't forget to specify licenses of third-party libraries and programs you use.
166 | 
167 | Sometimes including a Table of Contents (TOC) at the beginning of the documentation makes sense, especially when your README file is more than a few paragraphs. If you think that the README file has grown too large, put some of the more detailed parts, such as installation or configuration sections, into their own files.
168 | 
169 | 


--------------------------------------------------------------------------------
/process_AAFFT_results.R:
--------------------------------------------------------------------------------
  1 | output_extension <- ".txt";
  2 | inpath  <- "";
  3 | outpath <- "";
  4 | AAFFT_file <- "Conv_CUDA10.0_TitanV_OLS_R2R_customFFT_perf.dat";
  5 | cuFFT_file <- "Conv_CUDA10.0_TitanV_OLS_R2R_cuFFT_callbacks_mk2_perf.dat";
  6 | 
  7 | newline <- ""
  8 | 
  9 | #Flags: a value of 1 switches the matching export section further down on
 10 | export_best_performance <- 0;
 11 | export_data_grouped_by_template_width <- 0;
 12 | export_nTemplates_time_speedup_gr_template_width_one_file <- 1;
 13 | export_signal_length_time_speedup_gr_nTemplates_one_file <- 1;
 14 | 
 15 | #Load both result tables; later sections address their columns by position
 16 | AAFFTdata <- read.table(paste0(inpath, AAFFT_file));
 17 | cuFFTdata <- read.table(paste0(inpath, cuFFT_file));
 18 | 
 19 | combined <- rbind(AAFFTdata, cuFFTdata);
 20 | signal_lengths      <- unique(combined[[1]]);
 21 | template_lengths    <- unique(combined[[2]]);
 22 | number_of_templates <- unique(combined[[3]]);
 23 | rm(combined);
 24 | 
 25 | 
 26 | #-------------------------------------------------
 27 | #AAFFT: for every parameter combination keep the row with the smallest column 4
 28 | bestAAFFTperformance <- AAFFTdata[0,];  #zero-row frame with the same columns
 29 | 
 30 | #Visit each (template length, number of templates, signal length) combination
 31 | for (tl in template_lengths){
 32 |   templengthdata <- AAFFTdata[AAFFTdata[[2]] == tl, ];
 33 |   for (nt in number_of_templates){
 34 |     by_count <- templengthdata[templengthdata[[3]] == nt, ];
 35 |     for (sl in signal_lengths){
 36 |       slengthdata <- by_count[by_count[[1]] == sl, ];
 37 |       #combinations without any rows contribute nothing
 38 |       if (nrow(slengthdata) == 0) next;
 39 |       fastest <- slengthdata[which.min(slengthdata[[4]]), ];
 40 |       bestAAFFTperformance <- rbind(bestAAFFTperformance, fastest);
 41 |     }
 42 |   }
 43 | }
 44 | #rm(AAFFTdata);
 45 | #--------------------------------------------------<
 46 | 
 47 | #-------------------------------------------------
 48 | #cuFFT: for every parameter combination keep the row with the smallest column 4
 49 | bestcuFFTperformance <- cuFFTdata[0,];  #zero-row frame with the same columns
 50 | 
 51 | #Visit each (template length, number of templates, signal length) combination
 52 | for (tl in template_lengths){
 53 |   templengthdata <- cuFFTdata[cuFFTdata[[2]] == tl, ];
 54 |   for (nt in number_of_templates){
 55 |     by_count <- templengthdata[templengthdata[[3]] == nt, ];
 56 |     for (sl in signal_lengths){
 57 |       slengthdata <- by_count[by_count[[1]] == sl, ];
 58 |       #combinations without any rows contribute nothing
 59 |       if (nrow(slengthdata) == 0) next;
 60 |       fastest <- slengthdata[which.min(slengthdata[[4]]), ];
 61 |       bestcuFFTperformance <- rbind(bestcuFFTperformance, fastest);
 62 |     }
 63 |   }
 64 | }
 65 | #rm(cuFFTdata);
 66 | #--------------------------------------------------<
 67 | 
 68 | 
 69 | #Write the best AAFFT and cuFFT rows, one file per (signal length, filter length) pair
 70 | if (export_best_performance == 1) {
 71 |   for (nLenght in template_lengths) {
 72 |     templengthdata <- bestAAFFTperformance[bestAAFFTperformance[[2]] == nLenght, ];
 73 |     for (slength in signal_lengths) {
 74 |       slengthdata <- templengthdata[templengthdata[[1]] == slength, ];
 75 |       #file name: Best_kFFT_<signal length>_<filter length><output_extension>
 76 |       #any previous file of that name is removed before it is written again
 77 |       filename <- paste0(paste("Best_kFFT", slength, nLenght, sep="_"), output_extension);
 78 |       unlink(filename);
 79 |       write.table(slengthdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE);
 80 |     }
 81 |   }
 82 |   
 83 |   #the same export for the cuFFT rows
 84 |   extension=".txt";
 85 |   for (nLenght in template_lengths) {
 86 |     templengthdata <- bestcuFFTperformance[bestcuFFTperformance[[2]] == nLenght, ];
 87 |     for (slength in signal_lengths) {
 88 |       slengthdata <- templengthdata[templengthdata[[1]] == slength, ];
 89 |       #file name: Best_cuFFT_<signal length>_<filter length><output_extension>
 90 |       #any previous file of that name is removed before it is written again
 91 |       filename <- paste0(paste("Best_cuFFT", slength, nLenght, sep="_"), output_extension);
 92 |       unlink(filename);
 93 |       write.table(slengthdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE);
 94 |     }
 95 |   }
 96 | }
 97 | 
 98 | 
 99 | #Export time for each signal length, grouped by template width (AAFFT and cuFFT in separate files)
100 | if(export_data_grouped_by_template_width==1){
101 |   for (nLenght in template_lengths){
102 |     AAFFTdata_fixedTemplate <- bestAAFFTperformance[(bestAAFFTperformance[[2]]==nLenght),];
103 |     #set up container: column 3 of the rows recorded for the first signal length
104 |     AAFFTdata_temp<-AAFFTdata_fixedTemplate[(AAFFTdata_fixedTemplate[[1]]==signal_lengths[[1]]),];
105 |     resultdata<-cbind(AAFFTdata_temp[[3]]);
106 |     #add signal length and time columns, taken from the best AAFFT rows of the current template length
107 |     for (slength in signal_lengths){
108 |       AAFFTdata_temp<-AAFFTdata_fixedTemplate[(AAFFTdata_fixedTemplate[[1]]==slength),];
109 |       if (length(AAFFTdata_temp[,1]) > 0){
110 |         resultdata<-cbind(resultdata, AAFFTdata_temp[[1]], AAFFTdata_temp[[4]]);
111 |       }
112 |     }
113 |     #export data to AAFFT_results_<template length><output_extension>
114 |     filename="AAFFT_results";
115 |     filename<-paste(filename, nLenght, sep="_");
116 |     filename<-paste(filename, output_extension, sep="");
117 |     unlink(filename);
118 |     write.table(resultdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE);
119 |   }
120 | 
121 |   for (nLenght in template_lengths){
122 |     cuFFTdata_fixedTemplate <- bestcuFFTperformance[(bestcuFFTperformance[[2]]==nLenght),];
123 |     #set up container: column 3 of the rows recorded for the first signal length
124 |     cuFFTdata_temp<-cuFFTdata_fixedTemplate[(cuFFTdata_fixedTemplate[[1]]==signal_lengths[[1]]),];
125 |     resultdata<-cbind(cuFFTdata_temp[[3]]);
126 |     #add signal length and time columns, taken from the best cuFFT rows of the current template length
127 |     for (slength in signal_lengths){
128 |       cuFFTdata_temp<-cuFFTdata_fixedTemplate[(cuFFTdata_fixedTemplate[[1]]==slength),];
129 |       if (length(cuFFTdata_temp[,1]) > 0){
130 |         resultdata<-cbind(resultdata, cuFFTdata_temp[[1]], cuFFTdata_temp[[4]]);
131 |       }
132 |     }
133 |     #export data to cuFFT_results_<template length><output_extension>
134 |     filename="cuFFT_results";
135 |     filename<-paste(filename, nLenght, sep="_");
136 |     filename<-paste(filename, output_extension, sep="");
137 |     unlink(filename);
138 |     write.table(resultdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE);
139 |   }
140 | }
141 | 
142 | 
143 | #Export time and speedup vs. nTemplates grouped by template width; AAFFT and cuFFT share one file
144 | if (export_nTemplates_time_speedup_gr_template_width_one_file == 1) {
145 |   for (nLenght in template_lengths) {
146 |     AAFFTdata_fixedTemplate <- bestAAFFTperformance[bestAAFFTperformance[[2]] == nLenght, ];
147 |     cuFFTdata_fixedTemplate <- bestcuFFTperformance[bestcuFFTperformance[[2]] == nLenght, ];
148 |     
149 |     #set up container: column 3 of the AAFFT rows recorded for the first signal length
150 |     AAFFTdata_temp <- AAFFTdata_fixedTemplate[AAFFTdata_fixedTemplate[[1]] == signal_lengths[[1]], ];
151 |     resultdata <- cbind(AAFFTdata_temp[[3]]);
152 |     #per signal length append: signal length, cuFFT time, AAFFT time, cuFFT/AAFFT ratio
153 |     for (slength in signal_lengths) {
154 |       AAFFTdata_temp <- AAFFTdata_fixedTemplate[AAFFTdata_fixedTemplate[[1]] == slength, ];
155 |       cuFFTdata_temp <- cuFFTdata_fixedTemplate[cuFFTdata_fixedTemplate[[1]] == slength, ];
156 |       #the ratio is above 1 wherever the AAFFT value in column 4 is the smaller one
157 |       resultdata <- cbind(resultdata, AAFFTdata_temp[[1]], cuFFTdata_temp[[4]], AAFFTdata_temp[[4]], cuFFTdata_temp[[4]]/AAFFTdata_temp[[4]]);
158 |     }
159 |     #file name: Results_TitanV_cuFFT_callbacks_R2R_speedup_<template length><output_extension>
160 |     filename <- paste("Results_TitanV_cuFFT_callbacks_R2R_speedup", nLenght, sep="_");
161 |     filename <- paste0(filename, output_extension);
162 |     #remove any previous file of that name before writing it again
163 |     unlink(filename);
164 |     write.table(resultdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE);
165 |   }
166 | }
167 | if (exists("resultdata", inherits = FALSE)) rm(resultdata)
168 | 
169 | #Export time and speedup vs signal length grouped by number of templates, both AAFFT and cuFFT in one file per template length
170 | if(export_signal_length_time_speedup_gr_nTemplates_one_file==1){
171 | 	for (nLenght in template_lengths){
172 | 		AAFFTdata_fixedTemplate <- bestAAFFTperformance[(bestAAFFTperformance[[2]]==nLenght),];
173 | 		cuFFTdata_fixedTemplate <- bestcuFFTperformance[(bestcuFFTperformance[[2]]==nLenght),];
174 | 		#NOTE(review): number_of_templates[[3]] below is a hard-coded index; it fails when fewer than three distinct values exist - confirm intent
175 | 		#set up container: signal lengths (column 1) of the AAFFT rows whose column 3 equals the third entry of number_of_templates
176 | 		AAFFTdata_temp <- AAFFTdata_fixedTemplate[(AAFFTdata_fixedTemplate[[3]]==number_of_templates[[3]]),];
177 | 		signal_length <- cbind(AAFFTdata_temp[[1]]);
178 | 		resultdata <- cbind(AAFFTdata_temp[[1]]);
179 | 		#for each number of templates append four columns: merged positions 3, 12, 4 and the ratio of positions 12 and 4
180 | 		for (nTemplates in number_of_templates){
181 | 			temporaryresults <- signal_length
182 | 			AAFFTdata_temp <- AAFFTdata_fixedTemplate[(AAFFTdata_fixedTemplate[[3]]==nTemplates),];
183 | 			cuFFTdata_temp <- cuFFTdata_fixedTemplate[(cuFFTdata_fixedTemplate[[3]]==nTemplates),];
184 | 			#all.x=TRUE keeps every container signal length; NOTE(review): position 12 is cuFFT column 4 only if both tables have 9 columns, and merge() may reorder rows by V1 relative to the first column of resultdata - confirm
185 | 			temporaryresults <-merge(temporaryresults, AAFFTdata_temp, by.x='V1', by.y='V1', all.x=TRUE, all.y=FALSE)
186 | 			temporaryresults <-merge(temporaryresults, cuFFTdata_temp, by.x='V1', by.y='V1', all.x=TRUE, all.y=FALSE)
187 | 			local_results<-cbind(temporaryresults[[3]], temporaryresults[[12]], temporaryresults[[4]], temporaryresults[[12]]/temporaryresults[[4]]);
188 | 			resultdata<-cbind(resultdata, local_results);
189 | 			rm(temporaryresults);
190 | 		}
191 | 		#export data to Results_TitanV_cuFFT_callbacks_R2R_speedup_signal_length_<template length><output_extension>
192 | 		filename="Results_TitanV_cuFFT_callbacks_R2R_speedup_signal_length";
193 | 		filename<-paste(filename,nLenght,sep="_");
194 | 		filename<-paste(filename,output_extension,sep="");
195 | 		unlink(filename);
196 | 		write.table(resultdata, file = filename, append = FALSE, sep = " ", row.names=FALSE, col.names=FALSE, quote = FALSE);
197 | 	}
198 | }
199 | 


--------------------------------------------------------------------------------