├── .gitignore
├── LICENSE.txt
├── Makefile
├── README.md
├── acc_malloc.c
├── cuda_main.cu
├── cuda_map.cu
├── cuf_main.cuf
├── kernels.cuf
├── openacc_c_cublas.c
├── openacc_c_cublas_v2.c
├── openacc_c_main.c
├── openacc_cublas.f90
├── openacc_cuda_device.cpp
├── openacc_main.f90
├── openacc_streams.c
├── saxpy_cuda.cu
├── saxpy_cuda_async.cu
├── saxpy_cuda_device.cu
├── saxpy_openacc_c.c
├── saxpy_openacc_c_mapped.c
├── saxpy_openacc_f.f90
└── thrust.cu


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | core
 3 | .*.swp
 4 | a.out
 5 | *.mod
 6 | *.lst
 7 | *.ptx
 8 | *.cub
 9 | cuda_profile_*.log
10 | cuf_main
11 | thrust
12 | cuf_openacc_main
13 | cuda_main
14 | openacc_cublas
15 | openacc_c_cublas
16 | openacc_c_main
17 | cuda_map
18 | openacc_streams
19 | openacc_cuda_device
20 | acc_malloc
21 | openacc_c_cublas_v2
22 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2013, Jeff Larkin
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Targets that build with either toolchain; the non-Cray branch below appends
 2 | # the targets that need CUDA Fortran or the Fortran cuBLAS module.
 3 | EXES=cuda_main openacc_c_main openacc_c_cublas openacc_c_cublas_v2 thrust cuda_map acc_malloc openacc_streams openacc_cuda_device
 4 | 
 5 | ifeq "$(PE_ENV)" "CRAY"
 6 | # Cray Compiler
 7 | CXX=CC
 8 | CXXFLAGS=-hlist=a
 9 | CC=cc
10 | CFLAGS=-hlist=a
11 | CUDAC=nvcc
12 | CUDAFLAGS=-gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80
13 | FC=ftn
14 | FFLAGS=-ra
15 | LDFLAGS=-L$(CUDA_HOME)/lib64 -lcudart
16 | else
17 | # PGI Compiler
18 | EXES+=cuf_main cuf_openacc_main openacc_cublas
19 | CXX=nvc++
20 | CXXFLAGS=-fast -acc -Minfo=accel -gpu=cc60,cc70,cc75,cc80
21 | CC=nvc
22 | CFLAGS=-fast -acc -Minfo=accel -gpu=cc60,cc70,cc75,cc80
23 | CUDAC=nvcc
24 | # Hard-coded architectures to avoid build issue when arches are added or
25 | # removed from compilers
26 | CUDAFLAGS=-gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80
27 | FC=nvfortran
28 | FFLAGS=-fast -acc -Minfo=accel -gpu=cc60,cc70,cc75,cc80
29 | LDFLAGS=-Mcuda
30 | endif
31 | 
32 | all: $(EXES)
33 | 
34 | # Each link line passes the flag variable that belongs to the driver it runs
35 | # (FFLAGS with FC, CFLAGS with CC, CXXFLAGS with CXX).
36 | openacc_cublas: openacc_cublas.o
37 | 	$(FC) -o $@ $(FFLAGS) $^ $(LDFLAGS) -Mcudalib=cublas
38 | 
39 | openacc_c_cublas: openacc_c_cublas.o
40 | 	$(CC) -o $@ $(CFLAGS) $^ $(LDFLAGS) -Mcudalib=cublas
41 | 
42 | openacc_c_cublas_v2: openacc_c_cublas_v2.o
43 | 	$(CC) -o $@ $(CFLAGS) $^ $(LDFLAGS) -Mcudalib=cublas
44 | 
45 | openacc_c_main: saxpy_cuda.o openacc_c_main.o
46 | 	$(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS)
47 | 
48 | cuda_main: saxpy_openacc_c.o cuda_main.o
49 | 	$(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS)
50 | 
51 | cuf_main: cuf_main.o
52 | 	$(FC) -o $@ $(FFLAGS) $^ $(LDFLAGS) -Mcuda
53 | 
54 | cuf_openacc_main: kernels.o openacc_main.o
55 | 	$(FC) -o $@ $(FFLAGS) $^ $(LDFLAGS) -Mcuda
56 | 
57 | # openacc_main.f90 uses module saxpy_mod, which kernels.cuf defines, so
58 | # kernels.o has to be compiled first (matters for parallel builds).
59 | openacc_main.o: kernels.o
60 | 
61 | thrust: saxpy_openacc_c.o thrust.o
62 | 	$(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS) -lstdc++
63 | 
64 | cuda_map: saxpy_openacc_c_mapped.o cuda_map.o
65 | 	$(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS)
66 | 
67 | acc_malloc: saxpy_openacc_c.o acc_malloc.o
68 | 	$(CC) -o $@ $(CFLAGS) $^ $(LDFLAGS)
69 | 
70 | openacc_cuda_device: saxpy_cuda_device.o openacc_cuda_device.o
71 | 	$(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS)
72 | 
73 | # Compiled with -rdc true (relocatable device code), unlike the generic .cu.o rule.
74 | saxpy_cuda_device.o: saxpy_cuda_device.cu
75 | 	$(CUDAC) $(CUDAFLAGS) -rdc true -c $<
76 | 
77 | openacc_streams: saxpy_cuda_async.o openacc_streams.o
78 | 	$(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS)
79 | 
80 | .SUFFIXES:
81 | .SUFFIXES: .c .o .f90 .cu .cpp .cuf
82 | .c.o:
83 | 	$(CC) $(CFLAGS) -c $<
84 | .cpp.o:
85 | 	$(CXX) $(CXXFLAGS) -c $<
86 | .f90.o:
87 | 	$(FC) $(FFLAGS) -c $<
88 | .cuf.o:
89 | 	$(FC) $(FFLAGS) -c $<
90 | .cu.o:
91 | 	$(CUDAC) $(CUDAFLAGS) -c $<
92 | .PHONY: all clean
93 | clean:
94 | 	rm -rf *.o *.ptx *.cub *.lst *.mod $(EXES)
95 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Stupid OpenACC (Interoperability) Tricks
 2 | ========================================
 3 | Author: Jeff Larkin <jlarkin@nvidia.com>
 4 | 
 5 | This repository demonstrates interoperability between OpenACC and various other
 6 | GPU programming models. An OpenACC-enabled compiler is required. The default
 7 | makefile has been written for PGI and tested with PGI 20.9, although most if
 8 | not all examples will work with earlier versions.
 9 | 
10 | If building with the Cray Compiler Environment (CCE), the makefile will detect
11 | this and adjust compiler flags and targets accordingly. Some targets rely on
12 | PGI CUDA Fortran features; these targets are disabled when building with CCE.
13 | 
14 | Build Instructions:
15 | -------------------
16 | $ make 
17 | 
18 | Examples
19 | --------
20 | * cuda\_main - Calling OpenACC from CUDA C
21 | * openacc\_c\_main - Calling CUDA from OpenACC in C
22 | * openacc\_c\_cublas - Calling CUBLAS from OpenACC in C
23 | * thrust - Mixing OpenACC and Thrust in C++
24 | * cuda\_map - Using OpenACC acc\_map\_data with CUDA in C
25 | * cuf\_main - Calling OpenACC from CUDA Fortran
26 | * cuf\_openacc\_main - Calling CUDA Fortran from OpenACC
27 | * openacc\_cublas - Calling CUBLAS from OpenACC in CUDA Fortran
28 | * acc\_malloc - Same as cuda\_main, but using the OpenACC API
29 | * openacc\_streams  - Mixes OpenACC async queues and CUDA streams
30 | * openacc\_cuda\_device - Calls a CUDA \_\_device\_\_ function within an OpenACC
31 |   region
32 | 


--------------------------------------------------------------------------------
/acc_malloc.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <unistd.h>
 4 | #include <openacc.h>
 5 | 
 6 | void saxpy(int,float,float*,float*);
 7 | void set(int,float,float*);
 8 | 
 9 | /*
10 |  * Allocates x and y with acc_malloc, fills them with set(), runs saxpy() on
11 |  * them, copies y[0] back with acc_memcpy_from_device and prints it.
12 |  * Returns EXIT_FAILURE if either allocation fails.
13 |  */
14 | int main(int argc, char **argv)
15 | {
16 |   float *x, *y, tmp = 0.0f;
17 |   const int n = 1<<20;
18 |   const size_t bytes = (size_t)n*sizeof(float);
19 | 
20 |   (void)argc; (void)argv;
21 | 
22 |   x = acc_malloc(bytes);
23 |   y = acc_malloc(bytes);
24 |   if ( x == NULL || y == NULL )
25 |   {
26 |     /* Never hand a null pointer to the deviceptr regions in set()/saxpy(). */
27 |     fprintf(stderr, "acc_malloc failed\n");
28 |     if ( x != NULL ) acc_free(x);
29 |     if ( y != NULL ) acc_free(y);
30 |     return EXIT_FAILURE;
31 |   }
32 | 
33 |   set(n,1.0f,x);
34 |   set(n,0.0f,y);
35 | 
36 |   saxpy(n, 2.0f, x, y);
37 |   acc_memcpy_from_device(&tmp,y,sizeof(float));
38 |   acc_free(x);
39 |   acc_free(y);
40 |   printf("%f\n",tmp);
41 |   return 0;
42 | }
43 | 


--------------------------------------------------------------------------------
/cuda_main.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <unistd.h>
 4 | #include <cuda_runtime.h>
 5 | 
 6 | extern "C" void saxpy(int,float,float*,float*);
 7 | extern "C" void set(int,float,float*);
 8 | 
 9 | /*
10 |  * Allocates x and y with cudaMalloc, lets the OpenACC routines set() and
11 |  * saxpy() work on those device pointers, then copies y[0] back and prints it.
12 |  * Returns EXIT_FAILURE if a CUDA runtime call reports an error.
13 |  */
14 | int main(int argc, char **argv)
15 | {
16 |   float *x = NULL, *y = NULL, tmp = 0.0f;
17 |   const int n = 1<<20;
18 |   const size_t bytes = (size_t)n*sizeof(float);
19 |   cudaError_t err;
20 | 
21 |   (void)argc; (void)argv;
22 | 
23 |   err = cudaMalloc((void**)&x, bytes);
24 |   if ( err == cudaSuccess )
25 |     err = cudaMalloc((void**)&y, bytes);
26 |   if ( err != cudaSuccess )
27 |   {
28 |     fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
29 |     cudaFree(x);  /* x is still NULL if the first allocation failed */
30 |     return EXIT_FAILURE;
31 |   }
32 | 
33 |   set(n,1.0f,x);
34 |   set(n,0.0f,y);
35 | 
36 |   saxpy(n, 2.0f, x, y);
37 |   err = cudaMemcpy(&tmp,y,sizeof(float),cudaMemcpyDeviceToHost);
38 |   cudaFree(x);
39 |   cudaFree(y);
40 |   if ( err != cudaSuccess )
41 |   {
42 |     fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
43 |     return EXIT_FAILURE;
44 |   }
45 |   printf("%f\n",tmp);
46 |   return 0;
47 | }
48 | 


--------------------------------------------------------------------------------
/cuda_map.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <unistd.h>
 4 | #include <cuda_runtime.h>
 5 | 
 6 | extern "C" void saxpy(int,float,float*,float*);
 7 | extern "C" void set(int,float,float*);
 8 | extern "C" void map(float*, float*, int);
 9 | 
10 | /*
11 |  * Allocates host buffers x,y and device buffers dx,dy, associates each pair
12 |  * through map(), runs set() and saxpy() on the host pointers, then reads the
13 |  * first result element straight from dy and prints it.
14 |  * Returns EXIT_FAILURE if an allocation or the final copy fails.
15 |  */
16 | int main(int argc, char **argv)
17 | {
18 |   float *x, *y, *dx = NULL, *dy = NULL, tmp = 0.0f;
19 |   const int n = 1<<20;
20 |   const size_t bytes = (size_t)n*sizeof(float);
21 |   cudaError_t err;
22 | 
23 |   (void)argc; (void)argv;
24 | 
25 |   x = (float*) malloc(bytes);
26 |   y = (float*) malloc(bytes);
27 |   if ( x == NULL || y == NULL )
28 |   {
29 |     fprintf(stderr, "host allocation failed\n");
30 |     free(x);
31 |     free(y);
32 |     return EXIT_FAILURE;
33 |   }
34 | 
35 |   err = cudaMalloc((void**)&dx, bytes);
36 |   if ( err == cudaSuccess )
37 |     err = cudaMalloc((void**)&dy, bytes);
38 |   if ( err != cudaSuccess )
39 |   {
40 |     fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
41 |     cudaFree(dx);  /* dx is still NULL if the first allocation failed */
42 |     free(x);
43 |     free(y);
44 |     return EXIT_FAILURE;
45 |   }
46 | 
47 |   /* map() takes the byte count as an int. */
48 |   map(x, dx, (int)bytes);
49 |   map(y, dy, (int)bytes);
50 | 
51 |   set(n,1.0f,x);
52 |   set(n,0.0f,y);
53 | 
54 |   saxpy(n, 2.0f, x, y);
55 |   err = cudaMemcpy(&tmp,dy,sizeof(float),cudaMemcpyDeviceToHost);
56 |   if ( err != cudaSuccess )
57 |   {
58 |     fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
59 |     return EXIT_FAILURE;
60 |   }
61 |   /* The buffers are left mapped and allocated until process exit: no unmap
62 |      routine is declared in this file. */
63 |   printf("%f\n",tmp);
64 |   return 0;
65 | }
66 | 


--------------------------------------------------------------------------------
/cuf_main.cuf:
--------------------------------------------------------------------------------
 1 | ! SAXPY on CUDA Fortran device arrays, computed inside an OpenACC kernels region.
 2 | program main
 3 |   implicit none
 4 |   integer, parameter :: N = 2**20
 5 |   ! Allocate X and Y only on the device
 6 |   real, device, dimension(N) :: X, Y
 7 |   real :: tmp
 8 | 
 9 |   ! CUDA Fortran will automatically convert these to run on the device
10 |   X(:) = 1.0
11 |   Y(:) = 0.0
12 | 
13 |   ! deviceptr: x and y are already device data, so OpenACC must not map them
14 |   !$acc kernels deviceptr(x,y)
15 |   y(:) = y(:) + 2.0*x(:)
16 |   !$acc end kernels
17 | 
18 |   ! Copy the first element back from Y for correctness checking
19 |   tmp = y(1)
20 |   print *, tmp
21 | end program
22 | 


--------------------------------------------------------------------------------
/kernels.cuf:
--------------------------------------------------------------------------------
 1 | module saxpy_mod
 2 |   contains
 3 |   ! Device kernel: the thread with 1-based global index i updates y(i) when i <= n.
 4 |   attributes(global) &
 5 |   subroutine saxpy_kernel(n, a, x, y)
 6 |     real :: x(:), y(:), a
 7 |     integer :: n,i
 8 |     attributes(value) :: a,n
 9 |     i = threadIdx%x+(blockIdx%x-1)*blockDim%x
10 |     if (i<=n) y(i) = y(i) + a*x(i)
11 |   end subroutine
12 | 
13 | ! Host wrapper: launches saxpy_kernel on the device arrays x and y with enough
14 | ! 256-thread blocks to cover n elements (the kernel discards surplus threads).
15 | subroutine saxpy (n, a, x, y)
16 |   use cudafor
17 |   real, device :: x(:), y(:)
18 |   real :: a
19 |   integer :: n
20 |   integer, parameter :: threads = 256
21 |   integer :: blocks
22 |   ! Nothing to do, and a zero-sized grid is not a valid launch
23 |   if (n <= 0) return
24 |   ! Round up so a trailing partial block is still launched
25 |   blocks = (n + threads - 1) / threads
26 |   call saxpy_kernel<<<blocks,threads>>>(n, a, x, y)
27 | end subroutine
28 | end module saxpy_mod
29 | 


--------------------------------------------------------------------------------
/openacc_c_cublas.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <unistd.h>
 4 | 
 5 | extern void cublasSaxpy(int,float,float*,int,float*,int);
 6 | 
 7 | /*
 8 |  * Initialises x and y inside an OpenACC data region, passes their device
 9 |  * addresses to cublasSaxpy through host_data, and prints y[0] after the
10 |  * region has copied y back. Returns EXIT_FAILURE if host allocation fails.
11 |  */
12 | int main(int argc, char **argv)
13 | {
14 |   float *x, *y;
15 |   int n = 1<<20, i;
16 | 
17 |   (void)argc; (void)argv;
18 | 
19 |   x = (float*)malloc(n*sizeof(float));
20 |   y = (float*)malloc(n*sizeof(float));
21 |   if ( x == NULL || y == NULL )
22 |   {
23 |     fprintf(stderr, "host allocation failed\n");
24 |     free(x);
25 |     free(y);
26 |     return EXIT_FAILURE;
27 |   }
28 | 
29 |   #pragma acc data create(x[0:n]) copyout(y[0:n])
30 |   {
31 |     #pragma acc kernels
32 |     {
33 |       for( i = 0; i < n; i++)
34 |       {
35 |         x[i] = 1.0f;
36 |         y[i] = 0.0f;
37 |       }
38 |     }
39 | 
40 |     /* Inside host_data, x and y refer to the device copies. */
41 |     #pragma acc host_data use_device(x,y)
42 |     {
43 |       cublasSaxpy(n, 2.0f, x, 1, y, 1);
44 |     }
45 |   }
46 | 
47 |   fprintf(stdout, "y[0] = %f\n",y[0]);
48 |   free(x);
49 |   free(y);
50 |   return 0;
51 | }
52 | 


--------------------------------------------------------------------------------
/openacc_c_cublas_v2.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <unistd.h>
 4 | #include "cublas_v2.h"
 5 | 
 6 | /*
 7 |  * Fills x and y on the device with OpenACC, runs cublasSaxpy and cublasSnrm2
 8 |  * (cuBLAS v2 API) on the device copies, copies both arrays back and prints
 9 |  * y[0], x[0] and the norm. Returns EXIT_FAILURE if host allocation or
10 |  * cuBLAS initialisation fails.
11 |  */
12 | int main(int argc, char **argv)
13 | {
14 |   float *x, *y, tmp = 0.0f;  /* tmp stays 0 if cublasSnrm2 fails */
15 |   int n = 2000, i;
16 |   cublasStatus_t stat = CUBLAS_STATUS_SUCCESS;
17 |   cublasHandle_t handle;
18 | 
19 |   (void)argc; (void)argv;
20 | 
21 |   x = (float*)malloc(n*sizeof(float));
22 |   y = (float*)malloc(n*sizeof(float));
23 |   if ( x == NULL || y == NULL )
24 |   {
25 |     fprintf(stderr, "host allocation failed\n");
26 |     free(x);
27 |     free(y);
28 |     return EXIT_FAILURE;
29 |   }
30 | 
31 |   #pragma acc enter data create(x[0:n]) create(y[0:n])
32 | 
33 |   stat = cublasCreate(&handle);
34 |   if ( CUBLAS_STATUS_SUCCESS != stat )
35 |   {
36 |     /* Without a valid handle none of the cuBLAS calls below can succeed. */
37 |     printf("CUBLAS initialization failed\n");
38 |     #pragma acc exit data delete(x[0:n]) delete(y[0:n])
39 |     free(x);
40 |     free(y);
41 |     return EXIT_FAILURE;
42 |   }
43 | 
44 |   #pragma acc kernels
45 |   {
46 |     for( i = 0; i < n; i++)
47 |     {
48 |       x[i] = 1.0f;
49 |       y[i] = 0.0f;
50 |     }
51 |   }
52 | 
53 |   /* Inside host_data, x and y refer to the device copies. */
54 |   #pragma acc host_data use_device(x,y)
55 |   {
56 |     const float alpha = 2.0f;
57 |     stat = cublasSaxpy(handle, n, &alpha, x, 1, y, 1);
58 |     if (stat != CUBLAS_STATUS_SUCCESS) {
59 |       printf("cublasSaxpy failed\n");
60 |     }
61 |     /* The result is written through &tmp, which is a host address. */
62 |     stat = cublasSnrm2(handle, n, x, 1, &tmp);
63 |     if (stat != CUBLAS_STATUS_SUCCESS) {
64 |       printf("cublasSnrm2 failed\n");
65 |     }
66 |   }
67 | 
68 |   cublasDestroy(handle);
69 | 
70 |   #pragma acc exit data copyout(x[0:n]) copyout(y[0:n])
71 | 
72 |   fprintf(stdout, "y[0] = %f\n",y[0]);
73 |   fprintf(stdout, "x[0] = %f\n",x[0]);
74 |   fprintf(stdout, "norm2(x) = %f\n",tmp);
75 |   free(x);
76 |   free(y);
77 |   return 0;
78 | }
79 | 


--------------------------------------------------------------------------------
/openacc_c_main.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <unistd.h>
 4 | 
 5 | extern void saxpy(int,float,float*,float*);
 6 | 
 7 | /*
 8 |  * Initialises x and y inside an OpenACC data region, passes their device
 9 |  * addresses to the external saxpy() through host_data, and prints y[0] after
10 |  * the region has copied y back. Returns EXIT_FAILURE if host allocation fails.
11 |  */
12 | int main(int argc, char **argv)
13 | {
14 |   float *x, *y;
15 |   int n = 1<<20, i;
16 | 
17 |   (void)argc; (void)argv;
18 | 
19 |   x = (float*)malloc(n*sizeof(float));
20 |   y = (float*)malloc(n*sizeof(float));
21 |   if ( x == NULL || y == NULL )
22 |   {
23 |     fprintf(stderr, "host allocation failed\n");
24 |     free(x);
25 |     free(y);
26 |     return EXIT_FAILURE;
27 |   }
28 | 
29 |   #pragma acc data create(x[0:n]) copyout(y[0:n])
30 |   {
31 |     #pragma acc kernels
32 |     {
33 |       for( i = 0; i < n; i++)
34 |       {
35 |         x[i] = 1.0f;
36 |         y[i] = 0.0f;
37 |       }
38 |     }
39 | 
40 |     /* Inside host_data, x and y refer to the device copies. */
41 |     #pragma acc host_data use_device(x,y)
42 |     {
43 |       saxpy(n, 2.0f, x, y);
44 |     }
45 |   }
46 | 
47 |   fprintf(stdout, "y[0] = %f\n",y[0]);
48 |   free(x);
49 |   free(y);
50 |   return 0;
51 | }
52 | 


--------------------------------------------------------------------------------
/openacc_cublas.f90:
--------------------------------------------------------------------------------
 1 | ! SAXPY and a 2-norm computed by cuBLAS on arrays held in an OpenACC data region.
 2 | program main
 3 |   use cublas
 4 |   implicit none
 5 |   integer, parameter :: N = 2**20
 6 |   type(cublasHandle) :: h
 7 |   integer :: istat
 8 |   real :: nrm2
 9 |   real, dimension(N) :: X, Y
10 | 
11 |   ! The handle is only needed by the handle-based cublasSnrm2_v2 call below
12 |   istat = cublasCreate(h)
13 |   if (istat /= CUBLAS_STATUS_SUCCESS) print *,istat
14 | 
15 |   !$acc data create(x,y)
16 | 
17 |   ! Initialise both arrays in the device copies
18 |   !$acc kernels
19 |   X(:) = 1.0
20 |   Y(:) = 0.0
21 |   !$acc end kernels
22 | 
23 |   ! Inside host_data, x and y refer to the device copies
24 |   !$acc host_data use_device(x,y)
25 |   call cublassaxpy(N, 2.0, x, 1, y, 1)
26 |   istat = cublasSnrm2_v2(h, N, x, 1, nrm2)
27 |   if (istat /= CUBLAS_STATUS_SUCCESS) print *,istat
28 |   !$acc end host_data
29 | 
30 |   ! y was only created on the device, so fetch the result explicitly
31 |   !$acc update self(y)
32 |   !$acc end data
33 | 
34 |   istat = cublasDestroy(h)
35 |   print *, y(1)
36 |   print*,nrm2
37 | end program
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/openacc_cuda_device.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <unistd.h>
 4 | 
 5 | #pragma acc routine seq
 6 | extern "C" float saxpy_dev(float, float, float);
 7 | 
 8 | /*
 9 |  * Initialises x and y inside an OpenACC data region, then updates every y[i]
10 |  * by calling the external device routine saxpy_dev from a parallel loop, and
11 |  * prints y[0] after the region has copied y back.
12 |  * Returns EXIT_FAILURE if host allocation fails.
13 |  */
14 | int main(int argc, char **argv)
15 | {
16 |   float *x, *y;
17 |   int n = 1<<20, i;
18 | 
19 |   (void)argc; (void)argv;
20 | 
21 |   x = (float*)malloc(n*sizeof(float));
22 |   y = (float*)malloc(n*sizeof(float));
23 |   if ( x == NULL || y == NULL )
24 |   {
25 |     fprintf(stderr, "host allocation failed\n");
26 |     free(x);
27 |     free(y);
28 |     return EXIT_FAILURE;
29 |   }
30 | 
31 |   #pragma acc data create(x[0:n]) copyout(y[0:n])
32 |   {
33 |     #pragma acc kernels
34 |     {
35 |       for( i = 0; i < n; i++)
36 |       {
37 |         x[i] = 1.0f;
38 |         y[i] = 0.0f;
39 |       }
40 |     }
41 | 
42 |     /* saxpy_dev is declared above with "acc routine seq". */
43 |     #pragma acc parallel loop
44 |     for( i = 0; i < n; i++ )
45 |     {
46 |       y[i] = saxpy_dev(2.0f, x[i], y[i]);
47 |     }
48 |   }
49 | 
50 |   fprintf(stdout, "y[0] = %f\n",y[0]);
51 |   free(x);
52 |   free(y);
53 |   return 0;
54 | }
55 | 


--------------------------------------------------------------------------------
/openacc_main.f90:
--------------------------------------------------------------------------------
 1 | ! Calls the CUDA Fortran saxpy from saxpy_mod on data managed by OpenACC.
 2 | program main
 3 |   use saxpy_mod
 4 |   integer, parameter :: N = 2**20
 5 |   real, dimension(N) :: X, Y
 6 | 
 7 |   X(:) = 1.0
 8 |   Y(:) = 0.0
 9 | 
10 |   ! Device copies of x and y; y is copied back when the region ends
11 |   !$acc data copy(y) copyin(x)
12 |   ! saxpy takes device arrays, so pass the device copies rather than the host arrays
13 |   !$acc host_data use_device(x,y)
14 |   call saxpy(N, 2.0, x, y)
15 |   !$acc end host_data
16 |   !$acc end data
17 | 
18 |   print *, y(1)
19 | end program
20 | 


--------------------------------------------------------------------------------
/openacc_streams.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <unistd.h>
 4 | #include <cuda_runtime.h>
 5 | #include <openacc.h>
 6 | 
 7 | extern void saxpy(int,float,float*,float*,cudaStream_t);
 8 | 
 9 | /*
10 |  * Runs an OpenACC initialisation kernel and a CUDA saxpy on the same queue:
11 |  * the CUDA stream behind OpenACC async queue 1 is passed to saxpy(), the
12 |  * result is fetched with an async update on queue 1, and the program waits
13 |  * on that queue before printing y[0].
14 |  * Returns EXIT_FAILURE if host allocation fails.
15 |  */
16 | int main(int argc, char **argv)
17 | {
18 |   float *x, *y;
19 |   int n = 1<<20, i;
20 |   cudaStream_t stream;
21 | 
22 |   (void)argc; (void)argv;
23 | 
24 |   x = (float*)malloc(n*sizeof(float));
25 |   y = (float*)malloc(n*sizeof(float));
26 |   if ( x == NULL || y == NULL )
27 |   {
28 |     fprintf(stderr, "host allocation failed\n");
29 |     free(x);
30 |     free(y);
31 |     return EXIT_FAILURE;
32 |   }
33 | 
34 |   /* Needs the prototype from <openacc.h>: an implicitly declared function
35 |      would be assumed to return int and the stream pointer would be truncated. */
36 |   stream = (cudaStream_t) acc_get_cuda_stream(1);
37 | 
38 |   #pragma acc data create(x[0:n],y[0:n])
39 |   {
40 |     #pragma acc kernels async(1)
41 |     {
42 |       for( i = 0; i < n; i++)
43 |       {
44 |         x[i] = 1.0f;
45 |         y[i] = 0.0f;
46 |       }
47 |     }
48 | 
49 |     /* Inside host_data, x and y refer to the device copies. */
50 |     #pragma acc host_data use_device(x,y)
51 |     {
52 |       saxpy(n, 2.0f, x, y, stream);
53 |     }
54 | 
55 |     #pragma acc update self(y[0:n]) async(1)
56 |     #pragma acc wait(1)
57 |   }
58 | 
59 |   fprintf(stdout, "y[0] = %f\n",y[0]);
60 |   free(x);
61 |   free(y);
62 |   return 0;
63 | }
64 | 


--------------------------------------------------------------------------------
/saxpy_cuda.cu:
--------------------------------------------------------------------------------
 1 | // One thread per element: y[idx] += a * x[idx] for every idx < n.
 2 | __global__ void saxpy_kernel(int n, float a, float *x, float *y)
 3 | {
 4 |   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
 5 | 
 6 |   if ( idx >= n ) return;  // surplus thread past the end of the arrays
 7 |   y[idx] = y[idx] + a * x[idx];
 8 | }
 9 | // Host launcher with C linkage: runs saxpy_kernel over n elements using
10 | // 128-thread blocks. The block count is rounded up so a trailing partial
11 | // block is still covered; the kernel's bounds check discards surplus threads.
12 | extern "C" void saxpy(int n ,float a, float *x, float *y)
13 | {
14 |   if ( n <= 0 ) return;  // nothing to do; a zero-sized grid is not a valid launch
15 | 
16 |   const dim3 blockdim(128,1,1);
17 |   const dim3 griddim((n + blockdim.x - 1)/blockdim.x,1,1);
18 | 
19 |   saxpy_kernel<<<griddim,blockdim>>>(n,a,x,y);
20 | }
21 | 


--------------------------------------------------------------------------------
/saxpy_cuda_async.cu:
--------------------------------------------------------------------------------
 1 | #include <cuda_runtime.h>
 2 | 
 3 | // One thread per element: y[idx] += a * x[idx] for every idx < n.
 4 | __global__ void saxpy_kernel(int n, float a, float *x, float *y)
 5 | {
 6 |   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
 7 | 
 8 |   if ( idx >= n ) return;  // surplus thread past the end of the arrays
 9 |   y[idx] = y[idx] + a * x[idx];
10 | }
11 | // Host launcher with C linkage: enqueues saxpy_kernel on `stream` over n
12 | // elements using 128-thread blocks. The block count is rounded up so a
13 | // trailing partial block is still covered; the kernel's bounds check
14 | // discards surplus threads. The launch is not synchronised here.
15 | extern "C" void saxpy(int n ,float a, float *x, float *y, cudaStream_t stream)
16 | {
17 |   if ( n <= 0 ) return;  // nothing to do; a zero-sized grid is not a valid launch
18 | 
19 |   const dim3 blockdim(128,1,1);
20 |   const dim3 griddim((n + blockdim.x - 1)/blockdim.x,1,1);
21 | 
22 |   saxpy_kernel<<<griddim,blockdim,0,stream>>>(n,a,x,y);
23 | }
24 | 


--------------------------------------------------------------------------------
/saxpy_cuda_device.cu:
--------------------------------------------------------------------------------
1 | // Returns a*x + y for a single element; device-only function with C linkage.
2 | extern "C" __device__
3 | float saxpy_dev(float a, float x, float y)
4 | {
5 |   return y + a * x;
6 | }
7 | 


--------------------------------------------------------------------------------
/saxpy_openacc_c.c:
--------------------------------------------------------------------------------
 1 | /* In-place SAXPY: y[k] = y[k] + a*x[k] for k in [0,n).
 2 |    x and y are listed in deviceptr, i.e. they are used as device addresses. */
 3 | void saxpy(int n, float a, float * restrict x, float * restrict y)
 4 | {
 5 |   #pragma acc kernels deviceptr(x,y)
 6 |   {
 7 |     for(int k=0; k<n; ++k)
 8 |       y[k] = y[k] + a*x[k];
 9 |   }
10 | }
11 | /* Stores val into arr[0..n-1].
12 |    arr is listed in deviceptr, i.e. it is used as a device address. */
13 | void set(int n, float val, float * restrict arr)
14 | {
15 | #pragma acc kernels deviceptr(arr)
16 |   {
17 |     for(int k=0; k<n; ++k)
18 |       arr[k] = val;
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/saxpy_openacc_c_mapped.c:
--------------------------------------------------------------------------------
 1 | #include <openacc.h>
 2 | 
 3 | /* Forwards to acc_map_data: ties host buffer harr to device buffer darr (size as given). */
 4 | void map(float * restrict harr, float * restrict darr, int size) {
 5 |   acc_map_data((void*)harr, (void*)darr, size);
 6 | }
 7 | /* In-place SAXPY: y[k] = y[k] + a*x[k] for k in [0,n).
 8 |    x and y are listed in present, so they must already have device copies. */
 9 | void saxpy(int n, float a, float * restrict x, float * restrict y)
10 | {
11 |   #pragma acc kernels present(x,y)
12 |   {
13 |     for(int k=0; k<n; ++k)
14 |       y[k] = y[k] + a*x[k];
15 |   }
16 | }
17 | /* Stores val into arr[0..n-1].
18 |    arr is listed in present, so it must already have a device copy. */
19 | void set(int n, float val, float * restrict arr)
20 | {
21 | #pragma acc kernels present(arr)
22 |   {
23 |     for(int k=0; k<n; ++k)
24 |       arr[k] = val;
25 |   }
26 | }
27 | 


--------------------------------------------------------------------------------
/saxpy_openacc_f.f90:
--------------------------------------------------------------------------------
 1 | module saxpy_mod
 2 |   contains
 3 |   ! y = y + a*x on arrays listed in deviceptr, i.e. used as device addresses.
 4 |   ! n is part of the interface but is not read: the array syntax covers the
 5 |   ! whole extent of x and y.
 6 |   subroutine saxpy(n, a, x, y)
 7 |     integer :: n
 8 |     real    :: a, x(:), y(:)
 9 |     ! A kernels region is used because the body is an array assignment, not a
10 |     ! DO loop that a loop directive could attach to.
11 |     !$acc kernels deviceptr(x,y)
12 |     y(:) = y(:) + a * x(:)
13 |     !$acc end kernels
14 |   end subroutine
15 | end module
16 | 


--------------------------------------------------------------------------------
/thrust.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <thrust/host_vector.h>
 4 | #include <thrust/device_vector.h>
 5 | #include <thrust/device_ptr.h>
 6 | 
 7 | extern "C" void saxpy(int,float,float*,float*);
 8 | 
 9 | // Builds x and y as Thrust device vectors, hands their raw device pointers to
10 | // the external saxpy(), then copies y to the host and prints its first element.
11 | int main(int argc, char **argv)
12 | {
13 |   const int N = 1<<20;
14 | 
15 |   // Construct the vectors already holding their initial values instead of
16 |   // value-initialising them and filling them in a second pass.
17 |   thrust::device_vector<float> d_x(N, 1.0f);
18 |   thrust::device_vector<float> d_y(N, 0.0f);
19 | 
20 |   // raw_pointer_cast turns the Thrust device pointers into plain float*.
21 |   saxpy(N,2.0f,thrust::raw_pointer_cast(d_x.data()),thrust::raw_pointer_cast(d_y.data()));
22 | 
23 |   // Copy-construct the host vector from the device result.
24 |   thrust::host_vector<float> y = d_y;
25 |   printf("%f\n",y[0]);
26 |   return 0;
27 | }
28 | 


--------------------------------------------------------------------------------