├── .gitignore ├── LICENSE.txt ├── Makefile ├── README.md ├── acc_malloc.c ├── cuda_main.cu ├── cuda_map.cu ├── cuf_main.cuf ├── kernels.cuf ├── openacc_c_cublas.c ├── openacc_c_cublas_v2.c ├── openacc_c_main.c ├── openacc_cublas.f90 ├── openacc_cuda_device.cpp ├── openacc_main.f90 ├── openacc_streams.c ├── saxpy_cuda.cu ├── saxpy_cuda_async.cu ├── saxpy_cuda_device.cu ├── saxpy_openacc_c.c ├── saxpy_openacc_c_mapped.c ├── saxpy_openacc_f.f90 └── thrust.cu /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | core 3 | .*.swp 4 | a.out 5 | *.mod 6 | *.lst 7 | *.ptx 8 | *.cub 9 | cuda_profile_*.log 10 | cuf_main 11 | thrust 12 | cuf_openacc_main 13 | cuda_main 14 | openacc_cublas 15 | openacc_c_cublas 16 | openacc_c_main 17 | cuda_map 18 | openacc_streams 19 | openacc_cuda_device 20 | acc_malloc 21 | openacc_c_cublas_v2 22 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2013, Jeff Larkin 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | EXES=cuda_main openacc_c_main openacc_c_cublas openacc_c_cublas_v2 thrust cuda_map acc_malloc openacc_streams openacc_cuda_device 2 | 3 | ifeq "$(PE_ENV)" "CRAY" 4 | # Cray Compiler 5 | CXX=CC 6 | CXXFLAGS=-hlist=a 7 | CC=cc 8 | CFLAGS=-hlist=a 9 | CUDAC=nvcc 10 | CUDAFLAGS=-gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 11 | FC=ftn 12 | FFLAGS=-ra 13 | LDFLAGS=-L$(CUDA_HOME)/lib64 -lcudart 14 | else 15 | # PGI Compiler 16 | EXES+=cuf_main cuf_openacc_main openacc_cublas 17 | CXX=nvc++ 18 | CXXFLAGS=-fast -acc -Minfo=accel -gpu=cc60,cc70,cc75,cc80 19 | CC=nvc 20 | CFLAGS=-fast -acc -Minfo=accel -gpu=cc60,cc70,cc75,cc80 21 | CUDAC=nvcc 22 | # Hard-coded architectures to avoid build issue when arches are added or 23 | # removed from compilers 24 | CUDAFLAGS=-gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 
25 | FC=nvfortran 26 | FFLAGS=-fast -acc -Minfo=accel -gpu=cc60,cc70,cc75,cc80 27 | LDFLAGS=-Mcuda 28 | endif 29 | 30 | all: $(EXES) 31 | 32 | openacc_cublas: openacc_cublas.o # linked with FC, so pass FFLAGS 33 | $(FC) -o $@ $(FFLAGS) $^ $(LDFLAGS) -Mcudalib=cublas 34 | 35 | openacc_c_cublas: openacc_c_cublas.o 36 | $(CC) -o $@ $(CFLAGS) $^ $(LDFLAGS) -Mcudalib=cublas 37 | 38 | openacc_c_cublas_v2: openacc_c_cublas_v2.o 39 | $(CC) -o $@ $(CFLAGS) $^ $(LDFLAGS) -Mcudalib=cublas 40 | 41 | openacc_c_main: saxpy_cuda.o openacc_c_main.o # linked with CXX, so pass CXXFLAGS 42 | $(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS) 43 | 44 | cuda_main: saxpy_openacc_c.o cuda_main.o # linked with CXX, so pass CXXFLAGS 45 | $(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS) 46 | 47 | cuf_main: cuf_main.o 48 | $(FC) -o $@ $(FFLAGS) $^ $(LDFLAGS) -Mcuda 49 | 50 | cuf_openacc_main: kernels.o openacc_main.o 51 | $(FC) -o $@ $(FFLAGS) $^ $(LDFLAGS) -Mcuda 52 | 53 | thrust: saxpy_openacc_c.o thrust.o 54 | $(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS) -lstdc++ 55 | 56 | cuda_map: saxpy_openacc_c_mapped.o cuda_map.o 57 | $(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS) 58 | 59 | acc_malloc: saxpy_openacc_c.o acc_malloc.o # linked with CC, so pass CFLAGS 60 | $(CC) -o $@ $(CFLAGS) $^ $(LDFLAGS) 61 | 62 | openacc_cuda_device: saxpy_cuda_device.o openacc_cuda_device.o 63 | $(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS) 64 | 65 | saxpy_cuda_device.o: saxpy_cuda_device.cu 66 | $(CUDAC) $(CUDAFLAGS) -rdc true -c $< 67 | 68 | openacc_streams: saxpy_cuda_async.o openacc_streams.o 69 | $(CXX) -o $@ $(CXXFLAGS) $^ $(LDFLAGS) 70 | 71 | .SUFFIXES: 72 | .SUFFIXES: .c .o .f90 .cu .cpp .cuf 73 | .c.o: 74 | $(CC) $(CFLAGS) -c $< 75 | .cpp.o: 76 | $(CXX) $(CXXFLAGS) -c $< 77 | .f90.o: 78 | $(FC) $(FFLAGS) -c $< 79 | .cuf.o: 80 | $(FC) $(FFLAGS) -c $< 81 | .cu.o: 82 | $(CUDAC) $(CUDAFLAGS) -c $< 83 | .PHONY: clean 84 | clean: 85 | rm -rf *.o *.ptx *.cub *.lst *.mod $(EXES) 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Stupid OpenACC (Interoperability) 
Tricks 2 | ======================================== 3 | Author: Jeff Larkin 4 | 5 | This repository demonstrates interoperability between OpenACC and various other 6 | GPU programming models. An OpenACC-enabled compiler is required. The default 7 | makefile has been written for PGI and tested with PGI 20.9, although most if 8 | not all examples will work with earlier versions. 9 | 10 | If building with the Cray Compiler Environment, the makefile will detect this 11 | and adjust compiler flags and targets accordingly. Some targets rely on PGI 12 | CUDA Fortran features; these targets will be disabled when building with CCE. 13 | 14 | Build Instructions: 15 | ------------------- 16 | $ make 17 | 18 | Examples 19 | -------- 20 | * cuda\_main - Calling OpenACC from CUDA C 21 | * openacc\_c\_main - Calling CUDA from OpenACC in C 22 | * openacc\_c\_cublas - Calling CUBLAS from OpenACC in C 23 | * thrust - Mixing OpenACC and Thrust in C++ 24 | * cuda\_map - Using OpenACC acc\_map\_data with CUDA in C 25 | * cuf\_main - Calling OpenACC from CUDA Fortran 26 | * cuf\_openacc\_main - Calling CUDA Fortran from OpenACC 27 | * openacc\_cublas - Calling CUBLAS from OpenACC in CUDA Fortran 28 | * acc\_malloc - Same as cuda\_main, but using the OpenACC API 29 | * openacc\_streams - Mixes OpenACC async queues and CUDA streams 30 | * openacc\_cuda\_device - Calls a CUDA \_\_device\_\_ function within an OpenACC 31 | region 32 | -------------------------------------------------------------------------------- /acc_malloc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | void saxpy(int,float,float*,float*); 7 | void set(int,float,float*); 8 | 9 | int main(int argc, char **argv) 10 | { 11 | float *x, *y, tmp; 12 | int n = 1<<20; 13 | 14 | x = acc_malloc((size_t)n*sizeof(float)); 15 | y = acc_malloc((size_t)n*sizeof(float)); 16 | 17 | set(n,1.0f,x); 18 | set(n,0.0f,y); 19 | 20 | saxpy(n, 2.0, x, y); 
21 | acc_memcpy_from_device(&tmp,y,(size_t)sizeof(float)); 22 | acc_free(x); 23 | acc_free(y); 24 | printf("%f\n",tmp); 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /cuda_main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | extern "C" void saxpy(int,float,float*,float*); 7 | extern "C" void set(int,float,float*); 8 | 9 | int main(int argc, char **argv) 10 | { 11 | float *x, *y, tmp; 12 | int n = 1<<20; 13 | 14 | cudaMalloc((void**)&x,(size_t)n*sizeof(float)); 15 | cudaMalloc((void**)&y,(size_t)n*sizeof(float)); 16 | 17 | set(n,1.0f,x); 18 | set(n,0.0f,y); 19 | 20 | saxpy(n, 2.0, x, y); 21 | cudaMemcpy(&tmp,y,(size_t)sizeof(float),cudaMemcpyDeviceToHost); 22 | printf("%f\n",tmp); 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /cuda_map.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | extern "C" void saxpy(int,float,float*,float*); 7 | extern "C" void set(int,float,float*); 8 | extern "C" void map(float*, float*, int); 9 | 10 | int main(int argc, char **argv) 11 | { 12 | float *x, *y, *dx, *dy, tmp; 13 | int n = 1<<20; 14 | 15 | x = (float*) malloc(n*sizeof(float)); 16 | y = (float*) malloc(n*sizeof(float)); 17 | cudaMalloc((void**)&dx,(size_t)n*sizeof(float)); 18 | cudaMalloc((void**)&dy,(size_t)n*sizeof(float)); 19 | 20 | map(x, dx, n*sizeof(float)); 21 | map(y, dy, n*sizeof(float)); 22 | 23 | set(n,1.0f,x); 24 | set(n,0.0f,y); 25 | 26 | saxpy(n, 2.0, x, y); 27 | cudaMemcpy(&tmp,dy,(size_t)sizeof(float),cudaMemcpyDeviceToHost); 28 | printf("%f\n",tmp); 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /cuf_main.cuf: -------------------------------------------------------------------------------- 1 
| program main 2 | integer, parameter :: N = 2**20 3 | ! Allocate X and Y only on the device 4 | real, device, dimension(N) :: X, Y 5 | integer :: i 6 | real :: tmp 7 | 8 | ! CUDA Fortran will automatically convert these to run on the device 9 | X(:) = 1.0 10 | Y(:) = 0.0 11 | 12 | !$acc kernels deviceptr(x,y) 13 | y(:) = y(:) + 2.0*x(:) 14 | !$acc end kernels 15 | 16 | ! Copy the first element back from Y for correctness checking 17 | tmp = y(1) 18 | print *, tmp 19 | end program 20 | -------------------------------------------------------------------------------- /kernels.cuf: -------------------------------------------------------------------------------- 1 | module saxpy_mod 2 | contains 3 | attributes(global) & 4 | subroutine saxpy_kernel(n, a, x, y) 5 | real :: x(:), y(:), a 6 | integer :: n,i 7 | attributes(value) :: a,n 8 | i = threadIdx%x+(blockIdx%x-1)*blockDim%x 9 | if (i<=n) y(i) = y(i) + a*x(i) 10 | end subroutine 11 | subroutine saxpy (n, a, x, y) ! host wrapper: launches saxpy_kernel to compute y = y + a*x over the first n elements 12 | use cudafor 13 | real, device :: x(:), y(:) 14 | real :: a 15 | integer :: n 16 | call saxpy_kernel<<<(n+255)/256,256>>>(n, a, x, y) ! one thread per element: enough 256-thread blocks to cover n (rounded up); the kernel's i<=n test masks the surplus threads 17 | end subroutine 18 | end module saxpy_mod 19 | -------------------------------------------------------------------------------- /openacc_c_cublas.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | extern void cublasSaxpy(int,float,float*,int,float*,int); 6 | 7 | int main(int argc, char **argv) 8 | { 9 | float *x, *y, tmp; 10 | int n = 1<<20, i; 11 | 12 | x = (float*)malloc(n*sizeof(float)); 13 | y = (float*)malloc(n*sizeof(float)); 14 | 15 | #pragma acc data create(x[0:n]) copyout(y[0:n]) 16 | { 17 | #pragma acc kernels 18 | { 19 | for( i = 0; i < n; i++) 20 | { 21 | x[i] = 1.0f; 22 | y[i] = 0.0f; 23 | } 24 | } 25 | 26 | #pragma acc host_data use_device(x,y) 27 | { 28 | cublasSaxpy(n, 2.0, x, 1, y, 1); 29 | } 30 | } 31 | 32 | fprintf(stdout, "y[0] = %f\n",y[0]); 33 | return 0; 34 | } 35 | 
-------------------------------------------------------------------------------- /openacc_c_cublas_v2.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "cublas_v2.h" 5 | 6 | int main(int argc, char **argv) 7 | { 8 | float *x, *y, tmp; 9 | int n = 2000, i; 10 | cublasStatus_t stat = CUBLAS_STATUS_SUCCESS; 11 | 12 | x = (float*)malloc(n*sizeof(float)); 13 | y = (float*)malloc(n*sizeof(float)); 14 | 15 | #pragma acc enter data create(x[0:n]) create(y[0:n]) 16 | 17 | cublasHandle_t handle; 18 | stat = cublasCreate(&handle); 19 | if ( CUBLAS_STATUS_SUCCESS != stat ) { 20 | printf("CUBLAS initialization failed\n"); 21 | } 22 | 23 | #pragma acc kernels 24 | { 25 | for( i = 0; i < n; i++) 26 | { 27 | x[i] = 1.0f; 28 | y[i] = 0.0f; 29 | } 30 | } 31 | 32 | #pragma acc host_data use_device(x,y) 33 | { 34 | const float alpha = 2.0f; 35 | stat = cublasSaxpy(handle, n, &alpha, x, 1, y, 1); 36 | if (stat != CUBLAS_STATUS_SUCCESS) { 37 | printf("cublasSaxpy failed\n"); 38 | } 39 | stat = cublasSnrm2(handle, n, x, 1, &tmp); 40 | if (stat != CUBLAS_STATUS_SUCCESS) { 41 | printf("cublasSnrm2 failed\n"); 42 | } 43 | } 44 | 45 | cublasDestroy(handle); 46 | 47 | #pragma acc exit data copyout(x[0:n]) copyout(y[0:n]) 48 | 49 | fprintf(stdout, "y[0] = %f\n",y[0]); 50 | fprintf(stdout, "x[0] = %f\n",x[0]); 51 | fprintf(stdout, "norm2(x) = %f\n",tmp); 52 | return 0; 53 | } 54 | 55 | -------------------------------------------------------------------------------- /openacc_c_main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | extern void saxpy(int,float,float*,float*); 6 | 7 | int main(int argc, char **argv) 8 | { 9 | float *x, *y, tmp; 10 | int n = 1<<20, i; 11 | 12 | x = (float*)malloc(n*sizeof(float)); 13 | y = (float*)malloc(n*sizeof(float)); 14 | 15 | #pragma acc data create(x[0:n]) copyout(y[0:n]) 16 | { 17 | #pragma acc 
kernels 18 | { 19 | for( i = 0; i < n; i++) 20 | { 21 | x[i] = 1.0f; 22 | y[i] = 0.0f; 23 | } 24 | } 25 | 26 | #pragma acc host_data use_device(x,y) 27 | { 28 | saxpy(n, 2.0, x, y); 29 | } 30 | } 31 | 32 | fprintf(stdout, "y[0] = %f\n",y[0]); 33 | return 0; 34 | } 35 | -------------------------------------------------------------------------------- /openacc_cublas.f90: -------------------------------------------------------------------------------- 1 | program main 2 | use cublas 3 | implicit none 4 | integer, parameter :: N = 2**20 5 | real, dimension(N) :: X, Y 6 | real:: nrm2 7 | type(cublasHandle) :: h 8 | integer :: istat 9 | 10 | istat = cublasCreate(h) 11 | if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat 12 | 13 | !$acc data create(x,y) 14 | !$acc kernels 15 | X(:) = 1.0 16 | Y(:) = 0.0 17 | !$acc end kernels 18 | 19 | !$acc host_data use_device(x,y) 20 | call cublassaxpy(N, 2.0, x, 1, y, 1) 21 | istat = cublasSnrm2_v2(h, N, x, 1, nrm2) 22 | if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat 23 | !$acc end host_data 24 | 25 | !$acc update self(y) 26 | !$acc end data 27 | 28 | istat = cublasDestroy(h) 29 | print *, y(1) 30 | print*,nrm2 31 | end program 32 | 33 | 34 | -------------------------------------------------------------------------------- /openacc_cuda_device.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #pragma acc routine seq 6 | extern "C" float saxpy_dev(float, float, float); 7 | 8 | int main(int argc, char **argv) 9 | { 10 | float *x, *y, tmp; 11 | int n = 1<<20, i; 12 | 13 | x = (float*)malloc(n*sizeof(float)); 14 | y = (float*)malloc(n*sizeof(float)); 15 | 16 | #pragma acc data create(x[0:n]) copyout(y[0:n]) 17 | { 18 | #pragma acc kernels 19 | { 20 | for( i = 0; i < n; i++) 21 | { 22 | x[i] = 1.0f; 23 | y[i] = 0.0f; 24 | } 25 | } 26 | 27 | #pragma acc parallel loop 28 | for( i = 0; i < n; i++ ) 29 | { 30 | y[i] = saxpy_dev(2.0, x[i], y[i]); 31 | } 32 | 
} 33 | 34 | fprintf(stdout, "y[0] = %f\n",y[0]); 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /openacc_main.f90: -------------------------------------------------------------------------------- 1 | program main 2 | use saxpy_mod 3 | integer, parameter :: N = 2**20 4 | real, dimension(N) :: X, Y 5 | 6 | X(:) = 1.0 7 | Y(:) = 0.0 8 | 9 | !$acc data copy(y) copyin(x) 10 | call saxpy(N, 2.0, x, y) 11 | !$acc end data 12 | 13 | print *, y(1) 14 | end program 15 | -------------------------------------------------------------------------------- /openacc_streams.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | extern void saxpy(int,float,float*,float*,cudaStream_t); 7 | 8 | int main(int argc, char **argv) 9 | { 10 | float *x, *y, tmp; 11 | int n = 1<<20, i; 12 | cudaStream_t stream; 13 | 14 | x = (float*)malloc(n*sizeof(float)); 15 | y = (float*)malloc(n*sizeof(float)); 16 | 17 | stream = (cudaStream_t) acc_get_cuda_stream(1); 18 | 19 | #pragma acc data create(x[0:n],y[0:n]) 20 | { 21 | #pragma acc kernels async(1) 22 | { 23 | for( i = 0; i < n; i++) 24 | { 25 | x[i] = 1.0f; 26 | y[i] = 0.0f; 27 | } 28 | } 29 | 30 | #pragma acc host_data use_device(x,y) 31 | { 32 | saxpy(n, 2.0, x, y, stream); 33 | } 34 | 35 | #pragma acc update self(y[0:n]) async(1) 36 | #pragma acc wait(1) 37 | } 38 | 39 | fprintf(stdout, "y[0] = %f\n",y[0]); 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /saxpy_cuda.cu: -------------------------------------------------------------------------------- 1 | __global__ 2 | void saxpy_kernel(int n, float a, float *x, float *y) 3 | { 4 | int i = blockDim.x * blockIdx.x + threadIdx.x; 5 | 6 | if ( i < n ) 7 | y[i] += a * x[i]; 8 | } 9 | extern "C" void saxpy(int n ,float a, float *x, float *y) 10 | { 11 | dim3 griddim, blockdim; 12 | 13 | 
blockdim = dim3(128,1,1); 14 | griddim = dim3((n + blockdim.x - 1)/blockdim.x,1,1); /* round up so a partial final block still covers the tail; the kernel's i < n guard masks the surplus threads */ 15 | 16 | saxpy_kernel<<<griddim,blockdim>>>(n,a,x,y); 17 | } 18 | -------------------------------------------------------------------------------- /saxpy_cuda_async.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ 4 | void saxpy_kernel(int n, float a, float *x, float *y) 5 | { 6 | int i = blockDim.x * blockIdx.x + threadIdx.x; 7 | 8 | if ( i < n ) 9 | y[i] += a * x[i]; 10 | } 11 | extern "C" void saxpy(int n ,float a, float *x, float *y, cudaStream_t stream) /* launches y += a*x over n elements on the caller-supplied stream; returns without synchronizing */ 12 | { 13 | dim3 griddim, blockdim; 14 | 15 | blockdim = dim3(128,1,1); 16 | griddim = dim3((n + blockdim.x - 1)/blockdim.x,1,1); /* round up so a partial final block still covers the tail */ 17 | 18 | saxpy_kernel<<<griddim,blockdim,0,stream>>>(n,a,x,y); /* no dynamic shared memory; enqueue on the caller's stream so the launch is ordered with other work queued there */ 19 | } 20 | -------------------------------------------------------------------------------- /saxpy_cuda_device.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __device__ 3 | float saxpy_dev(float a, float x, float y) 4 | { 5 | return a * x + y; 6 | } 7 | -------------------------------------------------------------------------------- /saxpy_openacc_c.c: -------------------------------------------------------------------------------- 1 | void saxpy(int n, float a, float * restrict x, float * restrict y) 2 | { 3 | #pragma acc kernels deviceptr(x,y) 4 | { 5 | for(int i=0; i 2 | 3 | void map(float * restrict harr, float * restrict darr, int size) 4 | { 5 | acc_map_data(harr, darr, size); 6 | } 7 | void saxpy(int n, float a, float * restrict x, float * restrict y) 8 | { 9 | #pragma acc kernels present(x,y) 10 | { 11 | for(int i=0; i 2 | #include 3 | #include 4 | #include 5 | 6 | extern "C" void saxpy(int,float,float*,float*); 7 | 8 | int main(int argc, char **argv) 9 | { 10 | int N = 1<<20; 11 | thrust::host_vector<float> y(N); 12 | 13 | thrust::device_vector<float> d_x(N); 14 | thrust::device_vector<float> d_y(N); 15 | 16 | thrust::fill(d_x.begin(),d_x.end(), 1.0f); 17 | thrust::fill(d_y.begin(),d_y.end(), 0.0f); 18 | 19 
| saxpy(N,2.0,thrust::raw_pointer_cast(d_x.data()),thrust::raw_pointer_cast(d_y.data())); 20 | 21 | y = d_y; 22 | printf("%f\n",y[0]); 23 | return 0; 24 | } 25 | --------------------------------------------------------------------------------