├── .gitignore ├── .gitmodules ├── HIP-Examples-Applications ├── BinomialOption │ ├── BinomialOption.cpp │ └── Makefile ├── BitonicSort │ ├── BitonicSort.cpp │ └── Makefile ├── FastWalshTransform │ ├── FastWalshTransform.cpp │ └── Makefile ├── FloydWarshall │ ├── FloydWarshall.cpp │ └── Makefile ├── HelloWorld │ ├── HelloWorld.cpp │ └── Makefile ├── Histogram │ ├── Histogram.cpp │ ├── Histogram.hpp │ └── Makefile ├── MatrixMultiplication │ ├── Makefile │ └── MatrixMultiplication.cpp ├── PrefixSum │ ├── Makefile │ └── PrefixSum.cpp ├── RecursiveGaussian │ ├── Makefile │ ├── RecursiveGaussian.cpp │ ├── RecursiveGaussian.hpp │ ├── RecursiveGaussian_Input.bmp │ └── RecursiveGaussian_Output.bmp ├── SimpleConvolution │ ├── FilterCoeff.h │ ├── Makefile │ ├── SimpleConvolution.cpp │ └── SimpleConvolution.hpp ├── dct │ ├── Makefile │ └── dct.cpp ├── dwtHaar1D │ ├── Makefile │ └── dwtHaar1D.cpp └── include │ ├── HIPUtil.hpp │ ├── SDKBitMap.hpp │ ├── SDKFile.hpp │ ├── SDKThread.hpp │ └── SDKUtil.hpp ├── README.md ├── add4 ├── LICENSE ├── Makefile ├── README.md ├── buildit.sh ├── common.cpp ├── common.h ├── hip-stream.cpp ├── run_sweep.pl └── runhip.sh ├── common ├── hip.all.make └── hip.prologue.make ├── cuda-stream ├── Makefile ├── Makefile.titan ├── README.md └── stream.cpp ├── gpu-burn ├── AmdGpuMonitor.cpp ├── AmdGpuMonitor.h ├── BurnKernel.cpp ├── BurnKernel.h ├── GpuMonitor.h ├── Makefile ├── common.cpp ├── common.h └── gpuburn.cpp ├── mini-nbody ├── LICENSE ├── README.md ├── cuda │ ├── nbody-block.cu │ ├── nbody-orig.cu │ ├── nbody-soa.cu │ ├── nbody-unroll.cu │ ├── shmoo-cuda-nbody-block.sh │ ├── shmoo-cuda-nbody-ftz.sh │ ├── shmoo-cuda-nbody-orig.sh │ ├── shmoo-cuda-nbody-soa.sh │ └── shmoo-cuda-nbody-unroll.sh ├── hip │ ├── HIP-nbody-block.sh │ ├── HIP-nbody-orig.sh │ ├── HIP-nbody-soa.sh │ ├── nbody-block.cpp │ ├── nbody-orig.cpp │ └── nbody-soa.cpp ├── mic │ ├── nbody-align.c │ ├── nbody-block.c │ ├── nbody-soa.c │ ├── shmoo-mic-nbody-align.sh │ ├── 
shmoo-mic-nbody-block.sh │ ├── shmoo-mic-nbody-ftz.sh │ ├── shmoo-mic-nbody-orig.sh │ └── shmoo-mic-nbody-soa.sh ├── nbody.c ├── shmoo-cpu-nbody.sh └── timer.h ├── openmp-helloworld ├── CMakeLists.txt ├── Makefile ├── README.md └── openmp_helloworld.cpp ├── reduction ├── Makefile ├── README.md ├── reduction.cpp └── run.sh ├── rtm8 ├── Makefile ├── README.md ├── build_cuda.sh ├── build_fortran.sh ├── build_hip.sh ├── mysecond.c ├── rtm8.cpp ├── rtm8.cu └── rtm8.f ├── strided-access ├── CL │ ├── cl.h │ ├── cl.hpp │ ├── cl_d3d10.h │ ├── cl_ext.h │ ├── cl_gl.h │ ├── cl_gl_ext.h │ ├── cl_platform.h │ └── opencl.h ├── LICENSE.txt ├── Makefile ├── README.txt ├── benchmark-cuda.cu ├── benchmark-hip.cpp ├── benchmark-hip.cu ├── benchmark-opencl.cpp ├── benchmark-openmp.cpp ├── benchmark-openmp2.cpp ├── benchmark-utils.hpp └── results │ ├── LICENSE.txt │ ├── k20m.txt │ ├── plot.gnuplot │ ├── strided-access.eps │ ├── strided-access.pdf │ ├── strided-access.png │ ├── w9100.txt │ ├── xeon-e5-2670v3.txt │ └── xeon-phi-7120.txt ├── test_all.sh └── vectorAdd ├── Makefile ├── README └── vectoradd_hip.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | 3 | vectorAdd/vectoradd_hip.exe 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "mixbench"] 2 | path = mixbench 3 | url = https://github.com/ekondis/mixbench.git 4 | [submodule "GPU-STREAM"] 5 | path = GPU-STREAM 6 | url = https://github.com/UoB-HPC/GPU-STREAM.git 7 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/BinomialOption/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | SOURCES = BinomialOption.cpp 5 | OBJECTS = $(SOURCES:.cpp=.o) 6 | 7 | 
EXECUTABLE=./BinomialOption 8 | 9 | .PHONY: test 10 | 11 | 12 | all: $(EXECUTABLE) test 13 | 14 | CXXFLAGS =-g 15 | CXX=$(HIPCC) 16 | 17 | 18 | $(EXECUTABLE): $(OBJECTS) 19 | $(HIPCC) $(OBJECTS) -o $@ 20 | 21 | 22 | test: $(EXECUTABLE) 23 | $(EXECUTABLE) 24 | 25 | 26 | clean: 27 | rm -f $(EXECUTABLE) 28 | rm -f $(OBJECTS) 29 | rm -f $(HIP_PATH)/src/*.o 30 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/BitonicSort/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | SOURCES = BitonicSort.cpp 5 | OBJECTS = $(SOURCES:.cpp=.o) 6 | 7 | EXECUTABLE=./BitonicSort 8 | 9 | .PHONY: test 10 | 11 | 12 | all: $(EXECUTABLE) test 13 | 14 | CXXFLAGS =-g 15 | CXX=$(HIPCC) 16 | 17 | 18 | $(EXECUTABLE): $(OBJECTS) 19 | $(HIPCC) $(OBJECTS) -o $@ 20 | 21 | 22 | test: $(EXECUTABLE) 23 | $(EXECUTABLE) 24 | 25 | 26 | clean: 27 | rm -f $(EXECUTABLE) 28 | rm -f $(OBJECTS) 29 | rm -f $(HIP_PATH)/src/*.o 30 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/FastWalshTransform/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | SOURCES = FastWalshTransform.cpp 5 | OBJECTS = $(SOURCES:.cpp=.o) 6 | 7 | EXECUTABLE=./FastWalshTransform 8 | 9 | .PHONY: test 10 | 11 | 12 | all: $(EXECUTABLE) test 13 | 14 | CXXFLAGS =-g 15 | CXX=$(HIPCC) 16 | 17 | 18 | $(EXECUTABLE): $(OBJECTS) 19 | $(HIPCC) $(OBJECTS) -o $@ 20 | 21 | 22 | test: $(EXECUTABLE) 23 | $(EXECUTABLE) 24 | 25 | 26 | clean: 27 | rm -f $(EXECUTABLE) 28 | rm -f $(OBJECTS) 29 | rm -f $(HIP_PATH)/src/*.o 30 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/FloydWarshall/Makefile: 
-------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | SOURCES = FloydWarshall.cpp 5 | OBJECTS = $(SOURCES:.cpp=.o) 6 | 7 | EXECUTABLE=./FloydWarshall 8 | 9 | .PHONY: test 10 | 11 | 12 | all: $(EXECUTABLE) test 13 | 14 | CXXFLAGS =-g 15 | CXX=$(HIPCC) 16 | 17 | 18 | $(EXECUTABLE): $(OBJECTS) 19 | $(HIPCC) $(OBJECTS) -o $@ 20 | 21 | 22 | test: $(EXECUTABLE) 23 | $(EXECUTABLE) 24 | 25 | 26 | clean: 27 | rm -f $(EXECUTABLE) 28 | rm -f $(OBJECTS) 29 | rm -f $(HIP_PATH)/src/*.o 30 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/HelloWorld/HelloWorld.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #define SAMPLE_VERSION "HIP-Examples-Application-v1.0" 32 | #define SUCCESS 0 33 | #define FAILURE 1 34 | 35 | using namespace std; 36 | 37 | __global__ void helloworld(char* in, char* out) 38 | { 39 | int num = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x; 40 | out[num] = in[num] + 1; 41 | } 42 | 43 | int main(int argc, char* argv[]) 44 | { 45 | 46 | hipDeviceProp_t devProp; 47 | hipGetDeviceProperties(&devProp, 0); 48 | cout << " System minor " << devProp.minor << endl; 49 | cout << " System major " << devProp.major << endl; 50 | cout << " agent prop name " << devProp.name << endl; 51 | 52 | /* Initial input,output for the host and create memory objects for the kernel*/ 53 | const char* input = "GdkknVnqkc"; 54 | size_t strlength = strlen(input); 55 | cout << "input string:" << endl; 56 | cout << input << endl; 57 | char *output = (char*) malloc(strlength + 1); 58 | 59 | char* inputBuffer; 60 | char* outputBuffer; 61 | hipMalloc((void**)&inputBuffer, (strlength + 1) * sizeof(char)); 62 | hipMalloc((void**)&outputBuffer, (strlength + 1) * sizeof(char)); 63 | 64 | hipMemcpy(inputBuffer, input, (strlength + 1) * sizeof(char), hipMemcpyHostToDevice); 65 | 66 | hipLaunchKernelGGL(helloworld, 67 | dim3(1), 68 | dim3(strlength), 69 | 0, 0, 70 | inputBuffer ,outputBuffer ); 71 | 72 | hipMemcpy(output, outputBuffer,(strlength + 1) * sizeof(char), hipMemcpyDeviceToHost); 73 | 74 | hipFree(inputBuffer); 75 | hipFree(outputBuffer); 76 | 77 | output[strlength] = '\0'; //Add the terminal character to the end of output. 
78 | cout << "\noutput string:" << endl; 79 | cout << output << endl; 80 | 81 | free(output); 82 | 83 | std::cout<<"Passed!\n"; 84 | return SUCCESS; 85 | } 86 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/HelloWorld/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | SOURCES = HelloWorld.cpp 5 | OBJECTS = $(SOURCES:.cpp=.o) 6 | 7 | EXECUTABLE=./HelloWorld 8 | 9 | .PHONY: test 10 | 11 | 12 | all: $(EXECUTABLE) test 13 | 14 | CXXFLAGS =-g 15 | CXX=$(HIPCC) 16 | 17 | 18 | $(EXECUTABLE): $(OBJECTS) 19 | $(HIPCC) $(OBJECTS) -o $@ 20 | 21 | 22 | test: $(EXECUTABLE) 23 | $(EXECUTABLE) 24 | 25 | 26 | clean: 27 | rm -f $(EXECUTABLE) 28 | rm -f $(OBJECTS) 29 | rm -f $(HIP_PATH)/src/*.o 30 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/Histogram/Histogram.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/Histogram/Histogram.cpp -------------------------------------------------------------------------------- /HIP-Examples-Applications/Histogram/Histogram.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/Histogram/Histogram.hpp -------------------------------------------------------------------------------- /HIP-Examples-Applications/Histogram/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | SOURCES = Histogram.cpp 5 | OBJECTS = $(SOURCES:.cpp=.o) 6 | 7 | EXECUTABLE=./Histogram 8 | 9 | .PHONY: test 10 | 11 | 12 | all: 
# Remove the sample's build outputs, plus any object files under
# $(HIP_PATH)/src. HIP_PATH is used without a trailing slash elsewhere in this
# Makefile ($(HIP_PATH)/bin/hipcc), so the "/" before src is required;
# without it the pattern expands to e.g. /opt/rocmsrc/*.o and never matches.
clean:
	rm -f $(EXECUTABLE)
	rm -f $(OBJECTS)
	rm -f $(HIP_PATH)/src/*.o
HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | SOURCES = RecursiveGaussian.cpp 5 | OBJECTS = $(SOURCES:.cpp=.o) 6 | 7 | EXECUTABLE=./RecursiveGaussian 8 | 9 | .PHONY: test 10 | 11 | 12 | all: $(EXECUTABLE) test 13 | 14 | CXXFLAGS =-g 15 | CXX=$(HIPCC) 16 | 17 | 18 | $(EXECUTABLE): $(OBJECTS) 19 | $(HIPCC) $(OBJECTS) -o $@ 20 | 21 | 22 | test: $(EXECUTABLE) 23 | $(EXECUTABLE) 24 | 25 | 26 | clean: 27 | rm -f $(EXECUTABLE) 28 | rm -f $(OBJECTS) 29 | rm -f $(HIP_PATH)/src/*.o 30 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | #ifndef RECURSIVE_GAUSSIAN_H_ 24 | #define RECURSIVE_GAUSSIAN_H_ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include "../include/HIPUtil.hpp" 32 | #include "../include/SDKBitMap.hpp" 33 | 34 | using namespace appsdk; 35 | using namespace std; 36 | 37 | #define INPUT_IMAGE "RecursiveGaussian_Input.bmp" 38 | #define OUTPUT_IMAGE "RecursiveGaussian_Output.bmp" 39 | 40 | #define SAMPLE_VERSION "HIP-Examples-Applications-v1.0" 41 | 42 | #define GROUP_SIZE 256 43 | 44 | /** 45 | * Custom type for gaussian parameters 46 | * precomputation 47 | */ 48 | typedef struct _GaussParms 49 | { 50 | float nsigma; 51 | float alpha; 52 | float ema; 53 | float ema2; 54 | float b1; 55 | float b2; 56 | float a0; 57 | float a1; 58 | float a2; 59 | float a3; 60 | float coefp; 61 | float coefn; 62 | } GaussParms, *pGaussParms; 63 | 64 | 65 | 66 | /** 67 | * Recursive Gaussian 68 | * Class implements OpenRecursive Gaussian sample 69 | */ 70 | 71 | class RecursiveGaussian 72 | { 73 | double setupTime; /**< time taken to setup Openresources and building kernel */ 74 | double kernelTime; /**< time taken to run kernel and read result back */ 75 | 76 | uchar4* inputImageData; /**< Input bitmap data to device */ 77 | uchar4* outputImageData; /**< Output from device */ 78 | 79 | uchar4* inputImageBuffer; /**< memory buffer for input Image*/ 80 | uchar4* tempImageBuffer; /**< memory buffer for storing the transpose of the image*/ 81 | uchar4* outputImageBuffer; /**< memory buffer for Output Image*/ 82 | uchar4* 83 | verificationInput; /**< Input array for reference implementation */ 84 | uchar4* 85 | verificationOutput; /**< Output array for reference implementation */ 86 | 87 | SDKBitMap inputBitmap; /**< Bitmap class object */ 88 | uchar4* pixelData; /**< Pointer to image data */ 89 | unsigned int pixelSize; /**< Size of a pixel in BMP format> */ 90 | GaussParms 91 | oclGP; /**< instance of struct to hold gaussian parameters */ 92 | unsigned 
int width; /**< Width of image */ 93 | unsigned int height; /**< Height of image */ 94 | size_t blockSizeX; /**< Work-group size in x-direction */ 95 | size_t blockSizeY; /**< Work-group size in y-direction */ 96 | size_t blockSize; /**< block size for transpose kernel */ 97 | int iterations; /**< Number of iterations for kernel execution */ 98 | //uchar4 *din, *dout, *dtemp; 99 | 100 | SDKTimer *sampleTimer; /**< SDKTimer object */ 101 | 102 | public: 103 | 104 | HIPCommandArgs *sampleArgs; /**< HIPCommand argument class */ 105 | 106 | /** 107 | * Read bitmap image and allocate host memory 108 | * @param inputImageName name of the input file 109 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 110 | */ 111 | int readInputImage(std::string inputImageName); 112 | 113 | /** 114 | * Write output to an image file 115 | * @param outputImageName name of the output file 116 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 117 | */ 118 | int writeOutputImage(std::string outputImageName); 119 | 120 | /** 121 | * Preprocess gaussian parameters 122 | * @param fSigma sigma value 123 | * @param iOrder order 124 | * @param pGp pointer to gaussian parameter object 125 | */ 126 | void computeGaussParms(float fSigma, int iOrder, GaussParms* pGP); 127 | 128 | /** 129 | * RecursiveGaussian on CPU (for verification) 130 | * @param input input image 131 | * @param output output image 132 | * @param width width of image 133 | * @param height height of image 134 | * @param a0..a3, b1, b2, coefp, coefn gaussian parameters 135 | */ 136 | void recursiveGaussianCPU(uchar4* input, uchar4* output, 137 | const int width, const int height, 138 | const float a0, const float a1, 139 | const float a2, const float a3, 140 | const float b1, const float b2, 141 | const float coefp, const float coefn); 142 | 143 | /** 144 | * Transpose on CPU (for verification) 145 | * @param input input image 146 | * @param output output image 147 | * @param width width of input image 148 | * 
@param height height of input image 149 | */ 150 | void transposeCPU(uchar4* input, uchar4* output, 151 | const int width, const int height); 152 | 153 | /** 154 | * Constructor 155 | * Initialize member variables 156 | */ 157 | RecursiveGaussian() 158 | : inputImageData(NULL), 159 | outputImageData(NULL), 160 | verificationOutput(NULL) 161 | { 162 | sampleArgs = new HIPCommandArgs(); 163 | sampleTimer = new SDKTimer(); 164 | sampleArgs->sampleVerStr = SAMPLE_VERSION; 165 | pixelSize = sizeof(uchar4); 166 | pixelData = NULL; 167 | blockSizeX = GROUP_SIZE; 168 | blockSizeY = 1; 169 | blockSize = 1; 170 | iterations = 1; 171 | } 172 | 173 | ~RecursiveGaussian() 174 | { 175 | } 176 | 177 | inline long long get_time() 178 | { 179 | struct timeval tv; 180 | gettimeofday(&tv, 0); 181 | return (tv.tv_sec * 1000000) + tv.tv_usec; 182 | } 183 | 184 | /** 185 | * Allocate image memory and Load bitmap file 186 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 187 | */ 188 | int setupRecursiveGaussian(); 189 | 190 | /** 191 | * Openrelated initialisations. 192 | * Set up Context, Device list, Command Queue, Memory buffers 193 | * Build kernel program executable 194 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 195 | */ 196 | int setupHIP(); 197 | 198 | /** 199 | * Set values for kernels' arguments, enqueue calls to the kernels 200 | * on to the command queue, wait till end of kernel execution. 201 | * Get kernel start and end time if timing is enabled 202 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 203 | */ 204 | int runKernels(); 205 | 206 | /** 207 | * Reference CPU implementation of Binomial Option 208 | * for performance comparison 209 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 210 | */ 211 | void recursiveGaussianCPUReference(); 212 | 213 | /** 214 | * Override from SDKSample. Print sample stats. 215 | */ 216 | void printStats(); 217 | 218 | /** 219 | * Override from SDKSample. 
Initialize 220 | * command line parser, add custom options 221 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 222 | */ 223 | int initialize(); 224 | 225 | /** 226 | * Override from SDKSample, adjust width and height 227 | * of execution domain, perform all sample setup 228 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 229 | */ 230 | int setup(); 231 | 232 | /** 233 | * Override from SDKSample 234 | * Run OpenSobel Filter 235 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 236 | */ 237 | int run(); 238 | 239 | /** 240 | * Override from SDKSample 241 | * Cleanup memory allocations 242 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 243 | */ 244 | int cleanup(); 245 | 246 | /** 247 | * Override from SDKSample 248 | * Verify against reference implementation 249 | * @return SDK_SUCCESS on success and SDK_FAILURE on failure 250 | */ 251 | int verifyResults(); 252 | }; 253 | 254 | #endif // RECURSIVE_GAUSSIAN_H_ 255 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian_Input.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian_Input.bmp -------------------------------------------------------------------------------- /HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian_Output.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian_Output.bmp -------------------------------------------------------------------------------- /HIP-Examples-Applications/SimpleConvolution/FilterCoeff.h: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/SimpleConvolution/FilterCoeff.h -------------------------------------------------------------------------------- /HIP-Examples-Applications/SimpleConvolution/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | SOURCES = SimpleConvolution.cpp 5 | OBJECTS = $(SOURCES:.cpp=.o) 6 | 7 | EXECUTABLE=./SimpleConvolution 8 | 9 | .PHONY: test 10 | 11 | 12 | all: $(EXECUTABLE) test 13 | 14 | CXXFLAGS =-g 15 | CXX=$(HIPCC) 16 | 17 | 18 | $(EXECUTABLE): $(OBJECTS) 19 | $(HIPCC) $(OBJECTS) -o $@ 20 | 21 | 22 | test: $(EXECUTABLE) 23 | $(EXECUTABLE) 24 | 25 | 26 | clean: 27 | rm -f $(EXECUTABLE) 28 | rm -f $(OBJECTS) 29 | rm -f $(HIP_PATH)/src/*.o 30 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/SimpleConvolution/SimpleConvolution.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/SimpleConvolution/SimpleConvolution.cpp -------------------------------------------------------------------------------- /HIP-Examples-Applications/SimpleConvolution/SimpleConvolution.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/SimpleConvolution/SimpleConvolution.hpp -------------------------------------------------------------------------------- /HIP-Examples-Applications/dct/Makefile: -------------------------------------------------------------------------------- 1 | HIPCC=/opt/rocm/bin/hipcc 2 | 3 | SOURCES = dct.cpp 
# Makefile for the dwtHaar1D HIP sample: builds ./dwtHaar1D with hipcc and runs it.
#
# HIP_PATH selects the HIP/ROCm installation and can be overridden from the
# environment or the make command line (e.g. `make HIP_PATH=/custom/rocm`),
# as in the sibling sample Makefiles. The default keeps the compiler at
# /opt/rocm/bin/hipcc, the path that used to be hard-coded here.
HIP_PATH ?= /opt/rocm
HIPCC = $(HIP_PATH)/bin/hipcc

SOURCES = dwtHaar1D.cpp
OBJECTS = $(SOURCES:.cpp=.o)

EXECUTABLE=./dwtHaar1D

.PHONY: test


# Default goal: build the sample, then run it.
all: $(EXECUTABLE) test

# Object files are produced by make's built-in .cpp -> .o rule, which reads
# CXX and CXXFLAGS.
CXXFLAGS =-g
CXX=$(HIPCC)


$(EXECUTABLE): $(OBJECTS)
	$(HIPCC) $(OBJECTS) -o $@


test: $(EXECUTABLE)
	$(EXECUTABLE)


clean:
	rm -f $(EXECUTABLE)
	rm -f $(OBJECTS)
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #ifndef HIPSAMPLE_H_ 24 | #define HIPSAMPLE_H_ 25 | 26 | /****************************************************************************** 27 | * Included header files * 28 | ******************************************************************************/ 29 | 30 | #include "SDKUtil.hpp" 31 | 32 | namespace appsdk 33 | { 34 | 35 | class HIPCommandArgs: public SDKCmdArgsParser 36 | { 37 | public: 38 | unsigned int deviceId; /**< Cmd Line Option- if deviceId */ 39 | bool enableDeviceId; /**< Cmd Line Option- if enableDeviceId */ 40 | 41 | /** 42 | *********************************************************************** 43 | * @fn initialize 44 | * @brief Initialize the resources used by tests 45 | * @return 0 on success Positive if expected and Non-zero on failure 46 | **********************************************************************/ 47 | int initialize() 48 | { 49 | int defaultOptions = 5; 50 | Option *optionList = new Option[defaultOptions]; 51 | CHECK_ALLOCATION(optionList, "Error. Failed to allocate memory (optionList)\n"); 52 | optionList[0]._sVersion = "q"; 53 | optionList[0]._lVersion = "quiet"; 54 | optionList[0]._description = "Quiet mode. 
Suppress all text output."; 55 | optionList[0]._type = CA_NO_ARGUMENT; 56 | optionList[0]._value = &quiet; 57 | optionList[1]._sVersion = "e"; 58 | optionList[1]._lVersion = "verify"; 59 | optionList[1]._description = "Verify results against reference implementation."; 60 | optionList[1]._type = CA_NO_ARGUMENT; 61 | optionList[1]._value = &verify; 62 | optionList[2]._sVersion = "t"; 63 | optionList[2]._lVersion = "timing"; 64 | optionList[2]._description = "Print timing."; 65 | optionList[2]._type = CA_NO_ARGUMENT; 66 | optionList[2]._value = &timing; 67 | optionList[3]._sVersion = "v"; 68 | optionList[3]._lVersion = "version"; 69 | optionList[3]._description = "AMD APP SDK version string."; 70 | optionList[3]._type = CA_NO_ARGUMENT; 71 | optionList[3]._value = &version; 72 | optionList[4]._sVersion = "d"; 73 | optionList[4]._lVersion = "deviceId"; 74 | optionList[4]._description = 75 | "Select deviceId to be used[0 to N-1 where N is number devices available]."; 76 | optionList[4]._type = CA_ARG_INT; 77 | optionList[4]._value = &deviceId; 78 | _numArgs = defaultOptions; 79 | _options = optionList; 80 | return SDK_SUCCESS; 81 | } 82 | 83 | /** 84 | *********************************************************************** 85 | * @brief Destroy the resources used by tests 86 | **********************************************************************/ 87 | virtual ~HIPCommandArgs() 88 | { 89 | } 90 | 91 | /** 92 | *********************************************************************** 93 | * @brief Constructor, initialize the resources used by tests 94 | * @param sampleName Name of the Sample 95 | **********************************************************************/ 96 | HIPCommandArgs() 97 | { 98 | deviceId = 0; 99 | enableDeviceId = false; 100 | } 101 | 102 | /** 103 | *********************************************************************** 104 | * @brief parseCommandLine parses the command line options given by user 105 | * @param argc Number of elements in cmd line 
input 106 | * @param argv array of char* storing the CmdLine Options 107 | * @return 0 on success Positive if expected and Non-zero on failure 108 | **********************************************************************/ 109 | int parseCommandLine(int argc, char **argv) 110 | { 111 | if(!parse(argv,argc)) 112 | { 113 | usage(); 114 | if(isArgSet("h",true) == true) 115 | { 116 | exit(SDK_SUCCESS); 117 | } 118 | return SDK_FAILURE; 119 | } 120 | if(isArgSet("h",true) == true) 121 | { 122 | usage(); 123 | exit(SDK_SUCCESS); 124 | } 125 | if(isArgSet("v", true) 126 | || isArgSet("version", false)) 127 | { 128 | std::cout << "SDK version : " << sampleVerStr.c_str() 129 | << std::endl; 130 | exit(0); 131 | } 132 | if(isArgSet("d",true) 133 | || isArgSet("deviceId",false)) 134 | { 135 | enableDeviceId = true; 136 | } 137 | return SDK_SUCCESS; 138 | } 139 | }; 140 | 141 | } 142 | #endif 143 | -------------------------------------------------------------------------------- /HIP-Examples-Applications/include/SDKBitMap.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/include/SDKBitMap.hpp -------------------------------------------------------------------------------- /HIP-Examples-Applications/include/SDKFile.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/include/SDKFile.hpp -------------------------------------------------------------------------------- /HIP-Examples-Applications/include/SDKThread.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/include/SDKThread.hpp 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HIP-Examples 2 | 3 | ## Deprecation Notice 4 | Please note that AMD will deprecate and archive the `hip-examples` repository. Please visit [rocm-examples](https://github.com/ROCm/rocm-examples), the new home for ROCm examples. 5 | 6 | ## Examples for HIP. 7 | This depot should be extracted into the root directory of an existing HIP depot. 8 | 9 | The HIP ports of the following benchmarks have been upstreamed on GitHub: 10 | 11 | * mixbench: https://github.com/ekondis/mixbench.git 12 | * GPU-Stream: https://github.com/UoB-HPC/GPU-STREAM.git 13 | 14 | mixbench and GPU-Stream have been added as submodules of this repository. To fetch the submodule contents, run: 15 | 16 | ```bash 17 | git submodule init 18 | git submodule update 19 | ``` 20 | -------------------------------------------------------------------------------- /add4/LICENSE: -------------------------------------------------------------------------------- 1 | *============================================================================== 2 | *------------------------------------------------------------------------------ 3 | * Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC 4 | * Based on John D. McCalpin’s original STREAM benchmark for CPUs 5 | *------------------------------------------------------------------------------ 6 | * License: 7 | * 1. You are free to use this program and/or to redistribute 8 | * this program. 9 | * 2. You are free to modify this program for your own use, 10 | * including commercial use, subject to the publication 11 | * restrictions in item 3. 12 | * 3. You are free to publish results obtained from running this 13 | * program, or from works that you derive from this program, 14 | * with the following limitations: 15 | * 3a. 
In order to be referred to as "GPU-STREAM benchmark results", 16 | * published results must be in conformance to the GPU-STREAM 17 | * Run Rules published at 18 | * http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules 19 | * and incorporated herein by reference. 20 | * The copyright holders retain the 21 | * right to determine conformity with the Run Rules. 22 | * 3b. Results based on modified source code or on runs not in 23 | * accordance with the GPU-STREAM Run Rules must be clearly 24 | * labelled whenever they are published. Examples of 25 | * proper labelling include: 26 | * "tuned GPU-STREAM benchmark results" 27 | * "based on a variant of the GPU-STREAM benchmark code" 28 | * Other comparable, clear and reasonable labelling is 29 | * acceptable. 30 | * 3c. Submission of results to the GPU-STREAM benchmark web site 31 | * is encouraged, but not required. 32 | * 4. Use of this program or creation of derived works based on this 33 | * program constitutes acceptance of these licensing restrictions. 34 | * 5. Absolutely no warranty is expressed or implied. 
35 | *———————————————————————————————————------------------------------------------- 36 | -------------------------------------------------------------------------------- /add4/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -std=c++11 -O3 2 | 3 | all: gpu-stream-hip 4 | 5 | common.o: common.cpp common.h Makefile 6 | 7 | HIP_PATH?= $(wildcard /opt/rocm) 8 | 9 | HIPCC=$(HIP_PATH)/bin/hipcc 10 | 11 | hip-stream.o : hip-stream.cpp 12 | $(HIPCC) $(CXXFLAGS) -c $< -o $@ 13 | 14 | gpu-stream-hip: hip-stream.o common.o Makefile 15 | ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0) 16 | $(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@ 17 | else 18 | $(error "Cannot find $(HIPCC), please install HIP toolkit") 19 | endif 20 | 21 | 22 | .PHONY: clean 23 | 24 | clean: 25 | rm -f gpu-stream-hip *.o 26 | 27 | -------------------------------------------------------------------------------- /add4/README.md: -------------------------------------------------------------------------------- 1 | Add4 2 | ========= 3 | This benchmark is derived from the GPU-STREAM benchmark. 4 | To increase the proportion of reads in the benchmark's "add" kernel, the number of input arrays for "add" is increased from two to four. 5 | With this modification, we could achieve 90% efficiency on the FIJI Nano GPU. 6 | 7 | 8 | GPU-STREAM 9 | ========== 10 | 11 | Measure memory transfer rates to/from global device memory on GPUs. 12 | This benchmark is similar in spirit, and based on, the STREAM benchmark [1] for CPUs. 13 | 14 | Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time. 
15 | 16 | Usage 17 | ----- 18 | 19 | Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6.5) 20 | 21 | Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda` 22 | 23 | For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers: 24 | http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/ 25 | Install the HCC compiler: 26 | https://bitbucket.org/multicoreware/hcc/wiki/Home 27 | Install HIP: 28 | https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP 29 | 30 | Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip' 31 | 32 | Android 33 | ------- 34 | 35 | Assuming you have a recent Android NDK available, you can use the 36 | toolchain that it provides to build GPU-STREAM. You should first 37 | use the NDK to generate a standalone toolchain: 38 | 39 | # Select a directory to install the toolchain to 40 | ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain 41 | 42 | ${NDK}/build/tools/make-standalone-toolchain.sh \ 43 | --platform=android-14 \ 44 | --toolchain=arm-linux-androideabi-4.8 \ 45 | --install-dir=${ANDROID_NATIVE_TOOLCHAIN} 46 | 47 | Make sure that the OpenCL headers and library (libOpenCL.so) are 48 | available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`. 49 | 50 | You should then be able to build GPU-STREAM: 51 | 52 | make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++ 53 | 54 | Copy the executable and OpenCL kernels to the device: 55 | 56 | adb push gpu-stream-ocl /data/local/tmp 57 | adb push ocl-stream-kernels.cl /data/local/tmp 58 | 59 | Run GPU-STREAM from an adb shell: 60 | 61 | adb shell 62 | cd /data/local/tmp 63 | 64 | # Use float if device doesn't support double, and reduce array size 65 | ./gpu-stream-ocl --float -n 6 -s 10000000 66 | 67 | Results 68 | ------- 69 | 70 | Sample results can be found in the `results` subdirectory. 
If you would like to submit updated results, please submit a Pull Request. 71 | 72 | [1]: McCalpin, John D., 1995: "Memory Bandwidth and Machine Balance in Current High Performance Computers", IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter, December 1995. 73 | -------------------------------------------------------------------------------- /add4/buildit.sh: -------------------------------------------------------------------------------- 1 | make clean 2 | make gpu-stream-hip 3 | -------------------------------------------------------------------------------- /add4/common.cpp: -------------------------------------------------------------------------------- 1 | /*============================================================================= 2 | *------------------------------------------------------------------------------ 3 | * Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC 4 | * Based on John D. McCalpin’s original STREAM benchmark for CPUs 5 | *------------------------------------------------------------------------------ 6 | * License: 7 | * 1. You are free to use this program and/or to redistribute 8 | * this program. 9 | * 2. You are free to modify this program for your own use, 10 | * including commercial use, subject to the publication 11 | * restrictions in item 3. 12 | * 3. You are free to publish results obtained from running this 13 | * program, or from works that you derive from this program, 14 | * with the following limitations: 15 | * 3a. In order to be referred to as "GPU-STREAM benchmark results", 16 | * published results must be in conformance to the GPU-STREAM 17 | * Run Rules published at 18 | * http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules 19 | * and incorporated herein by reference. 20 | * The copyright holders retain the 21 | * right to determine conformity with the Run Rules. 22 | * 3b. 
/**
 * Parse a non-negative decimal integer into *output.
 *
 * The whole string must be consumed. Empty input, trailing characters,
 * a minus sign, and values that do not survive the narrowing to
 * unsigned int are rejected. *output is only written on success.
 *
 * @param str    NUL-terminated text to parse
 * @param output receives the parsed value on success
 * @return non-zero on success, 0 on failure
 */
int parseUInt(const char *str, unsigned int *output)
{
  // strtoul silently negates "-N", so a minus sign is never valid here.
  if (strchr(str, '-') != NULL)
    return 0;

  char *next;
  const unsigned long parsed = strtoul(str, &next, 10);

  // No digits at all, or something left over after the number.
  if (next == str || *next != '\0')
    return 0;

  // Round-trip check: catches values that do not fit in unsigned int
  // wherever unsigned long is wider than unsigned int.
  const unsigned int narrowed = static_cast<unsigned int>(parsed);
  if (static_cast<unsigned long>(narrowed) != parsed)
    return 0;

  *output = narrowed;
  return 1;
}

/**
 * Parse a byte count with an optional single-letter binary suffix.
 *
 * The number is read with base auto-detection (strtoull base 0, so
 * "0x10" and "010" are accepted). One trailing 'k'/'K' multiplies the
 * value by 1024 and one trailing 'm'/'M' multiplies it by 1024*1024.
 * Anything else after the number, an empty string, a minus sign, or a
 * product that overflows size_t is rejected. *output is only written
 * on success.
 *
 * @param str    NUL-terminated text to parse
 * @param output receives the size in bytes on success
 * @return non-zero on success, 0 on failure
 */
int parseSize(const char *str, size_t *output)
{
  // strtoull silently negates "-N", so a minus sign is never valid here.
  if (strchr(str, '-') != NULL)
    return 0;

  char *next;
  const unsigned long long parsed = strtoull(str, &next, 0);
  if (next == str)
    return 0;

  // Consume the optional suffix so that it no longer counts as
  // trailing garbage in the check below.
  unsigned long long multiplier = 1;
  if ((*next == 'k') || (*next == 'K')) {
    multiplier = 1024ULL;
    ++next;
  }
  else if ((*next == 'm') || (*next == 'M')) {
    multiplier = 1024ULL * 1024ULL;
    ++next;
  }

  if (*next != '\0')
    return 0;

  // Refuse products that cannot be represented in size_t.
  const unsigned long long limit = static_cast<size_t>(-1);
  if (parsed > limit / multiplier)
    return 0;

  *output = static_cast<size_t>(parsed * multiplier);
  return 1;
}
./gpu-stream-cuda [OPTIONS]" << std::endl << std::endl; 146 | std::cout << "Options:" << std::endl; 147 | std::cout << " -h --help Print the message" << std::endl; 148 | std::cout << " --list List available devices" << std::endl; 149 | std::cout << " --device INDEX Select device at INDEX" << std::endl; 150 | std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl; 151 | std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; 152 | std::cout << " --groups Set number of groups to launch - each work-item proceses multiple array items" << std::endl; 153 | std::cout << " --groupSize Set size of each group (default 1024)" << std::endl; 154 | std::cout << " --pad Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl; 155 | std::cout << " --float Use floats (rather than doubles)" << std::endl; 156 | std::cout << std::endl; 157 | exit(0); 158 | } 159 | else 160 | { 161 | std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')" 162 | << std::endl; 163 | exit(1); 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /add4/common.h: -------------------------------------------------------------------------------- 1 | /*============================================================================= 2 | *------------------------------------------------------------------------------ 3 | * Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC 4 | * Based on John D. McCalpin’s original STREAM benchmark for CPUs 5 | *------------------------------------------------------------------------------ 6 | * License: 7 | * 1. You are free to use this program and/or to redistribute 8 | * this program. 9 | * 2. You are free to modify this program for your own use, 10 | * including commercial use, subject to the publication 11 | * restrictions in item 3. 12 | * 3. 
You are free to publish results obtained from running this 13 | * program, or from works that you derive from this program, 14 | * with the following limitations: 15 | * 3a. In order to be referred to as "GPU-STREAM benchmark results", 16 | * published results must be in conformance to the GPU-STREAM 17 | * Run Rules published at 18 | * http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules 19 | * and incorporated herein by reference. 20 | * The copyright holders retain the 21 | * right to determine conformity with the Run Rules. 22 | * 3b. Results based on modified source code or on runs not in 23 | * accordance with the GPU-STREAM Run Rules must be clearly 24 | * labelled whenever they are published. Examples of 25 | * proper labelling include: 26 | * "tuned GPU-STREAM benchmark results" 27 | * "based on a variant of the GPU-STREAM benchmark code" 28 | * Other comparable, clear and reasonable labelling is 29 | * acceptable. 30 | * 3c. Submission of results to the GPU-STREAM benchmark web site 31 | * is encouraged, but not required. 32 | * 4. Use of this program or creation of derived works based on this 33 | * program constitutes acceptance of these licensing restrictions. 34 | * 5. Absolutely no warranty is expressed or implied. 
35 | *———————————————————————————————————-----------------------------------------*/ 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #define VERSION_STRING "1.0" 45 | 46 | extern void parseArguments(int argc, char *argv[]); 47 | 48 | extern void listDevices(void); 49 | 50 | extern unsigned int ARRAY_SIZE; 51 | extern size_t ARRAY_PAD_BYTES; 52 | extern unsigned int NTIMES; 53 | 54 | extern unsigned int groups; 55 | extern unsigned int groupSize; 56 | extern bool useFloat; 57 | 58 | extern unsigned int deviceIndex; 59 | 60 | 61 | template < typename T > 62 | void check_solution(void* a_in, void* b_in, void* c_in) 63 | { 64 | // Generate correct solution 65 | T golda = 1.0; 66 | T goldb = 2.0; 67 | T goldc = 0.0; 68 | T goldd = 1.0; 69 | T golde = 1.0; 70 | T * a = static_cast(a_in); 71 | T * b = static_cast(b_in); 72 | T * c = static_cast(c_in); 73 | 74 | const T scalar = 3.0; 75 | 76 | for (unsigned int i = 0; i < NTIMES; i++) 77 | { 78 | // Double 79 | goldc = golda; 80 | goldb = scalar * goldc; 81 | goldc = golda + goldb + goldd + golde; 82 | golda = goldb + scalar * goldc; 83 | } 84 | 85 | // Calculate average error 86 | double erra = 0.0; 87 | double errb = 0.0; 88 | double errc = 0.0; 89 | 90 | for (unsigned int i = 0; i < ARRAY_SIZE; i++) 91 | { 92 | erra += fabs(a[i] - golda); 93 | errb += fabs(b[i] - goldb); 94 | errc += fabs(c[i] - goldc); 95 | } 96 | 97 | erra /= ARRAY_SIZE; 98 | errb /= ARRAY_SIZE; 99 | errc /= ARRAY_SIZE; 100 | 101 | double epsi = std::numeric_limits::epsilon() * 100; 102 | 103 | if (erra > epsi) 104 | std::cout 105 | << "Validation failed on a[]. Average error " << erra 106 | << std::endl; 107 | if (errb > epsi) 108 | std::cout 109 | << "Validation failed on b[]. Average error " << errb 110 | << std::endl; 111 | if (errc > epsi) 112 | std::cout 113 | << "Validation failed on c[]. 
Average error " << errc 114 | << std::endl; 115 | } 116 | 117 | -------------------------------------------------------------------------------- /add4/run_sweep.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #@groups=(0, 64, 128, 192, 256, 384, 512); 4 | @groups=(64, 128, 192, 256, 384, 512); 5 | @groupSize=(64, 128, 192, 256, 384, 512, 1024); 6 | 7 | 8 | foreach $g (@groups) { 9 | foreach $gs (@groupSize) { 10 | $f = "hipstream.float.$g.$gs"; 11 | $cmd = "./gpu-stream-hip --float --groups $g --groupSize $gs"; 12 | print "Run $f : $cmd\n"; 13 | 14 | system "$cmd > $f"; 15 | 16 | } 17 | } 18 | 19 | foreach $g (@groups) { 20 | foreach $gs (@groupSize) { 21 | $f = "hipstream.double.$g.$gs"; 22 | $cmd = "./gpu-stream-hip --groups $g --groupSize $gs"; 23 | print "Run $f : $cmd\n"; 24 | 25 | system "$cmd > $f"; 26 | 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /add4/runhip.sh: -------------------------------------------------------------------------------- 1 | echo ./gpu-stream-hip 2 | ./gpu-stream-hip 3 | echo ./gpu-stream-hip --groups 256 --groupSize 256 4 | ./gpu-stream-hip --groups 256 --groupSize 256 5 | echo ./gpu-stream-hip --float 6 | ./gpu-stream-hip --float 7 | echo ./gpu-stream-hip --float --groups 256 --groupSize 256 8 | ./gpu-stream-hip --float --groups 256 --groupSize 256 9 | -------------------------------------------------------------------------------- /common/hip.all.make: -------------------------------------------------------------------------------- 1 | include $(HIP_PATH)/examples/common/hip.prologue.make 2 | include $(HIP_PATH)/examples/common/hip.epilogue.make 3 | -------------------------------------------------------------------------------- /common/hip.prologue.make: -------------------------------------------------------------------------------- 1 | # This file is designed to be included at beginning of Makefile, right after 
setting HIP_PATH. 2 | # Note: define $HIP_PATH before including this file. 3 | # HIP_PATH should be relevant to the parent makefile 4 | # 5 | # It should not include any concrete makefile steps, so "make" still runs the first step in the Makefile. 6 | # 7 | 8 | #------ 9 | ##Provide default if not already set: 10 | HIP_PATH?=../.. 11 | HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) 12 | 13 | # CUDA toolkit installation path 14 | CUDA_DIR?=/usr/local/cuda-7.5 15 | # CUDA toolkit libraries 16 | CUDA_LIB_DIR := $(CUDA_DIR)/lib 17 | ifeq ($(shell uname -m), x86_64) 18 | ifeq ($(shell if test -d $(CUDA_DIR)/lib64; then echo T; else echo F; fi), T) 19 | CUDA_LIB_DIR := $(CUDA_DIR)/lib64 20 | endif 21 | endif 22 | 23 | # Some samples mix openmp with gpu acceleration. 24 | # Those unfortunately have to be compiled with gcc, not clang. 25 | # nvcc (7.5) can handle openmp though. 26 | # use OMPCC and OMP_FLAGS 27 | 28 | HIPCC=$(HIP_PATH)/bin/hipcc 29 | HIPLD=$(HIP_PATH)/bin/hipcc 30 | 31 | #-- 32 | # Set up automatic make of HIP cpp depenendencies 33 | # TODO - this can be removed when HIP has a proper make structure. 34 | #HIP_SOURCES = $(HIP_PATH)/src/hip_hcc.cpp 35 | 36 | HIPCC_FLAGS += -I../../common 37 | # 'make dbg=1' enables HIPCC debugging and no opt switch. 38 | ifeq ($(dbg),1) 39 | HIPCC_FLAGS += -g 40 | OMP_FLAGS += -g 41 | else ifeq ($(opt),0) 42 | HIPCC_FLAGS += -O0 43 | OMP_FLAGS += -O0 44 | else ifeq ($(opt),3) 45 | HIPCC_FLAGS += -O3 46 | OMP_FLAGS += -O3 47 | else 48 | HIPCC_FLAGS += -O2 49 | OMP_FLAGS += -O2 50 | endif 51 | 52 | ifeq ($(HIP_PLATFORM), nvcc) 53 | OMPCC = gcc 54 | OMP_FLAGS = $(HIPCC_FLAGS) 55 | HIP_DEPS = 56 | 57 | else ifeq ($(HIP_PLATFORM), hcc) 58 | #HIP_DEPS = $(HIP_SOURCES:.cpp=.o) 59 | OMPCC = gcc 60 | OMP_FLAGS += -fopenmp 61 | 62 | # Add dependencies to make hip_cc.o and other support files. 
63 | HSA_PATH ?= /opt/hsa 64 | #HIP_SOURCES = $(HIP_PATH)/src/hip_hcc.cpp 65 | #HIP_DEPS = $(HIP_SOURCES:.cpp=.o) 66 | #$(HIP_DEPS): HIPCC_FLAGS += -I$(HSA_PATH)/include 67 | %.o:: %.cpp 68 | $(HIPCC) $(HIPCC_FLAGS) $< -c -o $@ 69 | endif 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | #------ 79 | # 80 | #--- 81 | # Rule for automatic HIPIFY call - assumes original cuda files are stored in local 'cusrc' directory. See kmeans. 82 | #%.cu : cusrc/%.cu 83 | # $(HIPIFY) $< > $@ 84 | #%.cuh : cusrc/%.cuh 85 | # $(HIPIFY) $< > $@ 86 | 87 | 88 | KCFLAGS += $(OPT) -I$(HSA_PATH)/include -I$(HIP_PATH)/include -I$(GRID_LAUNCH_PATH) -I$(AM_PATH)/include 89 | 90 | %.o:: %.cpp 91 | $(HIPCC) $(HIPCC_FLAGS) $< -c -o $@ 92 | 93 | 94 | -------------------------------------------------------------------------------- /cuda-stream/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | 3 | HIPCC=$(HIP_PATH)/bin/hipcc 4 | 5 | CXXFLAGS += -std=c++11 -O3 6 | 7 | stream: stream.cpp 8 | ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0) 9 | ${HIPCC} ${CXXFLAGS} -o $@ $^ 10 | else 11 | $(error "Cannot find $(HIPCC), please install HIP toolkit") 12 | endif 13 | 14 | .PHONY: clean 15 | 16 | clean: 17 | rm -f stream *.o 18 | 19 | -------------------------------------------------------------------------------- /cuda-stream/Makefile.titan: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | ARCH=sm_35 3 | 4 | stream : stream.cpp 5 | hipcc -std=c++11 -ccbin=$(CC) stream.cpp -arch=$(ARCH) -o stream 6 | 7 | 8 | clean : 9 | rm -f stream 10 | -------------------------------------------------------------------------------- /cuda-stream/README.md: -------------------------------------------------------------------------------- 1 | The benchmark is modified from STREAM benchmark implementation with the following kernels: 2 | COPY: a(i) = b(i) 3 | SCALE: a(i) = q*b(i) 4 | SUM: a(i) = 
/**
 * Print the command-line usage text to stdout.
 */
void print_help()
{
  printf(
    "Usage: stream [-s] [-n <elements>] [-b <blocksize>]\n\n"
    " -s\n"
    " Print results in SI units (by default IEC units are used)\n\n"
    " -n <elements>\n"
    " Put <elements> values in the arrays\n"
    " (defaults to 1<<26)\n\n"
    " -b <blocksize>\n"
    " Use <blocksize> as the number of threads in each block\n"
    " (defaults to 192)\n"
  );
}

/**
 * Parse a strictly positive decimal int.
 *
 * @param text NUL-terminated option value
 * @return the value, or -1 when text is empty, has trailing characters,
 *         is not positive, or does not fit in an int
 */
static int parse_positive_int(const char* text)
{
  char* end = nullptr;
  const long value = std::strtol(text, &end, 10);
  if (end == text || *end != '\0' || value <= 0)
    return -1;

  // Round-trip check rejects values that do not fit in an int.
  const int narrowed = static_cast<int>(value);
  if (static_cast<long>(narrowed) != value)
    return -1;

  return narrowed;
}

/**
 * Parse the benchmark's command-line options.
 *
 * Defaults are SI=false, N=1<<26, blockSize=192. "-h" prints the help
 * text and exits with status 0. An unknown option, or a "-n"/"-b" value
 * that is not a positive integer, prints the help text and exits with
 * status 1. Rejecting a zero block size matters because main divides by
 * it when sizing the grid.
 *
 * @param argc      argument count as passed to main
 * @param argv      argument vector as passed to main
 * @param SI        set to true when "-s" is given
 * @param N         receives the number of array elements
 * @param blockSize receives the number of threads per block
 */
void parse_options(int argc, char** argv, bool& SI, int& N, int& blockSize)
{
  // Default values
  SI = false;
  N = 1<<26;
  blockSize = 192;

  int c;

  while ((c = getopt (argc, argv, "sn:b:h")) != -1)
    switch (c)
    {
      case 's':
        SI = true;
        break;
      case 'n':
        N = parse_positive_int(optarg);
        if (N < 0)
        {
          print_help();
          std::exit(1);
        }
        break;
      case 'b':
        blockSize = parse_positive_int(optarg);
        if (blockSize < 0)
        {
          print_help();
          std::exit(1);
        }
        break;
      case 'h':
        print_help();
        std::exit(0);
        break;
      default:
        print_help();
        std::exit(1);
    }
}

/* A gettimeofday routine to give access to the wall
   clock timer on most UNIX-like systems. */

/**
 * @return current wall-clock time in seconds, with the microsecond
 *         field of gettimeofday added as the fractional part
 */
double mysecond()
{
  struct timeval tp;
  // The timezone argument is obsolete; a null pointer is passed instead.
  gettimeofday(&tp, nullptr);
  return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}
159 | parse_options(argc, argv, SI, N, blockSize); 160 | 161 | printf(" STREAM Benchmark implementation in HIP\n"); 162 | printf(" Array size (%s precision) =%7.2f MB\n", sizeof(double)==sizeof(real)?"double":"single", double(N)*double(sizeof(real))/1.e6); 163 | 164 | /* Allocate memory on device */ 165 | hipMalloc((void**)&d_a, sizeof(real)*N); 166 | hipMalloc((void**)&d_b, sizeof(real)*N); 167 | hipMalloc((void**)&d_c, sizeof(real)*N); 168 | 169 | /* Compute execution configuration */ 170 | dim3 dimBlock(blockSize); 171 | dim3 dimGrid(N/dimBlock.x ); 172 | if( N % dimBlock.x != 0 ) dimGrid.x+=1; 173 | 174 | printf(" using %d threads per block, %d blocks\n",dimBlock.x,dimGrid.x); 175 | 176 | if (SI) 177 | printf(" output in SI units (KB = 1000 B)\n"); 178 | else 179 | printf(" output in IEC units (KiB = 1024 B)\n"); 180 | 181 | /* Initialize memory on the device */ 182 | hipLaunchKernelGGL(set_array, dim3(dimGrid), dim3(dimBlock), 0, 0, d_a, 2.f, N); 183 | hipLaunchKernelGGL(set_array, dim3(dimGrid), dim3(dimBlock), 0, 0, d_b, .5f, N); 184 | hipLaunchKernelGGL(set_array, dim3(dimGrid), dim3(dimBlock), 0, 0, d_c, .5f, N); 185 | 186 | /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 187 | 188 | scalar=3.0f; 189 | for (k=0; k, dim3(dimGrid), dim3(dimBlock), 0, 0, d_a, d_c, N); 193 | hipDeviceSynchronize(); 194 | times[0][k]= mysecond() - times[0][k]; 195 | 196 | times[1][k]= mysecond(); 197 | hipLaunchKernelGGL(STREAM_Scale, dim3(dimGrid), dim3(dimBlock), 0, 0, d_b, d_c, scalar, N); 198 | hipDeviceSynchronize(); 199 | times[1][k]= mysecond() - times[1][k]; 200 | 201 | times[2][k]= mysecond(); 202 | hipLaunchKernelGGL(STREAM_Add, dim3(dimGrid), dim3(dimBlock), 0, 0, d_a, d_b, d_c, N); 203 | hipDeviceSynchronize(); 204 | times[2][k]= mysecond() - times[2][k]; 205 | 206 | times[3][k]= mysecond(); 207 | hipLaunchKernelGGL(STREAM_Triad, dim3(dimGrid), dim3(dimBlock), 0, 0, d_b, d_c, d_a, scalar, N); 208 | hipDeviceSynchronize(); 209 | times[3][k]= mysecond() - 
times[3][k]; 210 | } 211 | 212 | /* --- SUMMARY --- */ 213 | 214 | for (k=1; k(1<<30); 233 | 234 | printf("\nFunction Rate %s Avg time(s) Min time(s) Max time(s)\n", 235 | SI ? "(GB/s) " : "(GiB/s)" ); 236 | printf("-----------------------------------------------------------------\n"); 237 | for (j=0; j<4; j++) { 238 | avgtime[j] = avgtime[j]/(double)(NTIMES-1); 239 | 240 | printf("%s%11.4f %11.8f %11.8f %11.8f\n", label[j].c_str(), 241 | bytes[j]/mintime[j] / G, 242 | avgtime[j], 243 | mintime[j], 244 | maxtime[j]); 245 | } 246 | 247 | 248 | /* Free memory on device */ 249 | hipFree(d_a); 250 | hipFree(d_b); 251 | hipFree(d_c); 252 | } 253 | 254 | -------------------------------------------------------------------------------- /gpu-burn/AmdGpuMonitor.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Public domain. No warranty. 3 | * Ville Timonen 2013 4 | * edited by Timmy Liu for HIP API 01/2016 5 | */ 6 | 7 | #include 8 | #include 9 | #include "AmdGpuMonitor.h" 10 | 11 | // --------------------------------------------------------------------------- 12 | namespace gpuburn { 13 | 14 | AmdGpuMonitor::AmdGpuMonitor(int id, std::string hwmon) 15 | : GpuMonitor(id), mHwmonPath(hwmon) 16 | { 17 | } 18 | 19 | AmdGpuMonitor::~AmdGpuMonitor() 20 | { 21 | } 22 | 23 | // --------------------------------------------------------------------------- 24 | 25 | float AmdGpuMonitor::getTemperature() 26 | { 27 | float gpuTemp = -1; 28 | 29 | std::ifstream tempFile((mHwmonPath + "/temp1_input").c_str()); 30 | if (tempFile.is_open()) { 31 | tempFile >> gpuTemp; 32 | tempFile.close(); 33 | } 34 | 35 | // Hwmon exposes temperatures in milliCelcius 36 | return gpuTemp / 1000.0f; 37 | } 38 | 39 | }; //namespace gpuburn 40 | -------------------------------------------------------------------------------- /gpu-burn/AmdGpuMonitor.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Public domain. 
namespace gpuburn {

/**
 * GpuMonitor implementation that obtains the GPU temperature from a
 * hwmon directory supplied at construction time.
 */
class AmdGpuMonitor : public GpuMonitor {
public:
    /**
     * Initialize an AmdGpuMonitor instance
     *
     * @id is passed through to the GpuMonitor base class
     *     (NOTE(review): presumably the GPU/device index - confirm in GpuMonitor.h)
     * @hwmonPath is the kernel hwmon resource associated to this GPU
     */
    AmdGpuMonitor(int id, std::string hwmonPath);
    virtual ~AmdGpuMonitor();

    /**
     * Return the current temperature reading for this GPU.
     * NOTE(review): unit and failure value are defined by the
     * implementation in AmdGpuMonitor.cpp - check there before relying
     * on a specific sentinel.
     */
    virtual float getTemperature();

private:
    // hwmon directory given to the constructor
    std::string mHwmonPath;
};

}; // namespace gpuburn
3 | * Ville Timonen 2013 4 | * edited by Timmy Liu for HIP API 01/2016 5 | */ 6 | 7 | #include 8 | #include 9 | #include "hip/hip_runtime.h" 10 | 11 | #include "common.h" 12 | #include "BurnKernel.h" 13 | 14 | // --------------------------------------------------------------------------- 15 | namespace gpuburn { 16 | 17 | constexpr int BurnKernel::cRandSeed; 18 | constexpr float BurnKernel::cUseMem; 19 | constexpr uint32_t BurnKernel::cRowSize; 20 | constexpr uint32_t BurnKernel::cMatrixSize; 21 | constexpr uint32_t BurnKernel::cBlockSize; 22 | constexpr float BurnKernel::cAlpha; 23 | constexpr float BurnKernel::cBeta; 24 | 25 | BurnKernel::BurnKernel(int hipDevice) 26 | : mHipDevice(hipDevice), mRunKernel(false), 27 | mDeviceAdata(NULL), mDeviceBdata(NULL), mDeviceCdata(NULL) 28 | { 29 | } 30 | 31 | BurnKernel::~BurnKernel() 32 | { 33 | if (mBurnThread) 34 | mBurnThread->join(); 35 | 36 | if (mDeviceAdata) 37 | hipFree(mDeviceAdata); 38 | 39 | if (mDeviceBdata) 40 | hipFree(mDeviceBdata); 41 | 42 | if (mDeviceCdata) 43 | hipFree(mDeviceCdata); 44 | } 45 | 46 | // --------------------------------------------------------------------------- 47 | 48 | extern "C" __global__ void hip_sgemm_kernel(const int M, 49 | const int N, const int K, 50 | const float alpha, 51 | float *A, const int lda, float *B, 52 | const int ldb, const float beta, 53 | float *C, const int ldc) 54 | { 55 | //column major NN 56 | size_t idx_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; 57 | size_t idx_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; 58 | size_t dim_x = hipGridDim_x * hipBlockDim_x; 59 | size_t myIdx = idx_y * dim_x + idx_x; 60 | 61 | float local_c = beta * C[myIdx]; 62 | 63 | for(int k = 0; k < K; k++) { 64 | local_c += alpha * A[ idx_y + k * K] * B[ idx_x * K + k]; 65 | } 66 | 67 | C[myIdx] = local_c; 68 | } 69 | 70 | // --------------------------------------------------------------------------- 71 | 72 | int BurnKernel::Init() 73 | { 74 | int hipDevice = 
bindHipDevice(); 75 | 76 | std::string msg = "Init Burn Thread for device (" + std::to_string(hipDevice) + ")\n"; 77 | std::cout << msg; 78 | 79 | srand(cRandSeed); 80 | for (int i = 0; i < cMatrixSize; ++i) { 81 | mHostAdata[i] = (rand() % 1000000)/100000.0; 82 | mHostBdata[i] = (rand() % 1000000)/100000.0; 83 | } 84 | 85 | size_t freeMem = getAvailableMemory() * cUseMem; 86 | size_t matrixSizeBytes = sizeof(float)*cMatrixSize; 87 | mNumIterations = (freeMem - (matrixSizeBytes*2))/matrixSizeBytes; 88 | 89 | checkError(hipMalloc((void**)&mDeviceAdata, matrixSizeBytes), "Alloc A"); 90 | checkError(hipMalloc((void**)&mDeviceBdata, matrixSizeBytes), "Alloc B"); 91 | checkError(hipMalloc((void**)&mDeviceCdata, matrixSizeBytes*mNumIterations), "Alloc C"); 92 | 93 | checkError(hipMemcpy(mDeviceAdata, mHostAdata, matrixSizeBytes, hipMemcpyHostToDevice), "A -> device"); 94 | checkError(hipMemcpy(mDeviceBdata, mHostBdata, matrixSizeBytes, hipMemcpyHostToDevice), "B -> device"); 95 | checkError(hipMemset(mDeviceCdata, 0, matrixSizeBytes*mNumIterations), "C memset"); 96 | 97 | return 0; 98 | } 99 | 100 | int BurnKernel::startBurn() 101 | { 102 | mRunKernel = true; 103 | 104 | mBurnThread = make_unique(&BurnKernel::threadMain, this); 105 | return 0; 106 | } 107 | 108 | int BurnKernel::threadMain() 109 | { 110 | int err = 0; 111 | int hipDevice = bindHipDevice(); 112 | std::string msg = "Burn Thread using device (" + std::to_string(hipDevice) + ")\n"; 113 | std::cout << msg; 114 | 115 | while (mRunKernel) { 116 | err = runComputeKernel(); 117 | } 118 | 119 | return err; 120 | } 121 | 122 | int BurnKernel::stopBurn() 123 | { 124 | int hipDevice = bindHipDevice(); 125 | 126 | std::string msg = "Stopping burn thread on device (" + std::to_string(hipDevice) + ")\n"; 127 | std::cout << msg; 128 | 129 | mRunKernel = false; 130 | return 0; 131 | } 132 | 133 | int BurnKernel::bindHipDevice() 134 | { 135 | int hipDevice = -1; 136 | hipSetDevice(mHipDevice); 137 | 
hipGetDevice(&hipDevice); 138 | return hipDevice; 139 | } 140 | /* Queue one SGEMM launch per C slice (stopping early once mRunKernel is cleared), then wait for the device. Always returns 0; a failed synchronize is reported by checkError. */ 141 | int BurnKernel::runComputeKernel() 142 | { 143 | int err = 0; 144 | 145 | for (int i = 0; mRunKernel && i < mNumIterations; ++i) { 146 | hipLaunchKernelGGL( 147 | /* Launch params */ 148 | hip_sgemm_kernel, 149 | dim3(cRowSize/cBlockSize, cRowSize/cBlockSize, 1), 150 | dim3(cBlockSize,cBlockSize,1), 0, 0, 151 | /* Kernel params */ 152 | cRowSize, cRowSize, cRowSize, cAlpha, 153 | mDeviceAdata, cRowSize, 154 | mDeviceBdata, cRowSize, 155 | cBeta, 156 | /* offset computed in size_t: int * uint32_t is 32-bit and wraps once i*cMatrixSize reaches 2^32 */ mDeviceCdata + static_cast<size_t>(i)*cMatrixSize, 157 | cRowSize); 158 | } 159 | checkError(hipDeviceSynchronize(), "Sync"); 160 | 161 | return err; 162 | } 163 | 164 | size_t BurnKernel::getAvailableMemory() 165 | { 166 | size_t freeMem, totalMem; 167 | checkError(hipMemGetInfo(&freeMem, &totalMem)); 168 | return freeMem; 169 | } 170 | 171 | }; //namespace gpuburn 172 | -------------------------------------------------------------------------------- /gpu-burn/BurnKernel.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Public domain. No warranty.
3 | * Ville Timonen 2013 4 | * edited by Timmy Liu for HIP API 01/2016 5 | */ 6 | 7 | #ifndef GPUBURN_BURNKERNEL_H_ 8 | #define GPUBURN_BURNKERNEL_H_ 9 | 10 | #include 11 | 12 | // --------------------------------------------------------------------------- 13 | namespace gpuburn { 14 | 15 | /** 16 | * The Gpu class abstracts interactions with the hardware 17 | */ 18 | class BurnKernel { 19 | public: 20 | BurnKernel(int hipDevice); 21 | ~BurnKernel(); 22 | 23 | int mHipDevice; 24 | 25 | int Init(); 26 | 27 | /** 28 | * Run a stress workload on mHipDevice 29 | */ 30 | int startBurn(); 31 | 32 | /** 33 | * Stop the stress workload 34 | */ 35 | int stopBurn(); 36 | 37 | private: 38 | static constexpr int cRandSeed = 10; 39 | static constexpr float cUseMem = 0.80; 40 | static constexpr uint32_t cRowSize = 512; 41 | static constexpr uint32_t cMatrixSize = cRowSize * cRowSize; 42 | static constexpr uint32_t cBlockSize = 16; 43 | static constexpr float cAlpha = 1.0f; 44 | static constexpr float cBeta = 0.0f; 45 | 46 | float mHostAdata[cMatrixSize]; 47 | float mHostBdata[cMatrixSize]; 48 | 49 | float* mDeviceAdata; 50 | float* mDeviceBdata; 51 | float* mDeviceCdata; 52 | 53 | bool mRunKernel; 54 | int mNumIterations; 55 | 56 | std::unique_ptr mBurnThread; 57 | 58 | int bindHipDevice(); 59 | int threadMain(); 60 | int runComputeKernel(); 61 | size_t getAvailableMemory(); 62 | 63 | }; 64 | 65 | }; // namespace gpuburn 66 | 67 | // --------------------------------------------------------------------------- 68 | 69 | #endif // GPUBURN_BURNKERNEL_H_ 70 | -------------------------------------------------------------------------------- /gpu-burn/GpuMonitor.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Public domain. No warranty. 
3 | * Ville Timonen 2013 4 | * edited by Timmy Liu for HIP API 01/2016 5 | */ 6 | 7 | #ifndef GPUBURN_GPUMONITOR_H_ 8 | #define GPUBURN_GPUMONITOR_H_ 9 | 10 | // --------------------------------------------------------------------------- 11 | namespace gpuburn { 12 | 13 | /** 14 | * The GpuMonitor provides a generic interface to access common 15 | * GPU hardware data 16 | */ 17 | class GpuMonitor { 18 | public: 19 | virtual ~GpuMonitor() {}; 20 | 21 | 22 | /** 23 | * Retrieve the current temperature in degrees Celsius 24 | * for this device. 25 | */ 26 | virtual float getTemperature() = 0; 27 | 28 | /** 29 | * Retrieve the identifier this monitor was constructed with 30 | * (the id passed to the GpuMonitor constructor). 31 | */ 32 | virtual int getId() { return mId; } 33 | 34 | protected: 35 | GpuMonitor(int id) : mId(id) {}; 36 | 37 | private: 38 | int mId; 39 | }; 40 | 41 | }; // namespace gpuburn 42 | 43 | // --------------------------------------------------------------------------- 44 | 45 | #endif // GPUBURN_GPUMONITOR_H_ 46 | -------------------------------------------------------------------------------- /gpu-burn/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | 3 | HIP_PLATFORM = $(shell $(HIP_PATH)/bin/hipconfig --platform) 4 | 5 | HIP_INCLUDE = -I${HIP_PATH}/../include 6 | 7 | BUILD_DIR ?= build 8 | 9 | HIPCC = $(HIP_PATH)/bin/hipcc 10 | CPPFLAGS = -O3 11 | LDFLAGS = -lm -lpthread 12 | 13 | ifeq (${HIP_PLATFORM}, nvcc) 14 | CPPFLAGS += -arch=compute_20 15 | endif 16 | 17 | GPUBURN_SRC = $(wildcard *.cpp) 18 | GPUBURN_OBJ = $(addprefix ${BUILD_DIR}/,$(subst .cpp,.o, $(GPUBURN_SRC))) 19 | GPUBURN_BIN = ${BUILD_DIR}/gpuburn-hip 20 | 21 | .PHONY: all clean run itburn 22 | 23 | all: ${GPUBURN_BIN} 24 | 25 | ${GPUBURN_BIN}: ${GPUBURN_OBJ} 26 | ${HIPCC} ${LDFLAGS} -o ${GPUBURN_BIN} ${GPUBURN_OBJ} 27 | 28 | ${BUILD_DIR}/%.o: %.cpp Makefile 29 | mkdir -p ${BUILD_DIR} 30 | ${HIPCC} ${HIP_INCLUDE} ${CPPFLAGS} -c -o $@
$< 31 | 32 | run: itburn 33 | itburn: 34 | HCC_LAZYINIT=ON ${GPUBURN_BIN} 35 | 36 | clean: 37 | rm -rf ${BUILD_DIR} 38 | -------------------------------------------------------------------------------- /gpu-burn/common.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Public domain. No warranty. 3 | * Ville Timonen 2013 4 | * edited by Timmy Liu for HIP API 01/2016 5 | */ 6 | 7 | #include "hip/hip_runtime.h" 8 | #include "common.h" 9 | 10 | // --------------------------------------------------------------------------- 11 | namespace gpuburn { 12 | 13 | int checkError(hipError_t err, std::string desc) 14 | { 15 | if (err == hipSuccess) 16 | return 0; 17 | 18 | std::string errStr = hipGetErrorString(err); 19 | std::string errorMessage = ""; 20 | if (desc == "") 21 | throw "Error: " + errStr + "\n"; 22 | else 23 | throw "Error in \"" + desc + "\": " + errStr + "\n"; 24 | 25 | return err; 26 | } 27 | 28 | }; // namespace common 29 | 30 | // --------------------------------------------------------------------------- 31 | -------------------------------------------------------------------------------- /gpu-burn/common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Public domain. No warranty. 3 | * Ville Timonen 2013 4 | * edited by Timmy Liu for HIP API 01/2016 5 | */ 6 | 7 | #ifndef GPUBURN_COMMON_H_ 8 | #define GPUBURN_COMMON_H_ 9 | 10 | // --------------------------------------------------------------------------- 11 | namespace gpuburn { 12 | 13 | /** 14 | * c++11 doesn't support make_unique, which is very convenient 15 | * Refer to: https://herbsutter.com/gotw/_102/ 16 | */ 17 | template 18 | std::unique_ptr make_unique(Args&&... 
args) 19 | { 20 | return std::unique_ptr(new T(std::forward(args)...)); 21 | } 22 | 23 | int checkError(hipError_t err, std::string desc = ""); 24 | 25 | }; // namespace common 26 | 27 | // --------------------------------------------------------------------------- 28 | 29 | #endif // GPUBURN_COMMON_H_ 30 | -------------------------------------------------------------------------------- /gpu-burn/gpuburn.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Public domain. No warranty. 3 | * Ville Timonen 2013 4 | * edited by Timmy Liu for HIP API 01/2016 5 | */ 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "hip/hip_runtime.h" 14 | 15 | #include "common.h" 16 | #include "AmdGpuMonitor.h" 17 | #include "BurnKernel.h" 18 | 19 | // --------------------------------------------------------------------------- 20 | using namespace gpuburn; 21 | 22 | std::vector> genBurnKernels() 23 | { 24 | int deviceCount = 0; 25 | std::vector> kernels; 26 | 27 | try { 28 | checkError(hipGetDeviceCount(&deviceCount)); 29 | std::cout<<"Total no. 
of GPUs found: "< kernel(new BurnKernel(i)); 37 | kernel->Init(); 38 | kernels.push_back(std::move(kernel)); 39 | } catch (std::string e) { 40 | std::cerr << e; 41 | std::cerr << "Error: failed to initialize hip device " << i << "\n"; 42 | } 43 | } 44 | 45 | return kernels; 46 | } 47 | 48 | std::vector> genGpuMonitors() 49 | { 50 | int deviceCount = 0; 51 | std::vector> monitors; 52 | 53 | for (int i = 0; true; i++) { 54 | struct stat dirInfo; 55 | std::string hwmonDir = "/sys/class/hwmon/hwmon" + std::to_string(i); 56 | 57 | if (stat(hwmonDir.c_str(), &dirInfo)) 58 | break; 59 | 60 | std::string hwmonName; 61 | std::ifstream hwmon(hwmonDir + "/name"); 62 | 63 | if (!hwmon.good()) 64 | continue; 65 | 66 | hwmon >> hwmonName; 67 | if (hwmonName == "amdgpu") { 68 | GpuMonitor* monitor = new AmdGpuMonitor(i, "/sys/class/hwmon/hwmon" + std::to_string(i)); 69 | std::unique_ptr uniq_monitor(monitor); 70 | monitors.push_back(std::move(uniq_monitor)); 71 | } 72 | } 73 | 74 | return monitors; 75 | } 76 | 77 | int doBurn(int burnSec) { 78 | std::vector> burnKernels = genBurnKernels(); 79 | std::vector> gpuMonitors = genGpuMonitors(); 80 | 81 | if (burnKernels.size() == 0) 82 | return -ENOENT; 83 | 84 | for (auto& kernel : burnKernels) { 85 | kernel->startBurn(); 86 | } 87 | 88 | for (; burnSec > 0; --burnSec) { 89 | std::ostringstream msg; 90 | msg << "Temps: "; 91 | for (auto& monitor : gpuMonitors) { 92 | msg << "[GPU" << monitor->getId() << ": " << monitor->getTemperature() << " C] "; 93 | } 94 | msg << burnSec << "s\n"; 95 | std::cout << msg.str(); 96 | sleep(1); 97 | } 98 | 99 | for (auto& kernel : burnKernels) { 100 | kernel->stopBurn(); 101 | } 102 | 103 | return 0; 104 | } 105 | 106 | int main(int argc, char **argv) { 107 | int opt; 108 | int burnSec = 10; 109 | 110 | while ((opt = getopt (argc, argv, "ht:")) != -1) 111 | switch (opt) 112 | { 113 | case 't': 114 | burnSec = atoi(optarg); 115 | break; 116 | case 'h': 117 | default: 118 | std::cerr << "Usage: " << 
argv[0] << " [-t sec]\n"; 119 | return -EINVAL; 120 | } 121 | 122 | return doBurn(burnSec); 123 | } 124 | 125 | 126 | // --------------------------------------------------------------------------- 127 | -------------------------------------------------------------------------------- /mini-nbody/README.md: -------------------------------------------------------------------------------- 1 | mini-nbody: A simple N-body Code 2 | ================================ 3 | 4 | A simple gravitational N-body simulation in less than 100 lines of C code, with CUDA optimizations. 5 | 6 | Benchmarks 7 | ---------- 8 | 9 | There are 5 different benchmarks provided for CUDA and MIC platforms. 10 | 11 | 1. nbody-orig : the original, unoptimized simulation (also for CPU) 12 | 2. nbody-soa : Conversion from array of structures (AOS) data layout to structure of arrays (SOA) data layout 13 | 3. nbody-flush : Flush denormals to zero (no code changes, just a command line option) 14 | 4. nbody-block : Cache blocking 15 | 5. nbody-unroll / nbody-align : platform specific final optimizations (loop unrolling in CUDA, and data alignment on MIC) 16 | 17 | Files 18 | ----- 19 | 20 | nbody.c : simple, unoptimized OpenMP C code 21 | timer.h : simple cross-OS timing code 22 | 23 | Each directory below includes scripts for building and running a "shmoo" of five successive optimizations of the code over a range of data sizes from 1024 to 524,288 bodies. 24 | 25 | cuda/ : folder containing CUDA optimized versions of the original C code (in order of performance on Tesla K20c GPU) 26 | 1. nbody-orig.cu : a straight port of the code to CUDA (shmoo-cuda-nbody-orig.sh) 27 | 2. nbody-soa.cu : conversion to structure of arrays (SOA) data layout (shmoo-cuda-nbody-soa.sh) 28 | 3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-cuda-nbody-ftz.sh) 29 | 4. nbody-block.cu : cache blocking in CUDA shared memory (shmoo-cuda-nbody-block.sh) 30 | 5. 
nbody-unroll.cu : addition of "#pragma unroll" to inner loop (shmoo-cuda-nbody-unroll.sh) 31 | 32 | hip/ : folder containing HIP optimized versions of the original C code (in order of performance on FIJI NANO) 33 | 1. nbody-orig.cpp : a straight port of the code to HIP (HIP-nbody-orig.sh) 34 | 2. nbody-soa.cpp : conversion to structure of arrays (SOA) data layout (HIP-nbody-soa.sh) 35 | 3. nbody-block.cpp : cache blocking in HIP shared memory (HIP-nbody-block.sh) 36 | 37 | 38 | 39 | mic/ : folder containing Intel Xeon Phi (MIC) optimized versions of the original C code (in order of performance on Xeon Phi 7110P) 40 | 1. ../nbody.c : original code (shmoo-mic-nbody-orig.sh) 41 | 2. nbody-soa.c : conversion to structure of arrays (SOA) data layout (shmoo-mic-nbody-soa.sh) 42 | 3. nbody-soa.c + ftz : Enable flush denorms to zero (shmoo-mic-nbody-ftz.sh) 43 | 4. nbody-block.c : cache blocking via loop splitting (shmoo-mic-nbody-block.sh) 44 | 5. nbody-align.c : aligned memory allocation and vector access (shmoo-mic-nbody-align.sh) 45 | 46 | -------------------------------------------------------------------------------- /mini-nbody/cuda/nbody-block.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | #define BLOCK_SIZE 256 7 | #define SOFTENING 1e-9f 8 | 9 | typedef struct { float4 *pos, *vel; } BodySystem; 10 | 11 | void randomizeBodies(float *data, int n) { 12 | for (int i = 0; i < n; i++) { 13 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 14 | } 15 | } 16 | 17 | __global__ 18 | void bodyForce(float4 *p, float4 *v, float dt, int n) { 19 | int i = blockDim.x * blockIdx.x + threadIdx.x; 20 | if (i < n) { 21 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 22 | 23 | for (int tile = 0; tile < gridDim.x; tile++) { 24 | __shared__ float3 spos[BLOCK_SIZE]; 25 | float4 tpos = p[tile * blockDim.x + threadIdx.x]; 26 | spos[threadIdx.x] =
make_float3(tpos.x, tpos.y, tpos.z); 27 | __syncthreads(); 28 | 29 | for (int j = 0; j < BLOCK_SIZE; j++) { 30 | float dx = spos[j].x - p[i].x; 31 | float dy = spos[j].y - p[i].y; 32 | float dz = spos[j].z - p[i].z; 33 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 34 | float invDist = rsqrtf(distSqr); 35 | float invDist3 = invDist * invDist * invDist; 36 | 37 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 38 | } 39 | __syncthreads(); 40 | } 41 | 42 | v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; 43 | } 44 | } 45 | 46 | int main(const int argc, const char** argv) { 47 | 48 | int nBodies = 30000; 49 | if (argc > 1) nBodies = atoi(argv[1]); 50 | 51 | const float dt = 0.01f; // time step 52 | const int nIters = 10; // simulation iterations 53 | 54 | int bytes = 2*nBodies*sizeof(float4); 55 | float *buf = (float*)malloc(bytes); 56 | BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies }; 57 | 58 | randomizeBodies(buf, 8*nBodies); // Init pos / vel data 59 | 60 | float *d_buf; 61 | cudaMalloc(&d_buf, bytes); 62 | BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; 63 | 64 | int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; 65 | double totalTime = 0.0; 66 | 67 | for (int iter = 1; iter <= nIters; iter++) { 68 | StartTimer(); 69 | 70 | cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); 71 | bodyForce<<>>(d_p.pos, d_p.vel, dt, nBodies); 72 | cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); 73 | 74 | for (int i = 0 ; i < nBodies; i++) { // integrate position 75 | p.pos[i].x += p.vel[i].x*dt; 76 | p.pos[i].y += p.vel[i].y*dt; 77 | p.pos[i].z += p.vel[i].z*dt; 78 | } 79 | 80 | const double tElapsed = GetTimer() / 1000.0; 81 | if (iter > 1) { // First iter is warm up 82 | totalTime += tElapsed; 83 | } 84 | #ifndef SHMOO 85 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 86 | #endif 87 | } 88 | double avgTime = totalTime / (double)(nIters-1); 89 | 90 | #ifdef SHMOO 91 | printf("%d, %0.3f\n", nBodies, 1e-9 * 
nBodies * nBodies / avgTime); 92 | #else 93 | printf("Average rate for iterations 2 through %d: %.3f steps per second.\n", 94 | nIters, 1.0 / avgTime); /* steps per second = 1/avgTime; the old call passed an undeclared 'rate' and had no argument for "+- %.3f" */ 95 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 96 | #endif 97 | free(buf); 98 | cudaFree(d_buf); 99 | } 100 | -------------------------------------------------------------------------------- /mini-nbody/cuda/nbody-orig.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | #define BLOCK_SIZE 256 7 | #define SOFTENING 1e-9f 8 | 9 | typedef struct { float x, y, z, vx, vy, vz; } Body; 10 | 11 | void randomizeBodies(float *data, int n) { 12 | for (int i = 0; i < n; i++) { 13 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 14 | } 15 | } 16 | 17 | __global__ 18 | void bodyForce(Body *p, float dt, int n) { 19 | int i = blockDim.x * blockIdx.x + threadIdx.x; 20 | if (i < n) { 21 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 22 | 23 | for (int j = 0; j < n; j++) { 24 | float dx = p[j].x - p[i].x; 25 | float dy = p[j].y - p[i].y; 26 | float dz = p[j].z - p[i].z; 27 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 28 | float invDist = rsqrtf(distSqr); 29 | float invDist3 = invDist * invDist * invDist; 30 | 31 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 32 | } 33 | 34 | p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz; 35 | } 36 | } 37 | 38 | int main(const int argc, const char** argv) { 39 | 40 | int nBodies = 30000; 41 | if (argc > 1) nBodies = atoi(argv[1]); 42 | 43 | const float dt = 0.01f; // time step 44 | const int nIters = 10; // simulation iterations 45 | 46 | int bytes = nBodies*sizeof(Body); 47 | float *buf = (float*)malloc(bytes); 48 | Body *p = (Body*)buf; 49 | 50 | randomizeBodies(buf, 6*nBodies); // Init pos / vel data 51 | 52 | float *d_buf; 53 | cudaMalloc(&d_buf, bytes); 54 | Body *d_p = (Body*)d_buf;
55 | 56 | int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; 57 | double totalTime = 0.0; 58 | 59 | for (int iter = 1; iter <= nIters; iter++) { 60 | StartTimer(); 61 | 62 | cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); 63 | bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces 64 | cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); 65 | 66 | for (int i = 0 ; i < nBodies; i++) { // integrate position 67 | p[i].x += p[i].vx*dt; 68 | p[i].y += p[i].vy*dt; 69 | p[i].z += p[i].vz*dt; 70 | } 71 | 72 | const double tElapsed = GetTimer() / 1000.0; 73 | if (iter > 1) { // First iter is warm up 74 | totalTime += tElapsed; 75 | } 76 | #ifndef SHMOO 77 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 78 | #endif 79 | } 80 | double avgTime = totalTime / (double)(nIters-1); 81 | 82 | #ifdef SHMOO 83 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 84 | #else 85 | printf("Average rate for iterations 2 through %d: %.3f steps per second.\n", 86 | nIters, 1.0 / avgTime); /* steps per second = 1/avgTime; the old call passed an undeclared 'rate' and had no argument for "+- %.3f" */ 87 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 88 | #endif 89 | free(buf); 90 | cudaFree(d_buf); 91 | } 92 | -------------------------------------------------------------------------------- /mini-nbody/cuda/nbody-soa.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | #define BLOCK_SIZE 256 7 | #define SOFTENING 1e-9f 8 | 9 | typedef struct { float4 *pos, *vel; } BodySystem; 10 | 11 | void randomizeBodies(float *data, int n) { 12 | for (int i = 0; i < n; i++) { 13 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 14 | } 15 | } 16 | 17 | __global__ 18 | void bodyForce(float4 *p, float4 *v, float dt, int n) { 19 | int i = blockDim.x * blockIdx.x + threadIdx.x; 20 | if (i < n) { 21 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 22 | 23 | for (int j = 0; j < n; j++) { 24 | float dx =
p[j].x - p[i].x; 25 | float dy = p[j].y - p[i].y; 26 | float dz = p[j].z - p[i].z; 27 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 28 | float invDist = rsqrtf(distSqr); 29 | float invDist3 = invDist * invDist * invDist; 30 | 31 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 32 | } 33 | 34 | v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; 35 | } 36 | } 37 | 38 | int main(const int argc, const char** argv) { 39 | 40 | int nBodies = 30000; 41 | if (argc > 1) nBodies = atoi(argv[1]); 42 | 43 | const float dt = 0.01f; // time step 44 | const int nIters = 10; // simulation iterations 45 | 46 | int bytes = 2*nBodies*sizeof(float4); 47 | float *buf = (float*)malloc(bytes); 48 | BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies }; 49 | 50 | randomizeBodies(buf, 8*nBodies); // Init pos / vel data 51 | 52 | float *d_buf; 53 | cudaMalloc(&d_buf, bytes); 54 | BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; 55 | 56 | int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; 57 | double totalTime = 0.0; 58 | 59 | for (int iter = 1; iter <= nIters; iter++) { 60 | StartTimer(); 61 | 62 | cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); 63 | bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies); 64 | cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); 65 | 66 | for (int i = 0 ; i < nBodies; i++) { // integrate position 67 | p.pos[i].x += p.vel[i].x*dt; 68 | p.pos[i].y += p.vel[i].y*dt; 69 | p.pos[i].z += p.vel[i].z*dt; 70 | } 71 | 72 | const double tElapsed = GetTimer() / 1000.0; 73 | if (iter > 1) { // First iter is warm up 74 | totalTime += tElapsed; 75 | } 76 | #ifndef SHMOO 77 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 78 | #endif 79 | } 80 | double avgTime = totalTime / (double)(nIters-1); 81 | 82 | #ifdef SHMOO 83 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 84 | #else 85 | printf("Average rate for iterations 2 through %d: %.3f steps per second.\n", 86 | nIters, 1.0 / avgTime); /* steps per second = 1/avgTime; the old call passed an undeclared 'rate' and had no argument for "+- %.3f" */ 87 |
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 88 | #endif 89 | free(buf); 90 | cudaFree(d_buf); 91 | } 92 | -------------------------------------------------------------------------------- /mini-nbody/cuda/nbody-unroll.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | #define BLOCK_SIZE 256 7 | #define SOFTENING 1e-9f 8 | 9 | typedef struct { float4 *pos, *vel; } BodySystem; 10 | 11 | void randomizeBodies(float *data, int n) { 12 | for (int i = 0; i < n; i++) { 13 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 14 | } 15 | } 16 | 17 | __global__ 18 | void bodyForce(float4 *p, float4 *v, float dt, int n) { 19 | int i = blockDim.x * blockIdx.x + threadIdx.x; 20 | if (i < n) { 21 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 22 | 23 | for (int tile = 0; tile < gridDim.x; tile++) { 24 | __shared__ float3 spos[BLOCK_SIZE]; 25 | float4 tpos = p[tile * blockDim.x + threadIdx.x]; 26 | spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z); 27 | __syncthreads(); 28 | 29 | #pragma unroll 30 | for (int j = 0; j < BLOCK_SIZE; j++) { 31 | float dx = spos[j].x - p[i].x; 32 | float dy = spos[j].y - p[i].y; 33 | float dz = spos[j].z - p[i].z; 34 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 35 | float invDist = rsqrtf(distSqr); 36 | float invDist3 = invDist * invDist * invDist; 37 | 38 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 39 | } 40 | __syncthreads(); 41 | } 42 | 43 | v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; 44 | } 45 | } 46 | 47 | int main(const int argc, const char** argv) { 48 | 49 | int nBodies = 30000; 50 | if (argc > 1) nBodies = atoi(argv[1]); 51 | 52 | const float dt = 0.01f; // time step 53 | const int nIters = 10; // simulation iterations 54 | 55 | int bytes = 2*nBodies*sizeof(float4); 56 | float *buf = (float*)malloc(bytes); 57 | BodySystem p = 
{ (float4*)buf, ((float4*)buf) + nBodies }; 58 | 59 | randomizeBodies(buf, 8*nBodies); // Init pos / vel data 60 | 61 | float *d_buf; 62 | cudaMalloc(&d_buf, bytes); 63 | BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; 64 | 65 | int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; 66 | double totalTime = 0.0; 67 | 68 | for (int iter = 1; iter <= nIters; iter++) { 69 | StartTimer(); 70 | 71 | cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); 72 | bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies); 73 | cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); 74 | 75 | for (int i = 0 ; i < nBodies; i++) { // integrate position 76 | p.pos[i].x += p.vel[i].x*dt; 77 | p.pos[i].y += p.vel[i].y*dt; 78 | p.pos[i].z += p.vel[i].z*dt; 79 | } 80 | 81 | const double tElapsed = GetTimer() / 1000.0; 82 | if (iter > 1) { // First iter is warm up 83 | totalTime += tElapsed; 84 | } 85 | #ifndef SHMOO 86 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 87 | #endif 88 | } 89 | double avgTime = totalTime / (double)(nIters-1); 90 | 91 | #ifdef SHMOO 92 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 93 | #else 94 | printf("Average rate for iterations 2 through %d: %.3f steps per second.\n", 95 | nIters, 1.0 / avgTime); /* steps per second = 1/avgTime; the old call passed an undeclared 'rate' and had no argument for "+- %.3f" */ 96 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 97 | #endif 98 | free(buf); 99 | cudaFree(d_buf); 100 | } 101 | -------------------------------------------------------------------------------- /mini-nbody/cuda/shmoo-cuda-nbody-block.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-block.cu 2 | EXE=nbody-block 3 | 4 | nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /mini-nbody/cuda/shmoo-cuda-nbody-ftz.sh:
-------------------------------------------------------------------------------- 1 | SRC=nbody-soa.cu 2 | EXE=nbody-ftz 3 | 4 | nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /mini-nbody/cuda/shmoo-cuda-nbody-orig.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-orig.cu 2 | EXE=nbody-orig 3 | 4 | nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /mini-nbody/cuda/shmoo-cuda-nbody-soa.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-soa.cu 2 | EXE=nbody-soa 3 | 4 | nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /mini-nbody/cuda/shmoo-cuda-nbody-unroll.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-unroll.cu 2 | EXE=nbody-unroll 3 | 4 | nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /mini-nbody/hip/HIP-nbody-block.sh: -------------------------------------------------------------------------------- 1 | #Hipify the blocked cuda source code to hip compatible code 2 | #hipify nbody-block.cu > nbody-block.cpp 3 | #Manually add the first argument onto the kernel argument list 4 | #void bodyForce(Body *p, float dt, int n) //before modification 5 | #void 
bodyForce(hipLaunchParm lp, Body *p, float dt, int n) //after modification 6 | 7 | #compile the hipified source code into executable 8 | if [ -f nbody-block ] 9 | then 10 | rm nbody-block 11 | fi 12 | 13 | if [ -z "$HIP_PATH" ] 14 | then 15 | 16 | if [ -d /opt/rocm/hip ] 17 | then 18 | HIP_PATH=/opt/rocm/hip 19 | else 20 | HIP_PATH=/opt/rocm 21 | fi 22 | 23 | fi 24 | 25 | echo hipcc -I../ -DSHMOO nbody-block.cpp -o nbody-block 26 | $HIP_PATH/bin/hipcc -I../ -DSHMOO nbody-block.cpp -o nbody-block 27 | 28 | #To print our more details, remove DSHMOO flag 29 | #hipcc -I../ nbody-block.cpp -o nbody-block 30 | 31 | #execute the program 32 | EXE=nbody-block 33 | K=1024 34 | for i in {1..8} 35 | do 36 | echo ./$EXE $K 37 | ./$EXE $K 38 | K=$(($K*2)) 39 | done 40 | 41 | -------------------------------------------------------------------------------- /mini-nbody/hip/HIP-nbody-orig.sh: -------------------------------------------------------------------------------- 1 | #Hipify the original cuda source code to hip compatible code 2 | #hipify nbody-orig.cu > nbody-orig.cpp 3 | 4 | #compile the hipified source code into executable 5 | if [ -f nbody-orig ] 6 | then 7 | rm nbody-orig 8 | fi 9 | 10 | if [ -z "$HIP_PATH" ] 11 | then 12 | 13 | if [ -d /opt/rocm/hip ] 14 | then 15 | HIP_PATH=/opt/rocm/hip 16 | else 17 | HIP_PATH=/opt/rocm 18 | fi 19 | 20 | fi 21 | 22 | echo hipcc -I../ -DSHMOO nbody-orig.cpp -o nbody-orig 23 | $HIP_PATH/bin/hipcc -I../ -DSHMOO nbody-orig.cpp -o nbody-orig 24 | 25 | #To print our more details, remove flag 26 | #hipcc -I../ nbody-orig.cpp -o nbody-orig 27 | 28 | #execute the program 29 | 30 | EXE=nbody-orig 31 | K=1024 32 | for i in {1..10} 33 | do 34 | echo ./$EXE $K 35 | ./$EXE $K 36 | K=$(($K*2)) 37 | done 38 | 39 | -------------------------------------------------------------------------------- /mini-nbody/hip/HIP-nbody-soa.sh: -------------------------------------------------------------------------------- 1 | #Hipify the soa cuda source code to 
hip compatible code 2 | #hipify nbody-soa.cu > nbody-soa.cpp 3 | #Manually add the first argument onto the kernel argument list 4 | #void bodyForce(Body *p, float dt, int n) //before modification 5 | #void bodyForce(hipLaunchParm lp, Body *p, float dt, int n) //after modification 6 | 7 | #compile the hipified source code into executable 8 | if [ -f nbody-soa ] 9 | then 10 | rm nbody-soa 11 | fi 12 | 13 | if [ -z "$HIP_PATH" ] 14 | then 15 | 16 | if [ -d /opt/rocm/hip ] 17 | then 18 | HIP_PATH=/opt/rocm/hip 19 | else 20 | HIP_PATH=/opt/rocm 21 | fi 22 | 23 | fi 24 | 25 | echo hipcc -I../ -DSHMOO nbody-soa.cpp -o nbody-soa 26 | $HIP_PATH/bin/hipcc -I../ -DSHMOO nbody-soa.cpp -o nbody-soa 27 | 28 | #To print our more details, remove DSHMOO flag 29 | #hipcc -I../ nbody-soa.cpp -o nbody-soa 30 | 31 | #execute the program 32 | EXE=nbody-soa 33 | K=1024 34 | for i in {1..8} 35 | do 36 | echo ./$EXE $K 37 | ./$EXE $K 38 | K=$(($K*2)) 39 | done 40 | 41 | -------------------------------------------------------------------------------- /mini-nbody/hip/nbody-block.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | #include 3 | #include 4 | #include 5 | #include "timer.h" 6 | 7 | #define BLOCK_SIZE 256 8 | #define SOFTENING 1e-9f 9 | 10 | typedef struct { float4 *pos, *vel; } BodySystem; 11 | 12 | void randomizeBodies(float *data, int n) { 13 | for (int i = 0; i < n; i++) { 14 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 15 | } 16 | } 17 | 18 | __global__ 19 | void bodyForce(float4 *p, float4 *v, float dt, int n) { 20 | int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; 21 | if (i < n) { 22 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 23 | 24 | for (int tile = 0; tile < hipGridDim_x; tile++) { 25 | __shared__ float3 spos[BLOCK_SIZE]; 26 | float4 tpos = p[tile * hipBlockDim_x + hipThreadIdx_x]; 27 | spos[hipThreadIdx_x] = make_float3(tpos.x, tpos.y, tpos.z); 28 | __syncthreads(); 29 | 
30 | for (int j = 0; j < BLOCK_SIZE; j++) { 31 | float dx = spos[j].x - p[i].x; 32 | float dy = spos[j].y - p[i].y; 33 | float dz = spos[j].z - p[i].z; 34 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 35 | float invDist = 1.0f / sqrtf(distSqr); 36 | float invDist3 = invDist * invDist * invDist; 37 | 38 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 39 | } 40 | __syncthreads(); 41 | } 42 | 43 | v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; 44 | } 45 | } 46 | 47 | int main(const int argc, const char** argv) { 48 | 49 | int nBodies = 30000; 50 | if (argc > 1) nBodies = atoi(argv[1]); 51 | 52 | const float dt = 0.01f; // time step 53 | const int nIters = 10; // simulation iterations 54 | 55 | int bytes = 2*nBodies*sizeof(float4); 56 | float *buf = (float*)malloc(bytes); 57 | BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies }; 58 | 59 | randomizeBodies(buf, 8*nBodies); // Init pos / vel data 60 | 61 | float *d_buf; 62 | hipMalloc(&d_buf, bytes); 63 | BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; 64 | 65 | int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; 66 | double totalTime = 0.0; 67 | 68 | for (int iter = 1; iter <= nIters; iter++) { 69 | StartTimer(); 70 | 71 | hipMemcpy(d_buf, buf, bytes, hipMemcpyHostToDevice); 72 | hipLaunchKernelGGL(bodyForce, dim3(nBlocks), dim3(BLOCK_SIZE), 0, 0, d_p.pos, d_p.vel, dt, nBodies); 73 | hipMemcpy(buf, d_buf, bytes, hipMemcpyDeviceToHost); 74 | 75 | for (int i = 0 ; i < nBodies; i++) { // integrate position 76 | p.pos[i].x += p.vel[i].x*dt; 77 | p.pos[i].y += p.vel[i].y*dt; 78 | p.pos[i].z += p.vel[i].z*dt; 79 | } 80 | 81 | const double tElapsed = GetTimer() / 1000.0; 82 | if (iter > 1) { // First iter is warm up 83 | totalTime += tElapsed; 84 | } 85 | #ifndef SHMOO 86 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 87 | #endif 88 | } 89 | double avgTime = totalTime / (double)(nIters-1); 90 | 91 | #ifdef SHMOO 92 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * 
nBodies / avgTime); 93 | #else 94 | printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", 95 | nIters, rate); 96 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 97 | #endif 98 | free(buf); 99 | hipFree(d_buf); 100 | } 101 | -------------------------------------------------------------------------------- /mini-nbody/hip/nbody-orig.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | #include 3 | #include 4 | #include 5 | #include "timer.h" 6 | 7 | #define BLOCK_SIZE 256 8 | #define SOFTENING 1e-9f 9 | 10 | typedef struct { float x, y, z, vx, vy, vz; } Body; 11 | 12 | void randomizeBodies(float *data, int n) { 13 | for (int i = 0; i < n; i++) { 14 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 15 | } 16 | } 17 | 18 | //inline float rsqrtf(float x){ 19 | //return 1.0f / sqrtf(x); 20 | //}//host implementation of cuda function for rsqrtf 21 | 22 | __global__ 23 | void bodyForce(Body *p, float dt, int n) { 24 | int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; 25 | if (i < n) { 26 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 27 | 28 | for (int j = 0; j < n; j++) { 29 | float dx = p[j].x - p[i].x; 30 | float dy = p[j].y - p[i].y; 31 | float dz = p[j].z - p[i].z; 32 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 33 | float invDist = 1.0f / sqrtf(distSqr); 34 | //float invDist = rsqrtf(distSqr); 35 | float invDist3 = invDist * invDist * invDist; 36 | 37 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 38 | } 39 | 40 | p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz; 41 | } 42 | } 43 | 44 | int main(const int argc, const char** argv) { 45 | 46 | int nBodies = 30000; 47 | if (argc > 1) nBodies = atoi(argv[1]); 48 | 49 | const float dt = 0.01f; // time step 50 | const int nIters = 10; // simulation iterations 51 | 52 | int bytes = nBodies*sizeof(Body); 53 | float 
*buf = (float*)malloc(bytes); 54 | Body *p = (Body*)buf; 55 | 56 | randomizeBodies(buf, 6*nBodies); // Init pos / vel data 57 | 58 | float *d_buf; 59 | hipMalloc(&d_buf, bytes); 60 | Body *d_p = (Body*)d_buf; 61 | 62 | int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; 63 | double totalTime = 0.0; 64 | 65 | for (int iter = 1; iter <= nIters; iter++) { 66 | StartTimer(); 67 | 68 | hipMemcpy(d_buf, buf, bytes, hipMemcpyHostToDevice); 69 | hipLaunchKernelGGL(bodyForce, dim3(nBlocks), dim3(BLOCK_SIZE), 0, 0, d_p, dt, nBodies); // compute interbody forces 70 | hipMemcpy(buf, d_buf, bytes, hipMemcpyDeviceToHost); 71 | 72 | for (int i = 0 ; i < nBodies; i++) { // integrate position 73 | p[i].x += p[i].vx*dt; 74 | p[i].y += p[i].vy*dt; 75 | p[i].z += p[i].vz*dt; 76 | } 77 | 78 | const double tElapsed = GetTimer() / 1000.0; 79 | if (iter > 1) { // First iter is warm up 80 | totalTime += tElapsed; 81 | } 82 | #ifndef SHMOO 83 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 84 | #endif 85 | } 86 | double avgTime = totalTime / (double)(nIters-1); 87 | 88 | #ifdef SHMOO 89 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 90 | #else 91 | //printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", 92 | //nIters, rate); 93 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 94 | #endif 95 | free(buf); 96 | hipFree(d_buf); 97 | } 98 | -------------------------------------------------------------------------------- /mini-nbody/hip/nbody-soa.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | #include 3 | #include 4 | #include 5 | #include "timer.h" 6 | 7 | #define BLOCK_SIZE 256 8 | #define SOFTENING 1e-9f 9 | 10 | typedef struct { float4 *pos, *vel; } BodySystem; 11 | 12 | void randomizeBodies(float *data, int n) { 13 | for (int i = 0; i < n; i++) { 14 | data[i] = 2.0f * (rand() / 
(float)RAND_MAX) - 1.0f; 15 | } 16 | } 17 | 18 | __global__ 19 | void bodyForce(float4 *p, float4 *v, float dt, int n) { 20 | int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; 21 | if (i < n) { 22 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 23 | 24 | for (int j = 0; j < n; j++) { 25 | float dx = p[j].x - p[i].x; 26 | float dy = p[j].y - p[i].y; 27 | float dz = p[j].z - p[i].z; 28 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 29 | /*float invDist = rsqrtf(distSqr);*/ 30 | float invDist = 1.0f / sqrtf(distSqr); 31 | float invDist3 = invDist * invDist * invDist; 32 | 33 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 34 | } 35 | 36 | v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; 37 | } 38 | } 39 | 40 | int main(const int argc, const char** argv) { 41 | 42 | int nBodies = 30000; 43 | if (argc > 1) nBodies = atoi(argv[1]); 44 | 45 | const float dt = 0.01f; // time step 46 | const int nIters = 10; // simulation iterations 47 | 48 | int bytes = 2*nBodies*sizeof(float4); 49 | float *buf = (float*)malloc(bytes); 50 | BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies }; 51 | 52 | randomizeBodies(buf, 8*nBodies); // Init pos / vel data 53 | 54 | float *d_buf; 55 | hipMalloc(&d_buf, bytes); 56 | BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; 57 | 58 | int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; 59 | double totalTime = 0.0; 60 | 61 | for (int iter = 1; iter <= nIters; iter++) { 62 | StartTimer(); 63 | 64 | hipMemcpy(d_buf, buf, bytes, hipMemcpyHostToDevice); 65 | hipLaunchKernelGGL(bodyForce, dim3(nBlocks), dim3(BLOCK_SIZE), 0, 0, d_p.pos, d_p.vel, dt, nBodies); 66 | hipMemcpy(buf, d_buf, bytes, hipMemcpyDeviceToHost); 67 | 68 | for (int i = 0 ; i < nBodies; i++) { // integrate position 69 | p.pos[i].x += p.vel[i].x*dt; 70 | p.pos[i].y += p.vel[i].y*dt; 71 | p.pos[i].z += p.vel[i].z*dt; 72 | } 73 | 74 | const double tElapsed = GetTimer() / 1000.0; 75 | if (iter > 1) { // First iter is warm up 76 | 
totalTime += tElapsed; 77 | } 78 | #ifndef SHMOO 79 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 80 | #endif 81 | } 82 | double avgTime = totalTime / (double)(nIters-1); 83 | 84 | #ifdef SHMOO 85 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 86 | #else 87 | //printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", 88 | //nIters, rate); (disabled: 'rate' is never declared, which broke non-SHMOO builds) 89 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 90 | #endif 91 | free(buf); 92 | hipFree(d_buf); 93 | } 94 | -------------------------------------------------------------------------------- /mini-nbody/mic/nbody-align.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | #define CACHELINE 64 // size of cache line [bytes] 7 | #define SOFTENING 1e-9f 8 | 9 | typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem; 10 | 11 | void randomizeBodies(float *data, int n) { 12 | for (int i = 0; i < n; i++) { 13 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 14 | } 15 | } 16 | 17 | 18 | void bodyForce(BodySystem p, float dt, int n, int tileSize) { 19 | for (int tile = 0; tile < n; tile += tileSize) { 20 | int to = tile + tileSize; 21 | if (to > n) to = n; 22 | 23 | #pragma omp parallel for schedule(dynamic) 24 | for (int i = 0; i < n; i++) { 25 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 26 | 27 | #pragma vector aligned 28 | #pragma simd 29 | for (int j = tile; j < to; j++) { 30 | float dy = p.y[j] - p.y[i]; 31 | float dz = p.z[j] - p.z[i]; 32 | float dx = p.x[j] - p.x[i]; 33 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 34 | float invDist = 1.0f / sqrtf(distSqr); 35 | float invDist3 = invDist * invDist * invDist; 36 | 37 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 38 | } 39 | 40 | p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz; 41 | } 42 | } 43 | } 44 | 45 | 
int main(const int argc, const char** argv) { 46 | 47 | int nBodies = 30000; 48 | if (argc > 1) nBodies = atoi(argv[1]); 49 | 50 | int tileSize = 24400; 51 | if (tileSize > nBodies) tileSize = nBodies; 52 | 53 | const float dt = 0.01f; // time step 54 | const int nIters = 10; // simulation iterations 55 | 56 | if ( tileSize % (CACHELINE/sizeof(float)) ) { 57 | printf("ERROR: blockSize not multiple of %d vector elements\n", CACHELINE/(int)sizeof(float)); 58 | exit(1); 59 | } 60 | 61 | int bytes = 6*nBodies*sizeof(float); 62 | float *buf = (float*)_mm_malloc(bytes, CACHELINE); 63 | BodySystem p; 64 | p.x = buf+0*nBodies; p.y = buf+1*nBodies; p.z = buf+2*nBodies; 65 | p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies; 66 | 67 | randomizeBodies(buf, 6*nBodies); // Init pos / vel data 68 | 69 | double totalTime = 0.0; 70 | 71 | for (int iter = 1; iter <= nIters; iter++) { 72 | StartTimer(); 73 | 74 | bodyForce(p, dt, nBodies, tileSize); // compute interbody forces 75 | 76 | for (int i = 0 ; i < nBodies; i++) { // integrate position 77 | p.x[i] += p.vx[i]*dt; 78 | p.y[i] += p.vy[i]*dt; 79 | p.z[i] += p.vz[i]*dt; 80 | } 81 | 82 | const double tElapsed = GetTimer() / 1000.0; 83 | if (iter > 1) { // First iter is warm up 84 | totalTime += tElapsed; 85 | } 86 | #ifndef SHMOO 87 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 88 | #endif 89 | } 90 | double avgTime = totalTime / (double)(nIters-1); 91 | 92 | #ifdef SHMOO 93 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 94 | #else 95 | printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", 96 | nIters, rate); 97 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 98 | #endif 99 | _mm_free(buf); 100 | } 101 | -------------------------------------------------------------------------------- /mini-nbody/mic/nbody-block.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | #define CACHELINE 64 // size of cache line [bytes] 7 | #define SOFTENING 1e-9f 8 | 9 | typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem; 10 | 11 | void randomizeBodies(float *data, int n) { 12 | for (int i = 0; i < n; i++) { 13 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 14 | } 15 | } 16 | 17 | 18 | void bodyForce(BodySystem p, float dt, int n, int tileSize) { 19 | for (int tile = 0; tile < n; tile += tileSize) { 20 | int to = tile + tileSize; 21 | if (to > n) to = n; 22 | 23 | #pragma omp parallel for schedule(dynamic) 24 | for (int i = 0; i < n; i++) { 25 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 26 | 27 | for (int j = tile; j < to; j++) { 28 | float dy = p.y[j] - p.y[i]; 29 | float dz = p.z[j] - p.z[i]; 30 | float dx = p.x[j] - p.x[i]; 31 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 32 | float invDist = 1.0f / sqrtf(distSqr); 33 | float invDist3 = invDist * invDist * invDist; 34 | 35 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 36 | } 37 | 38 | p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz; 39 | } 40 | } 41 | } 42 | 43 | int main(const int argc, const char** argv) { 44 | 45 | int nBodies = 30000; 46 | if (argc > 1) nBodies = atoi(argv[1]); 47 | 48 | int tileSize = 24400; 49 | if (tileSize > nBodies) tileSize = nBodies; 50 | 51 | const float dt = 0.01f; // time step 52 | const int nIters = 10; // simulation iterations 53 | 54 | int bytes = 6*nBodies*sizeof(float); 55 | float *buf = (float*)malloc(bytes); 56 | BodySystem p; 57 | p.x = buf+0*nBodies; p.y = buf+1*nBodies; p.z = buf+2*nBodies; 58 | p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies; 59 | 60 | randomizeBodies(buf, 6*nBodies); // Init pos / vel data 61 | 62 | double totalTime = 0.0; 63 | 64 | for (int iter = 1; iter <= nIters; iter++) { 65 | StartTimer(); 66 | 67 | 
bodyForce(p, dt, nBodies, tileSize); // compute interbody forces 68 | 69 | for (int i = 0 ; i < nBodies; i++) { // integrate position 70 | p.x[i] += p.vx[i]*dt; 71 | p.y[i] += p.vy[i]*dt; 72 | p.z[i] += p.vz[i]*dt; 73 | } 74 | 75 | const double tElapsed = GetTimer() / 1000.0; 76 | if (iter > 1) { // First iter is warm up 77 | totalTime += tElapsed; 78 | } 79 | #ifndef SHMOO 80 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 81 | #endif 82 | } 83 | double avgTime = totalTime / (double)(nIters-1); 84 | 85 | #ifdef SHMOO 86 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 87 | #else 88 | //printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", 89 | //nIters, rate); (disabled: 'rate' is never declared, which broke non-SHMOO builds) 90 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 91 | #endif 92 | free(buf); 93 | } 94 | -------------------------------------------------------------------------------- /mini-nbody/mic/nbody-soa.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | 7 | #define SOFTENING 1e-9f 8 | 9 | typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem; 10 | 11 | void randomizeBodies(float *data, int n) { 12 | for (int i = 0; i < n; i++) { 13 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 14 | } 15 | } 16 | 17 | 18 | void bodyForce(BodySystem p, float dt, int n) { 19 | #pragma omp parallel for schedule(dynamic) 20 | for (int i = 0; i < n; i++) { 21 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 22 | 23 | for (int j = 0; j < n; j++) { 24 | float dy = p.y[j] - p.y[i]; 25 | float dz = p.z[j] - p.z[i]; 26 | float dx = p.x[j] - p.x[i]; 27 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 28 | float invDist = 1.0f / sqrtf(distSqr); 29 | float invDist3 = invDist * invDist * invDist; 30 | 31 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 32 | } 33 | 34 | p.vx[i] += 
dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz; 35 | } 36 | } 37 | 38 | int main(const int argc, const char** argv) { 39 | 40 | int nBodies = 30000; 41 | if (argc > 1) nBodies = atoi(argv[1]); 42 | 43 | const float dt = 0.01f; // time step 44 | const int nIters = 10; // simulation iterations 45 | 46 | int bytes = 6*nBodies*sizeof(float); 47 | float *buf = (float*)malloc(bytes); 48 | BodySystem p; 49 | p.x = buf+0*nBodies; p.y = buf+1*nBodies; p.z = buf+2*nBodies; 50 | p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies; 51 | 52 | randomizeBodies(buf, 6*nBodies); // Init pos / vel data 53 | 54 | double totalTime = 0.0; 55 | 56 | for (int iter = 1; iter <= nIters; iter++) { 57 | StartTimer(); 58 | 59 | bodyForce(p, dt, nBodies); // compute interbody forces 60 | 61 | for (int i = 0 ; i < nBodies; i++) { // integrate position 62 | p.x[i] += p.vx[i]*dt; 63 | p.y[i] += p.vy[i]*dt; 64 | p.z[i] += p.vz[i]*dt; 65 | } 66 | 67 | const double tElapsed = GetTimer() / 1000.0; 68 | if (iter > 1) { // First iter is warm up 69 | totalTime += tElapsed; 70 | } 71 | #ifndef SHMOO 72 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 73 | #endif 74 | } 75 | double avgTime = totalTime / (double)(nIters-1); 76 | 77 | #ifdef SHMOO 78 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 79 | #else 80 | //printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", 81 | //nIters, rate); (disabled: 'rate' is never declared, which broke non-SHMOO builds) 82 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 83 | #endif 84 | free(buf); 85 | } 86 | -------------------------------------------------------------------------------- /mini-nbody/mic/shmoo-mic-nbody-align.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-align.c 2 | EXE=nbody-align-mic 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if [ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 
-openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /mini-nbody/mic/shmoo-mic-nbody-block.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-block.c 2 | EXE=nbody-block-mic 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if [ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /mini-nbody/mic/shmoo-mic-nbody-ftz.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-soa.c 2 | EXE=nbody-ftz-mic 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if [ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /mini-nbody/mic/shmoo-mic-nbody-orig.sh: -------------------------------------------------------------------------------- 1 | SRC=../nbody-orig.c 2 | EXE=nbody-orig-mic 3 | 
MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if [ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 -openmp -mmic -DSHMOO -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /mini-nbody/mic/shmoo-mic-nbody-soa.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-soa.c 2 | EXE=nbody-soa-mic 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if [ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 -openmp -mmic -DSHMOO -I../ -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /mini-nbody/nbody.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | #define SOFTENING 1e-9f 7 | 8 | typedef struct { float x, y, z, vx, vy, vz; } Body; 9 | 10 | void randomizeBodies(float *data, int n) { 11 | for (int i = 0; i < n; i++) { 12 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 13 | } 14 | } 15 | 16 | void bodyForce(Body *p, float dt, int n) { 17 | #pragma omp parallel for schedule(dynamic) 18 | for (int i = 0; i < n; i++) { 19 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 20 | 21 | for (int j = 0; j < n; j++) { 22 | float dx = p[j].x - p[i].x; 23 | float dy = p[j].y - p[i].y; 24 | float dz = p[j].z - p[i].z; 25 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 26 | float invDist = 
1.0f / sqrtf(distSqr); 27 | float invDist3 = invDist * invDist * invDist; 28 | 29 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 30 | } 31 | 32 | p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz; 33 | } 34 | } 35 | 36 | int main(const int argc, const char** argv) { 37 | 38 | int nBodies = 30000; 39 | if (argc > 1) nBodies = atoi(argv[1]); 40 | 41 | const float dt = 0.01f; // time step 42 | const int nIters = 10; // simulation iterations 43 | 44 | int bytes = nBodies*sizeof(Body); 45 | float *buf = (float*)malloc(bytes); 46 | Body *p = (Body*)buf; 47 | 48 | randomizeBodies(buf, 6*nBodies); // Init pos / vel data 49 | 50 | double totalTime = 0.0; 51 | 52 | for (int iter = 1; iter <= nIters; iter++) { 53 | StartTimer(); 54 | 55 | bodyForce(p, dt, nBodies); // compute interbody forces 56 | 57 | for (int i = 0 ; i < nBodies; i++) { // integrate position 58 | p[i].x += p[i].vx*dt; 59 | p[i].y += p[i].vy*dt; 60 | p[i].z += p[i].vz*dt; 61 | } 62 | 63 | const double tElapsed = GetTimer() / 1000.0; 64 | if (iter > 1) { // First iter is warm up 65 | totalTime += tElapsed; 66 | } 67 | #ifndef SHMOO 68 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 69 | #endif 70 | } 71 | double avgTime = totalTime / (double)(nIters-1); 72 | 73 | #ifdef SHMOO 74 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 75 | #else 76 | //printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", 77 | //nIters, rate); (disabled: 'rate' is never declared, which broke non-SHMOO builds) 78 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 79 | #endif 80 | free(buf); 81 | } 82 | -------------------------------------------------------------------------------- /mini-nbody/shmoo-cpu-nbody.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody.c 2 | EXE=nbody 3 | gcc -std=c99 -O3 -fopenmp -DSHMOO -o $EXE $SRC -lm 4 | 5 | echo $EXE 6 | 7 | K=1024 8 | for i in {1..10} 9 | do 10 | ./$EXE $K 11 | 
K=$(($K*2)) 12 | done 13 | 14 | -------------------------------------------------------------------------------- /mini-nbody/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMER_H 2 | #define TIMER_H 3 | 4 | #include 5 | 6 | #ifdef WIN32 7 | #define WIN32_LEAN_AND_MEAN 8 | #include 9 | #else 10 | #ifndef __USE_BSD 11 | #define __USE_BSD 12 | #endif 13 | #include 14 | #endif 15 | 16 | #ifdef WIN32 17 | double PCFreq = 0.0; 18 | __int64 timerStart = 0; 19 | #else 20 | struct timeval timerStart; 21 | #endif 22 | 23 | void StartTimer() 24 | { 25 | #ifdef WIN32 26 | LARGE_INTEGER li; 27 | if(!QueryPerformanceFrequency(&li)) 28 | printf("QueryPerformanceFrequency failed!\n"); 29 | 30 | PCFreq = (double)li.QuadPart/1000.0; 31 | 32 | QueryPerformanceCounter(&li); 33 | timerStart = li.QuadPart; 34 | #else 35 | gettimeofday(&timerStart, NULL); 36 | #endif 37 | } 38 | 39 | // time elapsed in ms 40 | double GetTimer() 41 | { 42 | #ifdef WIN32 43 | LARGE_INTEGER li; 44 | QueryPerformanceCounter(&li); 45 | return (double)(li.QuadPart-timerStart)/PCFreq; 46 | #else 47 | struct timeval timerStop, timerElapsed; 48 | gettimeofday(&timerStop, NULL); 49 | timersub(&timerStop, &timerStart, &timerElapsed); 50 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 51 | #endif 52 | } 53 | 54 | #endif // TIMER_H 55 | -------------------------------------------------------------------------------- /openmp-helloworld/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(openmp_helloworld) 2 | 3 | cmake_minimum_required(VERSION 3.16) 4 | 5 | # Search for rocm in common locations 6 | if(WIN32) 7 | list(APPEND CMAKE_PREFIX_PATH "C:/hip") 8 | list(APPEND CMAKE_PREFIX_PATH "C:/Program Files/AMD HIP SDK/hip") 9 | else() 10 | list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) 11 | endif() 12 | 13 | # Find HIP. 
14 | # The user may override AMDGPU_TARGETS defined in the HIP config file 15 | # to select the AMDGPU archs to compile for. 16 | # ex. set(AMDGPU_TARGETS "gfx803;gfx900;gfx906") 17 | find_package(hip REQUIRED) 18 | 19 | # Find OpenMP. 20 | find_package(OpenMP REQUIRED) 21 | 22 | # Set compiler and linker. 23 | if(NOT WIN32) 24 | set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) 25 | set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) 26 | endif() 27 | 28 | set(CMAKE_BUILD_TYPE Release) 29 | 30 | if(WIN32) 31 | # Compile for OpenMP code (Windows requires this). 32 | set(OpenMP_CXX_FLAGS "-Xclang -fopenmp") 33 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 34 | # Tell CMake where to find the OpenMP libraries (libomp.lib). 35 | link_directories("C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\VC\\Tools\\Llvm\\x64\\lib") 36 | endif() 37 | 38 | # Source files. 39 | set(CPP_SOURCES ${CMAKE_SOURCE_DIR}/openmp_helloworld.cpp) 40 | 41 | # Preparing the executable. 42 | add_executable(test_openmp_helloworld ${CPP_SOURCES}) 43 | 44 | # Link Libraries - HIP Device and OpenMP. 
45 | target_compile_options(test_openmp_helloworld PRIVATE ${OpenMP_CXX_FLAGS}) 46 | target_link_libraries(test_openmp_helloworld PRIVATE hip::device ${OpenMP_CXX_FLAGS}) 47 | 48 | if(WIN32) 49 | target_link_libraries(test_openmp_helloworld PRIVATE OpenMP::OpenMP_CXX) 50 | endif() 51 | -------------------------------------------------------------------------------- /openmp-helloworld/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | CXX=$(HIPCC) 5 | CXXFLAGS =-fopenmp 6 | 7 | SOURCES = openmp_helloworld.cpp 8 | 9 | EXECUTABLE=./openmp_helloworld.exe 10 | 11 | .PHONY: test 12 | 13 | 14 | all: $(EXECUTABLE) test 15 | 16 | 17 | $(EXECUTABLE): 18 | $(CXX) $(CXXFLAGS) $(SOURCES) -o $@ 19 | 20 | 21 | test: $(EXECUTABLE) 22 | $(EXECUTABLE) 23 | 24 | 25 | clean: 26 | rm -f $(EXECUTABLE) *.o 27 | 28 | -------------------------------------------------------------------------------- /openmp-helloworld/README.md: -------------------------------------------------------------------------------- 1 | # Simple OpenMP hello world example written directly to the HIP interface. 2 | 3 | ## Requirements 4 | * Installed ROCm 3.9 or newer. See [ROCm Installation Guide](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html). 5 | 6 | ## Windows Requirements 7 | * Set HIP_DIR to the HIP installation location. 8 | * libamdhip64.dll and amd_comgr.dll must be in PATH or in System32. 9 | * Install MS Visual Studio 2019 for C++ development with Optional C++ Clang tools for Windows. 10 | * Ensure libomp.dll from MSVC C++ Clang tools is in PATH (by default, location is C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\Llvm\x64\bin). 11 | * Modify the CMakeLists.txt of this project to the corresponding libomp.lib location. 12 | 13 | ## How to run this code: 14 | 15 | ### Using Make on Linux: 16 | * To build and run: `make`. 
17 | * To clean the environment: `make clean`. 18 | 19 | 20 | ### Using CMake on Linux: 21 | * To build: `mkdir -p build; cd build; cmake ..; make` 22 | * To run the test: `./test_openmp_helloworld` 23 | * To clean the build environment: `make clean` 24 | 25 | ### Using CMake on Windows: 26 | * CMake Command: `cmake -G Ninja -DCMAKE_C_COMPILER=/bin/clang.exe -DCMAKE_CXX_COMPILER=/bin/clang++.exe` 27 | * To build: `ninja` 28 | * To run the test: `./test_openmp_helloworld` 29 | 30 | **Note:** You may override `AMDGPU_TARGETS` in the HIP config file by modifying the CMakeLists.txt. 31 | 32 | ## Expected Results: 33 | ``` 34 | info: running on device Device 66a3 35 | Hello World... from OMP thread = 0 36 | Hello World... from OMP thread = 15 37 | Hello World... from OMP thread = 3 38 | Hello World... from OMP thread = 13 39 | Hello World... from OMP thread = 11 40 | Hello World... from OMP thread = 8 41 | Hello World... from OMP thread = 4 42 | Hello World... from OMP thread = 1 43 | Hello World... from OMP thread = 10 44 | Hello World... from OMP thread = 9 45 | Hello World... from OMP thread = 7 46 | Hello World... from OMP thread = 12 47 | Hello World... from OMP thread = 6 48 | Hello World... from OMP thread = 14 49 | Hello World... from OMP thread = 5 50 | Hello World... from OMP thread = 2 51 | Hello World... from HIP thread = 0 52 | Hello World... from HIP thread = 2 53 | Hello World... from HIP thread = 5 54 | Hello World... from HIP thread = 14 55 | Hello World... from HIP thread = 6 56 | Hello World... from HIP thread = 12 57 | Hello World... from HIP thread = 7 58 | Hello World... from HIP thread = 9 59 | Hello World... from HIP thread = 1 60 | Hello World... from HIP thread = 11 61 | Hello World... from HIP thread = 10 62 | Hello World... from HIP thread = 4 63 | Hello World... from HIP thread = 8 64 | Hello World... from HIP thread = 13 65 | Hello World... from HIP thread = 15 66 | Hello World... 
from HIP thread = 3 67 | Device Results: 68 | A_d[0] = 0 69 | A_d[1] = 1 70 | A_d[2] = 2 71 | A_d[3] = 3 72 | A_d[4] = 4 73 | A_d[5] = 5 74 | A_d[6] = 6 75 | A_d[7] = 7 76 | A_d[8] = 8 77 | A_d[9] = 9 78 | A_d[10] = 10 79 | A_d[11] = 11 80 | A_d[12] = 12 81 | A_d[13] = 13 82 | A_d[14] = 14 83 | A_d[15] = 15 84 | PASSED! 85 | ``` 86 | 87 | **Note:** HIP thread's printf may not display on builds with printf support disabled. 88 | -------------------------------------------------------------------------------- /openmp-helloworld/openmp_helloworld.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | // OpenMP program to print Hello World 24 | // using C language is supported by HIP 25 | 26 | // HIP header 27 | #include 28 | 29 | #include 30 | #include 31 | 32 | //OpenMP header 33 | #include 34 | 35 | #define NUM_THREADS 16 36 | #define CHECK(cmd) \ 37 | {\ 38 | hipError_t error = cmd;\ 39 | if (error != hipSuccess) { \ 40 | fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ 41 | exit(EXIT_FAILURE);\ 42 | }\ 43 | } 44 | 45 | __global__ void 46 | hip_helloworld(unsigned omp_id, int* A_d) 47 | { 48 | // Note: the printf command will only work if printf is enabled in your build. 49 | printf("Hello World... from HIP thread = %u\n", omp_id); 50 | 51 | A_d[omp_id] = omp_id; 52 | } 53 | 54 | int main(int argc, char* argv[]) 55 | { 56 | int* A_h, * A_d; 57 | size_t Nbytes = NUM_THREADS * sizeof(int); 58 | 59 | hipDeviceProp_t props; 60 | CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); 61 | printf("info: running on device %s\n", props.name); 62 | 63 | A_h = (int*)malloc(Nbytes); 64 | CHECK(hipMalloc(&A_d, Nbytes)); 65 | for (int i = 0; i < NUM_THREADS; i++) { 66 | A_h[i] = 0; 67 | } 68 | CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); 69 | 70 | // Beginning of parallel region 71 | #pragma omp parallel num_threads(NUM_THREADS) 72 | { 73 | fprintf(stderr, "Hello World... 
from OMP thread = %d\n", 74 | omp_get_thread_num()); 75 | 76 | hipLaunchKernelGGL(hip_helloworld, dim3(1), dim3(1), 0, 0, omp_get_thread_num(), A_d); 77 | } 78 | // Ending of parallel region 79 | 80 | hipStreamSynchronize(0); 81 | CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); 82 | printf("Device Results:\n"); 83 | for (int i = 0; i < NUM_THREADS; i++) { 84 | printf(" A_d[%d] = %d\n", i, A_h[i]); 85 | } 86 | 87 | printf ("PASSED!\n"); 88 | 89 | free(A_h); 90 | CHECK(hipFree(A_d)); 91 | return 0; 92 | } 93 | -------------------------------------------------------------------------------- /reduction/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | CXXFLAGS += -std=c++11 -O3 5 | 6 | reduction: reduction.cpp 7 | ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0) 8 | ${HIPCC} ${CXXFLAGS} -o $@ $^ 9 | else 10 | $(error "Cannot find $(HIPCC), please install HIP toolkit") 11 | endif 12 | 13 | .PHONY: clean 14 | 15 | clean: 16 | rm -f reduction *.o 17 | -------------------------------------------------------------------------------- /reduction/README.md: -------------------------------------------------------------------------------- 1 | # reduction 2 | reduction example with atomic_add usig HIP. 3 | To build: 4 | make 5 | To execute: 6 | ./run.sh 7 | -------------------------------------------------------------------------------- /reduction/reduction.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include "hip/hip_runtime.h" 32 | 33 | 34 | void check_hip_error(void) 35 | { 36 | hipError_t err = hipGetLastError(); 37 | if (err != hipSuccess) 38 | { 39 | std::cerr 40 | << "Error: " 41 | << hipGetErrorString(err) 42 | << std::endl; 43 | exit(err); 44 | } 45 | } 46 | 47 | __global__ void atomic_reduction_kernel(int *in, int* out, int ARRAYSIZE) { 48 | int sum=int(0); 49 | int idx = hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x; 50 | for(int i= idx;i >(t2 - t1).count(); 129 | float GB=(float)ARRAYSIZE*sizeof(int)*N; 130 | std::cout 131 | << "The average performance of reduction is "<< 1.0E-09 * GB/times<<" GBytes/sec"< /dev/null; echo $$?), 0) 8 | ${HIPCC} ${CXXFLAGS} -o $@ $^ 9 | else 10 | $(error "Cannot find $(HIPCC), please install HIP toolkit") 11 | endif 12 | 13 | .PHONY: clean 14 | 15 | clean: 16 | rm -f rtm8 *.o 17 | -------------------------------------------------------------------------------- /rtm8/README.md: -------------------------------------------------------------------------------- 1 | rtm8 is an example ported from a Fortran algorithm contributed by Morton, Scott from HESS company.
2 | The original Fortran version: 3 | ./build_fortran.sh 4 | ./rtm8_fortran 5 | The HIP version: 6 | ./build_hip.sh 7 | ./rtm8_hip 8 | The CUDA version: 9 | ./build_cuda.sh 10 | ./rtm8_cuda 11 | -------------------------------------------------------------------------------- /rtm8/build_cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -f "rtm8_cuda" ] 3 | then 4 | rm rtm8_cuda 5 | fi 6 | echo "nvcc -O3 rtm8.cpp -o rtm8_cuda" 7 | nvcc -O3 rtm8.cu -o rtm8_cuda 8 | -------------------------------------------------------------------------------- /rtm8/build_fortran.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -f "rtm8_fortran" ] 3 | then 4 | rm rtm8_fortran 5 | fi 6 | gfortran -c rtm8.f 7 | gcc -c -DUNDERSCORE mysecond.c 8 | gfortran -o rtm8_fortran rtm8.o mysecond.o 9 | -------------------------------------------------------------------------------- /rtm8/build_hip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$HIP_PATH" ] 4 | then 5 | 6 | if [ -d /opt/rocm/hip ] 7 | then 8 | HIP_PATH=/opt/rocm/hip 9 | else 10 | HIP_PATH=/opt/rocm 11 | fi 12 | 13 | fi 14 | 15 | if [ -f "rtm8_hip" ] 16 | then 17 | rm rtm8_hip 18 | fi 19 | 20 | echo "hipcc -std=c++11 -O3 -o rtm8_hip rtm8.cpp" 21 | $HIP_PATH/bin/hipcc -std=c++11 -O3 -o rtm8_hip rtm8.cpp 22 | 23 | -------------------------------------------------------------------------------- /rtm8/mysecond.c: -------------------------------------------------------------------------------- 1 | /* A gettimeofday routine to give access to the wall 2 | clock timer on most UNIX-like systems. 3 | 4 | You will need to compile with "-DUNDERSCORE" 5 | to get this to link with FORTRAN on many systems. 
/* Wall-clock timer built on gettimeofday().

   Compile with "-DUNDERSCORE" to export the routine as mysecond_ so that it
   links with FORTRAN on many systems.
*/

#include <stddef.h>
#include <sys/time.h>
/* int gettimeofday(struct timeval *tp, struct timezone *tzp); */

/* Returns the current wall-clock time in seconds (whole seconds plus the
   microsecond part scaled by 1e-6) as a double. If gettimeofday() fails the
   zero-initialised timeval is used, so the result is 0.0 rather than garbage. */
#ifdef UNDERSCORE
double mysecond_(void)
#else
double mysecond(void)
#endif
{
    struct timeval tp = {0, 0};

    /* The timezone argument is obsolete; POSIX leaves the behaviour
       unspecified when it is non-NULL, so always pass NULL. */
    (void) gettimeofday(&tp, NULL);
    return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}
21 | */ 22 | 23 | #include "hip/hip_runtime.h" 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include "mysecond.c" 30 | 31 | #define nt 30 32 | #define nx 680 33 | #define ny 134 34 | #define nz 450 35 | 36 | inline __host__ __device__ int indexTo1D(int x, int y, int z){ 37 | return x + y*ny + z*ny*nz; 38 | } 39 | 40 | __global__ void 41 | rtm8(float* vsq, float* current_s, float* current_r, float* next_s, float* next_r, float* image, float* a, size_t N) 42 | { 43 | unsigned x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; 44 | unsigned y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; 45 | unsigned z = hipBlockIdx_z*hipBlockDim_z + hipThreadIdx_z; 46 | float div; 47 | if ((4 <= x && x < (nx - 4) ) && (4 <= y && y < (ny - 4)) && (4 <= z && z < (nz - 4))){ 48 | div = 49 | a[0] * current_s[indexTo1D(x,y,z)] + 50 | a[1] * (current_s[indexTo1D(x+1,y,z)] + current_s[indexTo1D(x-1,y,z)] + 51 | current_s[indexTo1D(x,y+1,z)] + current_s[indexTo1D(x,y-1,z)] + 52 | current_s[indexTo1D(x,y,z+1)] + current_s[indexTo1D(x,y,z-1)]) + 53 | a[2] * (current_s[indexTo1D(x+2,y,z)] + current_s[indexTo1D(x-2,y,z)] + 54 | current_s[indexTo1D(x,y+2,z)] + current_s[indexTo1D(x,y-2,z)] + 55 | current_s[indexTo1D(x,y,z+2)] + current_s[indexTo1D(x,y,z-2)]) + 56 | a[3] * (current_s[indexTo1D(x+3,y,z)] + current_s[indexTo1D(x-3,y,z)] + 57 | current_s[indexTo1D(x,y+3,z)] + current_s[indexTo1D(x,y-3,z)] + 58 | current_s[indexTo1D(x,y,z+3)] + current_s[indexTo1D(x,y,z-3)]) + 59 | a[4] * (current_s[indexTo1D(x+4,y,z)] + current_s[indexTo1D(x-4,y,z)] + 60 | current_s[indexTo1D(x,y+4,z)] + current_s[indexTo1D(x,y-4,z)] + 61 | current_s[indexTo1D(x,y,z+4)] + current_s[indexTo1D(x,y,z-4)]); 62 | 63 | next_s[indexTo1D(x,y,z)] = 2*current_s[indexTo1D(x,y,z)] - next_s[indexTo1D(x,y,z)] 64 | + vsq[indexTo1D(x,y,z)]*div; 65 | div = 66 | a[0] * current_r[indexTo1D(x,y,z)] + 67 | a[1] * (current_r[indexTo1D(x+1,y,z)] + current_r[indexTo1D(x-1,y,z)] + 68 | 
current_r[indexTo1D(x,y+1,z)] + current_r[indexTo1D(x,y-1,z)] + 69 | current_r[indexTo1D(x,y,z+1)] + current_r[indexTo1D(x,y,z-1)]) + 70 | a[2] * (current_r[indexTo1D(x+2,y,z)] + current_r[indexTo1D(x-2,y,z)] + 71 | current_r[indexTo1D(x,y+2,z)] + current_r[indexTo1D(x,y-2,z)] + 72 | current_r[indexTo1D(x,y,z+2)] + current_r[indexTo1D(x,y,z-2)]) + 73 | a[3] * (current_r[indexTo1D(x+3,y,z)] + current_r[indexTo1D(x-3,y,z)] + 74 | current_r[indexTo1D(x,y+3,z)] + current_r[indexTo1D(x,y-3,z)] + 75 | current_r[indexTo1D(x,y,z+3)] + current_r[indexTo1D(x,y,z-3)]) + 76 | a[4] * (current_r[indexTo1D(x+4,y,z)] + current_r[indexTo1D(x-4,y,z)] + 77 | current_r[indexTo1D(x,y+4,z)] + current_r[indexTo1D(x,y-4,z)] + 78 | current_r[indexTo1D(x,y,z+4)] + current_r[indexTo1D(x,y,z-4)]); 79 | 80 | next_r[indexTo1D(x,y,z)] = 2 * current_r[indexTo1D(x,y,z)] 81 | - next_r[indexTo1D(x,y,z)] + vsq[indexTo1D(x,y,z)] * div; 82 | 83 | image[indexTo1D(x,y,z)] = next_s[indexTo1D(x,y,z)] * next_r[indexTo1D(x,y,z)]; 84 | } 85 | } 86 | 87 | // Code to check HIP errors 88 | void check_hip_error(void) 89 | { 90 | hipError_t err = hipGetLastError(); 91 | if (err != hipSuccess) 92 | { 93 | std::cerr 94 | << "Error: " 95 | << hipGetErrorString(err) 96 | << std::endl; 97 | exit(err); 98 | } 99 | } 100 | 101 | 102 | int main(){ 103 | const int ArraySize = nx + nx*ny + nx*ny*nz; 104 | 105 | float* next_s = (float*)malloc(ArraySize * sizeof(float)); 106 | float* current_s = (float*)malloc(ArraySize * sizeof(float)); 107 | float* next_r = (float*)malloc(ArraySize * sizeof(float)); 108 | float* current_r = (float*)malloc(ArraySize * sizeof(float)); 109 | float* vsq = (float*)malloc(ArraySize * sizeof(float)); 110 | float* image = (float*)malloc(ArraySize * sizeof(float)); 111 | 112 | float a[5]; 113 | 114 | double pts, t0, t1, dt, flops, pt_rate, flop_rate, speedup, memory; 115 | 116 | memory = nx*ny*nz*4*6; 117 | pts = nt; 118 | pts = pts*(nx-8)*(ny-8)*(nz-8); 119 | flops = 67*pts; 120 | printf("memory 
(MB) = %f\n", memory/1e6); 121 | printf("pts (billions) = %f\n", pts/1e9); 122 | printf("Tflops = %f\n", flops/1e12); 123 | 124 | // Initialization of matrix 125 | a[0] = -1./560.; 126 | a[1] = 8./315; 127 | a[2] = -0.2; 128 | a[3] = 1.6; 129 | a[4] = -1435./504.; 130 | 131 | for (int z = 0; z < nz; z++) { 132 | for (int y = 0; y < ny; y++) { 133 | for (int x = 0; x < nx; x++) { 134 | vsq[indexTo1D(x,y,z)] = 1.0; 135 | next_s[indexTo1D(x,y,z)] = 0; 136 | current_s[indexTo1D(x,y,z)] = 0; 137 | next_r[indexTo1D(x,y,z)] = 0; 138 | current_r[indexTo1D(x,y,z)] = 0; 139 | image[indexTo1D(x,y,z)] = 0; 140 | } 141 | } 142 | } 143 | 144 | t0 = mysecond(); 145 | //allocate and copy matrix to device 146 | float* vsq_d; 147 | float* next_s_d; 148 | float* current_s_d; 149 | float* next_r_d; 150 | float* current_r_d; 151 | float* image_d; 152 | float* a_d; 153 | 154 | hipMalloc(&vsq_d, ArraySize * sizeof(float)); 155 | hipMalloc(&next_s_d, ArraySize * sizeof(float)); 156 | hipMalloc(¤t_s_d, ArraySize * sizeof(float)); 157 | hipMalloc(&next_r_d, ArraySize * sizeof(float)); 158 | hipMalloc(¤t_r_d, ArraySize * sizeof(float)); 159 | hipMalloc(&image_d, ArraySize * sizeof(float)); 160 | hipMalloc(&a_d, 5 * sizeof(float)); 161 | check_hip_error(); 162 | hipMemcpy(vsq_d, vsq, ArraySize * sizeof(float), hipMemcpyHostToDevice); 163 | hipMemcpy(next_s_d, next_s, ArraySize * sizeof(float), hipMemcpyHostToDevice); 164 | hipMemcpy(current_s_d, current_s, ArraySize * sizeof(float), hipMemcpyHostToDevice); 165 | hipMemcpy(next_r_d, next_r, ArraySize * sizeof(float), hipMemcpyHostToDevice); 166 | hipMemcpy(current_r_d, current_r, ArraySize * sizeof(float), hipMemcpyHostToDevice); 167 | hipMemcpy(image_d, image, ArraySize * sizeof(float), hipMemcpyHostToDevice); 168 | hipMemcpy(a_d, a, 5 * sizeof(float), hipMemcpyHostToDevice); 169 | check_hip_error(); 170 | // Make sure the copies are finished 171 | hipDeviceSynchronize(); 172 | check_hip_error(); 173 | 174 | int gridSize = 256*256; 175 | int 
groupSize = 256; 176 | 177 | 178 | for (int t = 0; t < nt; t++) { 179 | //Launch the HIP kernel 180 | hipLaunchKernelGGL(rtm8, dim3(gridSize), dim3(groupSize), 0, 0, (float*)vsq_d, (float*)current_s_d, ( 181 | float*)next_s_d, (float*)current_r_d,(float*)next_r_d, (float*)image_d, (float*)a_d, ArraySize); 182 | } 183 | //copy back image value 184 | hipMemcpy(image, image_d,ArraySize * sizeof(float), hipMemcpyDeviceToHost); 185 | hipDeviceSynchronize(); 186 | t1 = mysecond(); 187 | 188 | dt = t1 - t0; 189 | pt_rate = pts/dt; 190 | flop_rate = flops/dt; 191 | speedup = 2*pow(10, 9)/3/pt_rate; 192 | printf("dt = %f\n", dt); 193 | printf("pt_rate (millions/sec) = %f\n", pt_rate/1e6); 194 | printf("flop_rate (Gflops) = %f\n", flop_rate/1e9); 195 | printf("speedup = %f\n", speedup); 196 | 197 | //release arrays 198 | free(vsq); 199 | free(next_s); 200 | free(current_s); 201 | free(next_r); 202 | free(current_r); 203 | free(image); 204 | return 0; 205 | 206 | } 207 | 208 | -------------------------------------------------------------------------------- /rtm8/rtm8.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include "mysecond.c" 31 | 32 | #define nt 30 33 | #define nx 680 34 | #define ny 134 35 | #define nz 450 36 | 37 | inline __host__ __device__ int indexTo1D(int x, int y, int z){ 38 | return x + y*ny + z*ny*nz; 39 | } 40 | 41 | __global__ void 42 | rtm8(float* vsq, float* current_s, float* current_r, float* next_s, float* next_r, float* image, float* a, size_t N) 43 | { 44 | unsigned x = blockIdx.x * blockDim.x + threadIdx.x; 45 | unsigned y = blockIdx.y * blockDim.y + threadIdx.y; 46 | unsigned z = blockIdx.z * blockDim.z + threadIdx.z; 47 | float div; 48 | if ((4 <= x && x < (nx - 4) ) && (4 <= y && y < (ny - 4)) && (4 <= z && z < (nz - 4))){ 49 | div = 50 | a[0] * current_s[indexTo1D(x,y,z)] + 51 | a[1] * (current_s[indexTo1D(x+1,y,z)] + current_s[indexTo1D(x-1,y,z)] + 52 | current_s[indexTo1D(x,y+1,z)] + current_s[indexTo1D(x,y-1,z)] + 53 | current_s[indexTo1D(x,y,z+1)] + current_s[indexTo1D(x,y,z-1)]) + 54 | a[2] * (current_s[indexTo1D(x+2,y,z)] + current_s[indexTo1D(x-2,y,z)] + 55 | current_s[indexTo1D(x,y+2,z)] + current_s[indexTo1D(x,y-2,z)] + 56 | current_s[indexTo1D(x,y,z+2)] + current_s[indexTo1D(x,y,z-2)]) + 57 | a[3] * (current_s[indexTo1D(x+3,y,z)] + current_s[indexTo1D(x-3,y,z)] + 58 | current_s[indexTo1D(x,y+3,z)] + current_s[indexTo1D(x,y-3,z)] + 59 | current_s[indexTo1D(x,y,z+3)] + current_s[indexTo1D(x,y,z-3)]) + 60 | a[4] * 
(current_s[indexTo1D(x+4,y,z)] + current_s[indexTo1D(x-4,y,z)] + 61 | current_s[indexTo1D(x,y+4,z)] + current_s[indexTo1D(x,y-4,z)] + 62 | current_s[indexTo1D(x,y,z+4)] + current_s[indexTo1D(x,y,z-4)]); 63 | 64 | next_s[indexTo1D(x,y,z)] = 2*current_s[indexTo1D(x,y,z)] - next_s[indexTo1D(x,y,z)] 65 | + vsq[indexTo1D(x,y,z)]*div; 66 | div = 67 | a[0] * current_r[indexTo1D(x,y,z)] + 68 | a[1] * (current_r[indexTo1D(x+1,y,z)] + current_r[indexTo1D(x-1,y,z)] + 69 | current_r[indexTo1D(x,y+1,z)] + current_r[indexTo1D(x,y-1,z)] + 70 | current_r[indexTo1D(x,y,z+1)] + current_r[indexTo1D(x,y,z-1)]) + 71 | a[2] * (current_r[indexTo1D(x+2,y,z)] + current_r[indexTo1D(x-2,y,z)] + 72 | current_r[indexTo1D(x,y+2,z)] + current_r[indexTo1D(x,y-2,z)] + 73 | current_r[indexTo1D(x,y,z+2)] + current_r[indexTo1D(x,y,z-2)]) + 74 | a[3] * (current_r[indexTo1D(x+3,y,z)] + current_r[indexTo1D(x-3,y,z)] + 75 | current_r[indexTo1D(x,y+3,z)] + current_r[indexTo1D(x,y-3,z)] + 76 | current_r[indexTo1D(x,y,z+3)] + current_r[indexTo1D(x,y,z-3)]) + 77 | a[4] * (current_r[indexTo1D(x+4,y,z)] + current_r[indexTo1D(x-4,y,z)] + 78 | current_r[indexTo1D(x,y+4,z)] + current_r[indexTo1D(x,y-4,z)] + 79 | current_r[indexTo1D(x,y,z+4)] + current_r[indexTo1D(x,y,z-4)]); 80 | 81 | next_r[indexTo1D(x,y,z)] = 2 * current_r[indexTo1D(x,y,z)] 82 | - next_r[indexTo1D(x,y,z)] + vsq[indexTo1D(x,y,z)] * div; 83 | 84 | image[indexTo1D(x,y,z)] = next_s[indexTo1D(x,y,z)] * next_r[indexTo1D(x,y,z)]; 85 | } 86 | } 87 | 88 | // Code to check CUDA errors 89 | void check_cuda_error(void) 90 | { 91 | cudaError_t err = cudaGetLastError(); 92 | if (err != cudaSuccess) 93 | { 94 | std::cerr 95 | << "Error: " 96 | << cudaGetErrorString(err) 97 | << std::endl; 98 | exit(err); 99 | } 100 | } 101 | 102 | int main(){ 103 | const int ArraySize = nx + nx*ny + nx*ny*nz; 104 | 105 | float* next_s = (float*)malloc(ArraySize * sizeof(float)); 106 | float* current_s = (float*)malloc(ArraySize * sizeof(float)); 107 | float* next_r = 
(float*)malloc(ArraySize * sizeof(float)); 108 | float* current_r = (float*)malloc(ArraySize * sizeof(float)); 109 | float* vsq = (float*)malloc(ArraySize * sizeof(float)); 110 | float* image = (float*)malloc(ArraySize * sizeof(float)); 111 | 112 | float a[5]; 113 | 114 | double pts, t0, t1, dt, flops, pt_rate, flop_rate, speedup, memory; 115 | 116 | memory = nx*ny*nz*4*6; 117 | pts = nt; 118 | pts = pts*(nx-8)*(ny-8)*(nz-8); 119 | flops = 67*pts; 120 | printf("memory (MB) = %f\n", memory/1e6); 121 | printf("pts (billions) = %f\n", pts/1e9); 122 | printf("Tflops = %f\n", flops/1e12); 123 | 124 | // Initialization of matrix 125 | a[0] = -1./560.; 126 | a[1] = 8./315; 127 | a[2] = -0.2; 128 | a[3] = 1.6; 129 | a[4] = -1435./504.; 130 | 131 | for (int z = 0; z < nz; z++) { 132 | for (int y = 0; y < ny; y++) { 133 | for (int x = 0; x < nx; x++) { 134 | vsq[indexTo1D(x,y,z)] = 1.0; 135 | next_s[indexTo1D(x,y,z)] = 0; 136 | current_s[indexTo1D(x,y,z)] = 0; 137 | next_r[indexTo1D(x,y,z)] = 0; 138 | current_r[indexTo1D(x,y,z)] = 0; 139 | image[indexTo1D(x,y,z)] = 0; 140 | } 141 | } 142 | } 143 | 144 | t0 = mysecond(); 145 | //allocate and copy matrix to device 146 | float* vsq_d; 147 | float* next_s_d; 148 | float* current_s_d; 149 | float* next_r_d; 150 | float* current_r_d; 151 | float* image_d; 152 | float* a_d; 153 | 154 | cudaMalloc(&vsq_d, ArraySize * sizeof(float)); 155 | cudaMalloc(&next_s_d, ArraySize * sizeof(float)); 156 | cudaMalloc(¤t_s_d, ArraySize * sizeof(float)); 157 | cudaMalloc(&next_r_d, ArraySize * sizeof(float)); 158 | cudaMalloc(¤t_r_d, ArraySize * sizeof(float)); 159 | cudaMalloc(&image_d, ArraySize * sizeof(float)); 160 | cudaMalloc(&a_d, 5 * sizeof(float)); 161 | check_cuda_error(); 162 | cudaMemcpy(vsq_d, vsq, ArraySize * sizeof(float), cudaMemcpyHostToDevice); 163 | cudaMemcpy(next_s_d, next_s, ArraySize * sizeof(float), cudaMemcpyHostToDevice); 164 | cudaMemcpy(current_s_d, current_s, ArraySize * sizeof(float), cudaMemcpyHostToDevice); 165 | 
cudaMemcpy(next_r_d, next_r, ArraySize * sizeof(float), cudaMemcpyHostToDevice); 166 | cudaMemcpy(current_r_d, current_r, ArraySize * sizeof(float), cudaMemcpyHostToDevice); 167 | cudaMemcpy(image_d, image, ArraySize * sizeof(float), cudaMemcpyHostToDevice); 168 | cudaMemcpy(a_d, a, 5 * sizeof(float), cudaMemcpyHostToDevice); 169 | check_cuda_error(); 170 | // Make sure the copies are finished 171 | cudaDeviceSynchronize(); 172 | check_cuda_error(); 173 | 174 | int gridSize = 256*256; 175 | int groupSize = 256; 176 | 177 | 178 | for (int t = 0; t < nt; t++) { 179 | //Launch the HIP kernel 180 | rtm8<<>>((float*)vsq_d, (float*)current_s_d, (float*)next_s_d, (float*)current_r_d,(float*)next_r_d, (float*)image_d, (float*)a_d, ArraySize); 181 | } 182 | //copy back image value 183 | cudaMemcpy(image, image_d,ArraySize * sizeof(float), cudaMemcpyDeviceToHost); 184 | cudaDeviceSynchronize(); 185 | t1 = mysecond(); 186 | 187 | dt = t1 - t0; 188 | pt_rate = pts/dt; 189 | flop_rate = flops/dt; 190 | speedup = 2*pow(10, 9)/3/pt_rate; 191 | printf("dt = %f\n", dt); 192 | printf("pt_rate (millions/sec) = %f\n", pt_rate/1e6); 193 | printf("flop_rate (Gflops) = %f\n", flop_rate/1e9); 194 | printf("speedup = %f\n", speedup); 195 | 196 | //release arrays 197 | free(vsq); 198 | free(next_s); 199 | free(current_s); 200 | free(next_r); 201 | free(current_r); 202 | free(image); 203 | return 0; 204 | } 205 | 206 | -------------------------------------------------------------------------------- /rtm8/rtm8.f: -------------------------------------------------------------------------------- 1 | program rtm8 2 | implicit none 3 | integer n, nt, nx, ny, nz 4 | c parameter( nt=100, nx=400, ny=400, nz=300 ) 5 | c parameter( nt=100, nx=400, ny=100, nz=300 ) 6 | parameter( nt=30, nx=680, ny=134, nz=450 ) 7 | real next_s(nx,ny,nz), current_s(nx,ny,nz) 8 | real next_r(nx,ny,nz), current_r(nx,ny,nz) 9 | real vsq(nx,ny,nz), image(nx,ny,nz) 10 | real a(5) 11 | external mysecond 12 | real*8 mysecond 13 
| c 14 | integer t, x, y, z 15 | real*8 pts, t0, t1, dt, flops, pt_rate, flop_rate, speedup, memory 16 | real div 17 | c 18 | memory = nx*ny*nz*4*6 19 | pts = nt 20 | pts = pts*(nx-8)*(ny-8)*(nz-8) 21 | flops = 67.*pts 22 | print *, 'memory (MB) = ', memory/1e6 23 | print *, 'pts (billions) = ', pts/1e9 24 | print *, 'Tflops = ', flops/1e12 25 | c 26 | a(1) = -1./560. 27 | a(2) = 8./315 28 | a(3) = -0.2 29 | a(4) = 1.6 30 | a(5) = -1435./504. 31 | c 32 | !$omp parallel 33 | !$omp do 34 | do z = 1, nz 35 | do y = 1, ny 36 | do x = 1, nx 37 | vsq(x,y,z) = 1 38 | next_s(x,y,z) = 0 39 | current_s(x,y,z) = 0 40 | next_r(x,y,z) = 0 41 | current_r(x,y,z) = 0 42 | image(x,y,z) = 0 43 | enddo 44 | enddo 45 | enddo 46 | !$omp enddo 47 | !$omp end parallel 48 | c 49 | t0 = mysecond() 50 | do t = 1, nt 51 | do z = 5, nz-4 52 | do y = 5, ny-4 53 | do x = 5, nx-4 54 | div = 55 | & a(1)* current_s(x,y,z) + 56 | & a(2)*( current_s(x+1,y,z) + current_s(x-1,y,z) + 57 | & current_s(x,y+1,z) + current_s(x,y-1,z) + 58 | & current_s(x,y,z+1) + current_s(x,y,z-1) ) + 59 | & a(3)*( current_s(x+2,y,z) + current_s(x-2,y,z) + 60 | & current_s(x,y+2,z) + current_s(x,y-2,z) + 61 | & current_s(x,y,z+2) + current_s(x,y,z-2) ) + 62 | & a(4)*( current_s(x+3,y,z) + current_s(x-3,y,z) + 63 | & current_s(x,y+3,z) + current_s(x,y-3,z) + 64 | & current_s(x,y,z+3) + current_s(x,y,z-3) ) + 65 | & a(5)*( current_s(x+4,y,z) + current_s(x-4,y,z) + 66 | & current_s(x,y+4,z) + current_s(x,y-4,z) + 67 | & current_s(x,y,z+4) + current_s(x,y,z-4) ) 68 | next_s(x,y,z) = 2.*current_s(x,y,z) 69 | & - next_s(x,y,z) + vsq(x,y,z)* div 70 | div = 71 | & a(1)* current_r(x,y,z) + 72 | & a(2)*( current_r(x+1,y,z) + current_r(x-1,y,z) + 73 | & current_r(x,y+1,z) + current_r(x,y-1,z) + 74 | & current_r(x,y,z+1) + current_r(x,y,z-1) ) + 75 | & a(3)*( current_r(x+2,y,z) + current_r(x-2,y,z) + 76 | & current_r(x,y+2,z) + current_r(x,y-2,z) + 77 | & current_r(x,y,z+2) + current_r(x,y,z-2) ) + 78 | & a(4)*( current_r(x+3,y,z) + 
current_r(x-3,y,z) + 79 | & current_r(x,y+3,z) + current_r(x,y-3,z) + 80 | & current_r(x,y,z+3) + current_r(x,y,z-3) ) + 81 | & a(5)*( current_r(x+4,y,z) + current_r(x-4,y,z) + 82 | & current_r(x,y+4,z) + current_r(x,y-4,z) + 83 | & current_r(x,y,z+4) + current_r(x,y,z-4) ) 84 | next_r(x,y,z) = 2.*current_r(x,y,z) 85 | & - next_r(x,y,z) + vsq(x,y,z)* div 86 | image(x,y,z) = next_s(x,y,z) * next_r(x,y,z) 87 | enddo 88 | enddo 89 | enddo 90 | enddo 91 | t1 = mysecond() 92 | c 93 | dt = t1 - t0 94 | pt_rate = pts/dt 95 | flop_rate = flops/dt 96 | speedup = 2.*10**9/3./pt_rate 97 | print *, 'dt = ', dt 98 | print *, 'pt_rate (millions/sec) = ', pt_rate/1e6 99 | print *, 'flop_rate (Gflops) = ', flop_rate/1e9 100 | print *, 'speedup = ', speedup 101 | c 102 | stop 103 | end 104 | -------------------------------------------------------------------------------- /strided-access/CL/cl_d3d10.h: -------------------------------------------------------------------------------- 1 | /********************************************************************************** 2 | * Copyright (c) 2008-2010 The Khronos Group Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 22 | **********************************************************************************/ 23 | 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ 25 | 26 | #ifndef __OPENCL_CL_D3D10_H 27 | #define __OPENCL_CL_D3D10_H 28 | 29 | #include 30 | #include 31 | #include 32 | 33 | #ifdef __cplusplus 34 | extern "C" { 35 | #endif 36 | 37 | /****************************************************************************** 38 | * cl_khr_d3d10_sharing */ 39 | #define cl_khr_d3d10_sharing 1 40 | 41 | typedef cl_uint cl_d3d10_device_source_khr; 42 | typedef cl_uint cl_d3d10_device_set_khr; 43 | 44 | /******************************************************************************/ 45 | 46 | // Error Codes 47 | #define CL_INVALID_D3D10_DEVICE_KHR -1002 48 | #define CL_INVALID_D3D10_RESOURCE_KHR -1003 49 | #define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 50 | #define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 51 | 52 | // cl_d3d10_device_source_nv 53 | #define CL_D3D10_DEVICE_KHR 0x4010 54 | #define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 55 | 56 | // cl_d3d10_device_set_nv 57 | #define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 58 | #define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 59 | 60 | // cl_context_info 61 | #define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 62 | #define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C 63 | 64 | // cl_mem_info 65 | #define CL_MEM_D3D10_RESOURCE_KHR 0x4015 66 | 67 | // cl_image_info 68 | #define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 69 | 70 | // cl_command_type 71 | #define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 72 | #define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 73 | 74 | 
/******************************************************************************/ 75 | 76 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( 77 | cl_platform_id platform, 78 | cl_d3d10_device_source_khr d3d_device_source, 79 | void * d3d_object, 80 | cl_d3d10_device_set_khr d3d_device_set, 81 | cl_uint num_entries, 82 | cl_device_id * devices, 83 | cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; 84 | 85 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( 86 | cl_context context, 87 | cl_mem_flags flags, 88 | ID3D10Buffer * resource, 89 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 90 | 91 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( 92 | cl_context context, 93 | cl_mem_flags flags, 94 | ID3D10Texture2D * resource, 95 | UINT subresource, 96 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 97 | 98 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( 99 | cl_context context, 100 | cl_mem_flags flags, 101 | ID3D10Texture3D * resource, 102 | UINT subresource, 103 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 104 | 105 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( 106 | cl_command_queue command_queue, 107 | cl_uint num_objects, 108 | const cl_mem * mem_objects, 109 | cl_uint num_events_in_wait_list, 110 | const cl_event * event_wait_list, 111 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 112 | 113 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( 114 | cl_command_queue command_queue, 115 | cl_uint num_objects, 116 | cl_mem * mem_objects, 117 | cl_uint num_events_in_wait_list, 118 | const cl_event * event_wait_list, 119 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 120 | 121 | #ifdef __cplusplus 122 | } 123 | #endif 124 | 125 | #endif // __OPENCL_CL_D3D10_H 126 | 127 | -------------------------------------------------------------------------------- /strided-access/CL/cl_gl.h: 
-------------------------------------------------------------------------------- 1 | /********************************************************************************** 2 | * Copyright (c) 2008-2010 The Khronos Group Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 22 | **********************************************************************************/ 23 | 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ 25 | 26 | /* 27 | * cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have 28 | * OpenGL dependencies. The application is responsible for #including 29 | * OpenGL or OpenGL ES headers before #including cl_gl.h. 
30 | */ 31 | 32 | #ifndef __OPENCL_CL_GL_H 33 | #define __OPENCL_CL_GL_H 34 | 35 | #ifdef __APPLE__ 36 | #include 37 | #include 38 | #else 39 | #include 40 | #endif 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | typedef cl_uint cl_gl_object_type; 47 | typedef cl_uint cl_gl_texture_info; 48 | typedef cl_uint cl_gl_platform_info; 49 | typedef struct __GLsync *cl_GLsync; 50 | 51 | /* cl_gl_object_type */ 52 | #define CL_GL_OBJECT_BUFFER 0x2000 53 | #define CL_GL_OBJECT_TEXTURE2D 0x2001 54 | #define CL_GL_OBJECT_TEXTURE3D 0x2002 55 | #define CL_GL_OBJECT_RENDERBUFFER 0x2003 56 | 57 | /* cl_gl_texture_info */ 58 | #define CL_GL_TEXTURE_TARGET 0x2004 59 | #define CL_GL_MIPMAP_LEVEL 0x2005 60 | 61 | extern CL_API_ENTRY cl_mem CL_API_CALL 62 | clCreateFromGLBuffer(cl_context /* context */, 63 | cl_mem_flags /* flags */, 64 | cl_GLuint /* bufobj */, 65 | int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; 66 | 67 | extern CL_API_ENTRY cl_mem CL_API_CALL 68 | clCreateFromGLTexture2D(cl_context /* context */, 69 | cl_mem_flags /* flags */, 70 | cl_GLenum /* target */, 71 | cl_GLint /* miplevel */, 72 | cl_GLuint /* texture */, 73 | cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; 74 | 75 | extern CL_API_ENTRY cl_mem CL_API_CALL 76 | clCreateFromGLTexture3D(cl_context /* context */, 77 | cl_mem_flags /* flags */, 78 | cl_GLenum /* target */, 79 | cl_GLint /* miplevel */, 80 | cl_GLuint /* texture */, 81 | cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; 82 | 83 | extern CL_API_ENTRY cl_mem CL_API_CALL 84 | clCreateFromGLRenderbuffer(cl_context /* context */, 85 | cl_mem_flags /* flags */, 86 | cl_GLuint /* renderbuffer */, 87 | cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; 88 | 89 | extern CL_API_ENTRY cl_int CL_API_CALL 90 | clGetGLObjectInfo(cl_mem /* memobj */, 91 | cl_gl_object_type * /* gl_object_type */, 92 | cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; 93 | 94 | extern CL_API_ENTRY cl_int CL_API_CALL 95 | 
clGetGLTextureInfo(cl_mem /* memobj */, 96 | cl_gl_texture_info /* param_name */, 97 | size_t /* param_value_size */, 98 | void * /* param_value */, 99 | size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; 100 | 101 | extern CL_API_ENTRY cl_int CL_API_CALL 102 | clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, 103 | cl_uint /* num_objects */, 104 | const cl_mem * /* mem_objects */, 105 | cl_uint /* num_events_in_wait_list */, 106 | const cl_event * /* event_wait_list */, 107 | cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; 108 | 109 | extern CL_API_ENTRY cl_int CL_API_CALL 110 | clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, 111 | cl_uint /* num_objects */, 112 | const cl_mem * /* mem_objects */, 113 | cl_uint /* num_events_in_wait_list */, 114 | const cl_event * /* event_wait_list */, 115 | cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; 116 | 117 | /* cl_khr_gl_sharing extension */ 118 | 119 | #define cl_khr_gl_sharing 1 120 | 121 | typedef cl_uint cl_gl_context_info; 122 | 123 | /* Additional Error Codes */ 124 | #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 125 | 126 | /* cl_gl_context_info */ 127 | #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 128 | #define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 129 | 130 | /* Additional cl_context_properties */ 131 | #define CL_GL_CONTEXT_KHR 0x2008 132 | #define CL_EGL_DISPLAY_KHR 0x2009 133 | #define CL_GLX_DISPLAY_KHR 0x200A 134 | #define CL_WGL_HDC_KHR 0x200B 135 | #define CL_CGL_SHAREGROUP_KHR 0x200C 136 | 137 | extern CL_API_ENTRY cl_int CL_API_CALL 138 | clGetGLContextInfoKHR(const cl_context_properties * /* properties */, 139 | cl_gl_context_info /* param_name */, 140 | size_t /* param_value_size */, 141 | void * /* param_value */, 142 | size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; 143 | 144 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( 145 | const cl_context_properties * properties, 146 | cl_gl_context_info 
param_name, 147 | size_t param_value_size, 148 | void * param_value, 149 | size_t * param_value_size_ret); 150 | 151 | #ifdef __cplusplus 152 | } 153 | #endif 154 | 155 | #endif /* __OPENCL_CL_GL_H */ 156 | -------------------------------------------------------------------------------- /strided-access/CL/cl_gl_ext.h: -------------------------------------------------------------------------------- 1 | /********************************************************************************** 2 | * Copyright (c) 2008-2010 The Khronos Group Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 22 | **********************************************************************************/ 23 | 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ 25 | 26 | /* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ 27 | /* OpenGL dependencies. 
*/ 28 | 29 | #ifndef __OPENCL_CL_GL_EXT_H 30 | #define __OPENCL_CL_GL_EXT_H 31 | 32 | #ifdef __cplusplus 33 | extern "C" { 34 | #endif 35 | 36 | #ifdef __APPLE__ 37 | #include 38 | #else 39 | #include 40 | #endif 41 | 42 | /* 43 | * For each extension, follow this template 44 | * /* cl_VEN_extname extension */ 45 | /* #define cl_VEN_extname 1 46 | * ... define new types, if any 47 | * ... define new tokens, if any 48 | * ... define new APIs, if any 49 | * 50 | * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header 51 | * This allows us to avoid having to decide whether to include GL headers or GLES here. 52 | */ 53 | 54 | /* 55 | * cl_khr_gl_event extension 56 | * See section 9.9 in the OpenCL 1.1 spec for more information 57 | */ 58 | #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D 59 | 60 | extern CL_API_ENTRY cl_event CL_API_CALL 61 | clCreateEventFromGLsyncKHR(cl_context /* context */, 62 | cl_GLsync /* cl_GLsync */, 63 | cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; 64 | 65 | #ifdef __cplusplus 66 | } 67 | #endif 68 | 69 | #endif /* __OPENCL_CL_GL_EXT_H */ 70 | -------------------------------------------------------------------------------- /strided-access/CL/opencl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2010 The Khronos Group Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
22 | ******************************************************************************/ 23 | 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ 25 | 26 | #ifndef __OPENCL_H 27 | #define __OPENCL_H 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #ifdef __APPLE__ 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #else 41 | 42 | #include 43 | #include 44 | #include 45 | #include 46 | 47 | #endif 48 | 49 | #ifdef __cplusplus 50 | } 51 | #endif 52 | 53 | #endif /* __OPENCL_H */ 54 | 55 | -------------------------------------------------------------------------------- /strided-access/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Karl Rupp 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /strided-access/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | CXXFLAGS += -std=c++11 -O3 5 | 6 | strided-access: benchmark-hip.cpp 7 | ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0) 8 | ${HIPCC} ${CXXFLAGS} -o $@ $^ 9 | else 10 | $(error "Cannot find $(HIPCC), please install HIP toolkit") 11 | endif 12 | 13 | .PHONY: clean 14 | 15 | clean: 16 | rm -f strided-access *.o 17 | -------------------------------------------------------------------------------- /strided-access/README.txt: -------------------------------------------------------------------------------- 1 | ### 2 | ### README for measuring effective memory bandwidth for strided array access 3 | ### by Karl Rupp 4 | ### 5 | ### Supplements blog post: 6 | ### https://www.karlrupp.net/2016/02/strided-memory-access-on-cpus-gpus-and-mic 7 | ### 8 | 9 | # License 10 | 11 | The code is provided under a permissive MIT/X11-style license. 12 | See file LICENSE.txt for details. 13 | 14 | The results and plotting scripts in folder results/ are provided under the 15 | Creative Commons Attribution 4.0 International (CC BY 4.0) 16 | license, see results/LICENSE.txt 17 | 18 | 19 | # Build 20 | 21 | To build the executable, use (or adjust) one of the following commands to your environment: 22 | 23 | HIP: 24 | $> /opt/rocm/hip/bin/hipcc -std=c++11 -O3 -o hip benchmark-hip.cpp 25 | 26 | CUDA: 27 | $> nvcc benchmark-cuda.cu -arch=sm_20 -I$VIENNACLPATH 28 | 29 | OpenCL: 30 | $> g++ benchmark-opencl.cpp -I. -lOpenCL -L/usr/local/cuda/lib64/ 31 | (If OpenCL is available system-wide, you may be able to omit the -L flag) 32 | 33 | OpenMP: 34 | $> g++ benchmark-openmp.cpp benchmark-openmp2.cpp -I. 
-O3 -fopenmp 35 | for CPUs or 36 | $> icc benchmark-openmp.cpp benchmark-openmp2.cpp -O3 -fopenmp -mmic 37 | for Xeon Phi 38 | 39 | 40 | # Run 41 | 42 | To run the respective benchmark, issue 43 | $> ./a.out 44 | 45 | 46 | # Plot 47 | 48 | Have a look at the results/ folder, where the data and gnuplot commands are located. 49 | Replot via 50 | $> gnuplot plot.gnuplot 51 | (produces strided-access.eps) 52 | 53 | Convert to .pdf via 54 | $> epstopdf strided-access.eps 55 | and to .png using ImageMagick, e.g. 56 | $> convert -density 300 strided-access.eps -resize 1150x strided-access.png 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /strided-access/benchmark-cuda.cu: -------------------------------------------------------------------------------- 1 | // 2 | // CUDA benchmark for measuring effective memory bandwidth for strided array access 3 | // 4 | // Author: Karl Rupp, me@karlrupp.net 5 | // License: MIT/X11 license, see file LICENSE.txt 6 | // 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | #include "benchmark-utils.hpp" 17 | 18 | 19 | inline void cuda_last_error_check() 20 | { 21 | cudaError_t error_code = cudaGetLastError(); 22 | 23 | if (cudaSuccess != error_code) 24 | { 25 | std::stringstream ss; 26 | ss << "CUDA Runtime API error " << error_code << ": " << cudaGetErrorString( error_code ) << std::endl; 27 | throw std::runtime_error(ss.str()); 28 | } 29 | } 30 | 31 | 32 | // Kernel for the benchmark 33 | template 34 | __global__ void elementwise_add(const NumericT * x, 35 | const NumericT * y, 36 | NumericT * z, 37 | unsigned int stride, 38 | unsigned int size) 39 | { 40 | for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 41 | i < size; 42 | i += gridDim.x * blockDim.x) 43 | z[i*stride] = x[i*stride] + y[i*stride]; 44 | } 45 | 46 | 47 | int main(int argc, char **argv) 48 | { 49 | typedef float NumericT; 50 | 51 | cudaDeviceProp prop; 52 | 
cudaError_t err = cudaGetDeviceProperties(&prop, 0); if (err != cudaSuccess) throw std::runtime_error("Failed to get CUDA device name"); 53 | std::cout << "# Using device: " << prop.name << std::endl; 54 | 55 | // Set up work vectors 56 | std::size_t N = 1000000; 57 | 58 | std::vector host_x(32*N); 59 | NumericT *x, *y, *z; 60 | 61 | err = cudaMalloc(&x, sizeof(NumericT) * 32 * N); if (err != cudaSuccess) throw std::runtime_error("Failed to allocate CUDA memory for x"); 62 | err = cudaMalloc(&y, sizeof(NumericT) * 32 * N); if (err != cudaSuccess) throw std::runtime_error("Failed to allocate CUDA memory for y"); 63 | err = cudaMalloc(&z, sizeof(NumericT) * 32 * N); if (err != cudaSuccess) throw std::runtime_error("Failed to allocate CUDA memory for z"); 64 | 65 | 66 | // Warmup calculation: 67 | elementwise_add<<<256, 256>>>(x, y, z, 68 | static_cast(1), 69 | static_cast(N)); 70 | cuda_last_error_check(); 71 | 72 | // Benchmark runs 73 | Timer timer; 74 | std::cout << "# stride time GB/sec" << std::endl; 75 | for (std::size_t stride = 1; stride <= 32 ; ++stride) 76 | { 77 | cudaDeviceSynchronize(); 78 | timer.start(); 79 | 80 | // repeat calculation several times, then average 81 | for (std::size_t num_runs = 0; num_runs < 20; ++num_runs) 82 | { 83 | elementwise_add<<<256, 256>>>(x, y, z, 84 | static_cast(stride), 85 | static_cast(N)); 86 | cuda_last_error_check(); 87 | } 88 | cudaDeviceSynchronize(); 89 | double exec_time = timer.get(); 90 | 91 | std::cout << " " << stride << " " << exec_time << " " << 20 * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl; 92 | } 93 | 94 | return EXIT_SUCCESS; 95 | } 96 | 97 | -------------------------------------------------------------------------------- /strided-access/benchmark-hip.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // HIP benchmark for measuring effective memory bandwidth for strided array access 3 | // 4 | // Author: Karl Rupp, me@karlrupp.net 5 | // 
License: MIT/X11 license, see file LICENSE.txt 6 | // 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "hip/hip_runtime.h" 15 | 16 | #include "benchmark-utils.hpp" 17 | 18 | 19 | inline void cuda_last_error_check() 20 | { 21 | hipError_t error_code = hipGetLastError(); 22 | 23 | if (hipSuccess != error_code) 24 | { 25 | std::stringstream ss; 26 | ss << "CUDA Runtime API error " << error_code << ": " << hipGetErrorString( error_code ) << std::endl; 27 | throw std::runtime_error(ss.str()); 28 | } 29 | } 30 | 31 | 32 | // Kernel for the benchmark 33 | template 34 | __global__ void elementwise_add(const NumericT * x, 35 | const NumericT * y, 36 | NumericT * z, 37 | unsigned int stride, 38 | unsigned int size) 39 | { 40 | for (unsigned int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; 41 | i < size; 42 | i += hipGridDim_x * hipBlockDim_x) 43 | z[i*stride] = x[i*stride] + y[i*stride]; 44 | } 45 | 46 | 47 | int main(int argc, char **argv) 48 | { 49 | typedef float NumericT; 50 | 51 | hipDeviceProp_t prop; 52 | hipError_t err = hipGetDeviceProperties(&prop, 0); if (err != hipSuccess) throw std::runtime_error("Failed to get CUDA device name"); 53 | std::cout << "# Using device: " << prop.name << std::endl; 54 | 55 | // Set up work vectors 56 | std::size_t N = 1000000; 57 | 58 | std::vector host_x(32*N); 59 | NumericT *x, *y, *z; 60 | 61 | err = hipMalloc(&x, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for x"); 62 | err = hipMalloc(&y, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for y"); 63 | err = hipMalloc(&z, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for z"); 64 | 65 | 66 | // Warmup calculation: 67 | hipLaunchKernelGGL(elementwise_add, dim3(256), dim3(256), 0, 0, x, y, z, 68 | static_cast(1), 69 | static_cast(N)); 70 | 
cuda_last_error_check(); 71 | 72 | // Benchmark runs 73 | Timer timer; 74 | std::cout << "# stride time GB/sec" << std::endl; 75 | for (std::size_t stride = 1; stride <= 32 ; ++stride) 76 | { 77 | hipDeviceSynchronize(); 78 | timer.start(); 79 | 80 | // repeat calculation several times, then average 81 | for (std::size_t num_runs = 0; num_runs < 20; ++num_runs) 82 | { 83 | hipLaunchKernelGGL(elementwise_add, dim3(256), dim3(256), 0, 0, x, y, z, 84 | static_cast(stride), 85 | static_cast(N)); 86 | cuda_last_error_check(); 87 | } 88 | hipDeviceSynchronize(); 89 | double exec_time = timer.get(); 90 | 91 | std::cout << " " << stride << " " << exec_time << " " << 20 * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl; 92 | } 93 | 94 | return EXIT_SUCCESS; 95 | } 96 | 97 | -------------------------------------------------------------------------------- /strided-access/benchmark-hip.cu: -------------------------------------------------------------------------------- 1 | // 2 | // CUDA benchmark for measuring effective memory bandwidth for strided array access 3 | // 4 | // Author: Karl Rupp, me@karlrupp.net 5 | // License: MIT/X11 license, see file LICENSE.txt 6 | // 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "hip/hip_runtime.h" 15 | 16 | #include "benchmark-utils.hpp" 17 | 18 | 19 | inline void cuda_last_error_check() 20 | { 21 | hipError_t error_code = hipGetLastError(); 22 | 23 | if (hipSuccess != error_code) 24 | { 25 | std::stringstream ss; 26 | ss << "CUDA Runtime API error " << error_code << ": " << hipGetErrorString( error_code ) << std::endl; 27 | throw std::runtime_error(ss.str()); 28 | } 29 | } 30 | 31 | 32 | // Kernel for the benchmark 33 | template 34 | __global__ void elementwise_add(hipLaunchParm lp, 35 | const NumericT * x, 36 | const NumericT * y, 37 | NumericT * z, 38 | unsigned int stride, 39 | unsigned int size) 40 | { 41 | for (unsigned int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; 42 
| i < size; 43 | i += hipGridDim_x * hipBlockDim_x) 44 | z[i*stride] = x[i*stride] + y[i*stride]; 45 | } 46 | 47 | 48 | int main(int argc, char **argv) 49 | { 50 | typedef float NumericT; 51 | 52 | hipDeviceProp_t prop; 53 | hipError_t err = hipGetDeviceProperties(&prop, 0); if (err != hipSuccess) throw std::runtime_error("Failed to get CUDA device name"); 54 | std::cout << "# Using device: " << prop.name << std::endl; 55 | 56 | // Set up work vectors 57 | std::size_t N = 1000000; 58 | 59 | std::vector host_x(32*N); 60 | NumericT *x, *y, *z; 61 | 62 | err = hipMalloc(&x, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for x"); 63 | err = hipMalloc(&y, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for y"); 64 | err = hipMalloc(&z, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for z"); 65 | 66 | 67 | // Warmup calculation: 68 | hipLaunchKernel(elementwise_add, dim3(256), dim3(256), 0, 0, x, y, z, 69 | static_cast(1), 70 | static_cast(N)); 71 | cuda_last_error_check(); 72 | 73 | // Benchmark runs 74 | Timer timer; 75 | std::cout << "# stride time GB/sec" << std::endl; 76 | for (std::size_t stride = 1; stride <= 32 ; ++stride) 77 | { 78 | hipDeviceSynchronize(); 79 | timer.start(); 80 | 81 | // repeat calculation several times, then average 82 | for (std::size_t num_runs = 0; num_runs < 20; ++num_runs) 83 | { 84 | hipLaunchKernel(elementwise_add, dim3(256), dim3(256), 0, 0, x, y, z, 85 | static_cast(stride), 86 | static_cast(N)); 87 | cuda_last_error_check(); 88 | } 89 | hipDeviceSynchronize(); 90 | double exec_time = timer.get(); 91 | 92 | std::cout << " " << stride << " " << exec_time << " " << 20 * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl; 93 | } 94 | 95 | return EXIT_SUCCESS; 96 | } 97 | 98 | 
-------------------------------------------------------------------------------- /strided-access/benchmark-opencl.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // OpenCL benchmark for measuring effective memory bandwidth for strided array access 3 | // 4 | // Author: Karl Rupp, me@karlrupp.net 5 | // License: MIT/X11 license, see file LICENSE.txt 6 | // 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "benchmark-utils.hpp" 14 | 15 | #ifdef __APPLE__ 16 | #include 17 | #else 18 | #include 19 | #endif 20 | 21 | // OpenCL error checking 22 | #define ERROR_CHECKER_CASE(ERRORCODE) case ERRORCODE: throw std::runtime_error("#ERRORCODE"); 23 | static void checkError(cl_int err) 24 | { 25 | if (err != CL_SUCCESS) 26 | { 27 | switch (err) 28 | { 29 | ERROR_CHECKER_CASE(CL_DEVICE_NOT_FOUND); 30 | ERROR_CHECKER_CASE(CL_DEVICE_NOT_AVAILABLE); 31 | ERROR_CHECKER_CASE(CL_COMPILER_NOT_AVAILABLE); 32 | ERROR_CHECKER_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE); 33 | ERROR_CHECKER_CASE(CL_OUT_OF_RESOURCES); 34 | ERROR_CHECKER_CASE(CL_OUT_OF_HOST_MEMORY); 35 | ERROR_CHECKER_CASE(CL_PROFILING_INFO_NOT_AVAILABLE); 36 | ERROR_CHECKER_CASE(CL_MEM_COPY_OVERLAP); 37 | ERROR_CHECKER_CASE(CL_IMAGE_FORMAT_MISMATCH); 38 | ERROR_CHECKER_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED); 39 | ERROR_CHECKER_CASE(CL_BUILD_PROGRAM_FAILURE); 40 | ERROR_CHECKER_CASE(CL_MAP_FAILURE); 41 | 42 | ERROR_CHECKER_CASE(CL_INVALID_VALUE); 43 | ERROR_CHECKER_CASE(CL_INVALID_DEVICE_TYPE); 44 | ERROR_CHECKER_CASE(CL_INVALID_PLATFORM); 45 | ERROR_CHECKER_CASE(CL_INVALID_DEVICE); 46 | ERROR_CHECKER_CASE(CL_INVALID_CONTEXT); 47 | ERROR_CHECKER_CASE(CL_INVALID_QUEUE_PROPERTIES); 48 | ERROR_CHECKER_CASE(CL_INVALID_COMMAND_QUEUE); 49 | ERROR_CHECKER_CASE(CL_INVALID_HOST_PTR); 50 | ERROR_CHECKER_CASE(CL_INVALID_MEM_OBJECT); 51 | ERROR_CHECKER_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); 52 | ERROR_CHECKER_CASE(CL_INVALID_IMAGE_SIZE); 53 | 
ERROR_CHECKER_CASE(CL_INVALID_SAMPLER); 54 | ERROR_CHECKER_CASE(CL_INVALID_BINARY); 55 | ERROR_CHECKER_CASE(CL_INVALID_BUILD_OPTIONS); 56 | ERROR_CHECKER_CASE(CL_INVALID_PROGRAM); 57 | ERROR_CHECKER_CASE(CL_INVALID_PROGRAM_EXECUTABLE); 58 | ERROR_CHECKER_CASE(CL_INVALID_KERNEL_NAME); 59 | ERROR_CHECKER_CASE(CL_INVALID_KERNEL_DEFINITION); 60 | ERROR_CHECKER_CASE(CL_INVALID_KERNEL); 61 | ERROR_CHECKER_CASE(CL_INVALID_ARG_INDEX); 62 | ERROR_CHECKER_CASE(CL_INVALID_ARG_VALUE); 63 | ERROR_CHECKER_CASE(CL_INVALID_ARG_SIZE); 64 | ERROR_CHECKER_CASE(CL_INVALID_KERNEL_ARGS); 65 | ERROR_CHECKER_CASE(CL_INVALID_WORK_DIMENSION); 66 | ERROR_CHECKER_CASE(CL_INVALID_WORK_GROUP_SIZE); 67 | ERROR_CHECKER_CASE(CL_INVALID_WORK_ITEM_SIZE); 68 | ERROR_CHECKER_CASE(CL_INVALID_GLOBAL_OFFSET); 69 | ERROR_CHECKER_CASE(CL_INVALID_EVENT_WAIT_LIST); 70 | ERROR_CHECKER_CASE(CL_INVALID_EVENT); 71 | ERROR_CHECKER_CASE(CL_INVALID_OPERATION); 72 | ERROR_CHECKER_CASE(CL_INVALID_GL_OBJECT); 73 | ERROR_CHECKER_CASE(CL_INVALID_BUFFER_SIZE); 74 | ERROR_CHECKER_CASE(CL_INVALID_MIP_LEVEL); 75 | ERROR_CHECKER_CASE(CL_INVALID_GLOBAL_WORK_SIZE); 76 | 77 | default: throw std::runtime_error("Unknown error. 
Maybe OpenCL SDK not properly installed?"); 78 | } 79 | } 80 | } 81 | 82 | #define ERR_CHECK(err) checkError(err); 83 | 84 | 85 | 86 | // Kernel for the benchmark 87 | static const char * benchmark_program = 88 | "__kernel void elementwise_add(\n" 89 | " __global const float * x,\n" 90 | " __global const float * y, \n" 91 | " __global float * z,\n" 92 | " unsigned int stride,\n" 93 | " unsigned int size) \n" 94 | "{ \n" 95 | " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 96 | " z[i*stride] = x[i*stride] + y[i*stride];\n" 97 | "};\n"; 98 | 99 | int main(int argc, char **argv) 100 | { 101 | typedef float NumericT; 102 | 103 | /////////////////////////// Part 1: Initialize OpenCL /////////////////////////////////// 104 | 105 | // 106 | // Query platform: 107 | // 108 | cl_uint num_platforms; 109 | cl_platform_id platform_ids[42]; //no more than 42 platforms supported... 110 | cl_int err = clGetPlatformIDs(42, platform_ids, &num_platforms); ERR_CHECK(err); 111 | 112 | std::cout << "# Platforms found: " << num_platforms << std::endl; 113 | for (cl_uint i=0; i 1) 131 | { 132 | std::cout << "# Enter platform index to use: "; 133 | std::cin >> platform_index; 134 | platform_index = std::min(platform_index, num_platforms - 1); 135 | std::cout << "#" << std::endl; 136 | } 137 | 138 | // 139 | // Query devices: 140 | // 141 | cl_device_id device_ids[42]; 142 | cl_uint num_devices; 143 | err = clGetDeviceIDs(platform_ids[platform_index], CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices); ERR_CHECK(err); 144 | std::cout << "# Devices found: " << num_devices << std::endl; 145 | for (cl_uint i=0; i 1) 156 | { 157 | std::cout << "# Enter index of device to use: "; 158 | std::cin >> device_index; 159 | device_index = std::min(device_index, num_devices - 1); 160 | std::cout << "#" << std::endl; 161 | } 162 | 163 | // now set up a context containing the selected device: 164 | cl_context my_context = clCreateContext(0, 1, &(device_ids[device_index]), 
NULL, NULL, &err); ERR_CHECK(err); 165 | 166 | // create a command queue for the device: 167 | cl_command_queue queue = clCreateCommandQueue(my_context, device_ids[device_index], 0, &err); ERR_CHECK(err); 168 | 169 | 170 | cl_program my_program = clCreateProgramWithSource(my_context, 1, &benchmark_program, NULL, &err); ERR_CHECK(err); 171 | err = clBuildProgram(my_program, 0, NULL, NULL, NULL, NULL); 172 | if (err != CL_SUCCESS) 173 | { 174 | char buffer[8192]; 175 | cl_build_status status; 176 | std::cout << "Build Scalar: Err = " << err; 177 | err = clGetProgramBuildInfo(my_program, device_ids[device_index], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); ERR_CHECK(err); 178 | err = clGetProgramBuildInfo(my_program, device_ids[device_index], CL_PROGRAM_BUILD_LOG, sizeof(char)*8192, &buffer, NULL); ERR_CHECK(err); 179 | std::cout << " Status = " << status << std::endl; 180 | std::cout << "Log: " << buffer << std::endl; 181 | std::cout << "Sources: " << benchmark_program << std::endl; 182 | } 183 | cl_kernel my_kernel = clCreateKernel(my_program, "elementwise_add", &err); ERR_CHECK(err); 184 | 185 | /////////////////////////// Part 2: Run benchmark /////////////////////////////////// 186 | 187 | 188 | // Set up work vectors 189 | cl_uint N = 1000000; 190 | std::vector host_x(32*N); 191 | cl_mem x = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 32*N*sizeof(NumericT), &(host_x[0]), &err); ERR_CHECK(err); 192 | cl_mem y = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 32*N*sizeof(NumericT), &(host_x[0]), &err); ERR_CHECK(err); 193 | cl_mem z = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 32*N*sizeof(NumericT), &(host_x[0]), &err); ERR_CHECK(err); 194 | 195 | // Warmup calculation: 196 | size_t localsize = 256; 197 | size_t globalsize = 256 * localsize; 198 | cl_uint stride = 1; 199 | err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), &x); ERR_CHECK(err); 200 | err = 
clSetKernelArg(my_kernel, 1, sizeof(cl_mem), &y); ERR_CHECK(err); 201 | err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), &z); ERR_CHECK(err); 202 | err = clSetKernelArg(my_kernel, 3, sizeof(cl_uint), &stride); ERR_CHECK(err); 203 | err = clSetKernelArg(my_kernel, 4, sizeof(cl_uint), &N); ERR_CHECK(err); 204 | err = clEnqueueNDRangeKernel(queue, my_kernel, 1, NULL, &globalsize, &localsize, 0, NULL, NULL); ERR_CHECK(err); 205 | 206 | // Benchmark runs 207 | Timer timer; 208 | char device_name[1024]; 209 | err = clGetDeviceInfo(device_ids[device_index], CL_DEVICE_NAME, 1024, device_name, NULL); ERR_CHECK(err); 210 | std::cout << "# Using device: " << device_name << std::endl; 211 | std::cout << "# stride time GB/sec" << std::endl; 212 | for (; stride <= 32; ++stride) 213 | { 214 | err = clFinish(queue); ERR_CHECK(err); 215 | err = clSetKernelArg(my_kernel, 3, sizeof(cl_uint), &stride); ERR_CHECK(err); 216 | 217 | // repeat calculation several times, then average 218 | timer.start(); 219 | for (std::size_t num_runs = 0; num_runs < 20; ++num_runs) 220 | { 221 | err = clEnqueueNDRangeKernel(queue, my_kernel, 1, NULL, &globalsize, &localsize, 0, NULL, NULL); ERR_CHECK(err); 222 | } 223 | err = clFinish(queue); ERR_CHECK(err); 224 | double exec_time = timer.get(); 225 | 226 | std::cout << " " << stride << " " << exec_time << " " << 20.0 * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl; 227 | } 228 | 229 | return EXIT_SUCCESS; 230 | } 231 | 232 | -------------------------------------------------------------------------------- /strided-access/benchmark-openmp.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // OpenMP benchmark for measuring effective memory bandwidth for strided array access 3 | // 4 | // Author: Karl Rupp, me@karlrupp.net 5 | // License: MIT/X11 license, see file LICENSE.txt 6 | // 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 
"benchmark-utils.hpp" 16 | 17 | typedef float NumericT; 18 | 19 | int kernel_func(NumericT *x, NumericT const *y, NumericT const *z, int stride, int N); 20 | 21 | int main(int argc, char **argv) 22 | { 23 | 24 | // slightly larger on CPU than on GPU so that arrays don't fit in cache 25 | std::size_t N = 5000000; 26 | 27 | // Note: Run only on a single NUMA domain. std::vector has terrible first-touch semantics 28 | NumericT *x; if (posix_memalign((void**)&x, 64, 32*N*sizeof(NumericT))) throw std::runtime_error("Failed to allocate x"); 29 | NumericT *y; if (posix_memalign((void**)&y, 64, 32*N*sizeof(NumericT))) throw std::runtime_error("Failed to allocate y"); 30 | NumericT *z; if (posix_memalign((void**)&z, 64, 32*N*sizeof(NumericT))) throw std::runtime_error("Failed to allocate z"); 31 | 32 | #pragma omp parallel for 33 | for (std::size_t i=0; i<32*N; ++i) 34 | { 35 | x[i] = 1.0; 36 | y[i] = 2.0; 37 | z[i] = 3.0; 38 | } 39 | 40 | 41 | // warmup: 42 | kernel_func(&x[0], &y[0], &z[0], 1, N); 43 | 44 | 45 | // Benchmark runs 46 | Timer timer; 47 | std::cout << "# stride time GB/sec" << std::endl; 48 | for (std::size_t stride = 1; stride <= 32 ; ++stride) 49 | { 50 | timer.start(); 51 | 52 | // repeat calculation several times, then average 53 | for (std::size_t num_runs = 0; num_runs < 20; ++num_runs) 54 | { 55 | kernel_func(&x[0], &y[0], &z[0], stride, N); 56 | } 57 | double exec_time = timer.get(); 58 | 59 | std::cout << " " << stride << " " << exec_time << " " << 20 * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl; 60 | } 61 | 62 | return EXIT_SUCCESS; 63 | } 64 | 65 | -------------------------------------------------------------------------------- /strided-access/benchmark-openmp2.cpp: -------------------------------------------------------------------------------- 1 | 2 | typedef float NumericT; 3 | 4 | int kernel_func(NumericT *x, NumericT const *y, NumericT const *z, int stride, int N) 5 | { 6 | if (stride == 1) 7 | { 8 | #pragma omp parallel for 9 
#ifndef BENCHMARK_UTILS_HPP
#define BENCHMARK_UTILS_HPP

#include <iostream>

#ifdef _WIN32

// WIN32_LEAN_AND_MEAN is the macro <windows.h> actually honours; the previous
// spelling (WINDOWS_LEAN_AND_MEAN) was silently ignored.
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
// <windows.h> defines min/max as macros, which break std::min/std::max.
#undef min
#undef max

/**
 * Wall-clock stopwatch based on the Windows performance counter.
 *
 * Usage: call start(), run the code to be measured, then call get() to obtain
 * the elapsed time in seconds. get() may be called repeatedly; every call
 * measures from the most recent start().
 */
class Timer
{
public:

  /** Queries the counter frequency once and zeroes the start stamp. */
  Timer()
  {
    QueryPerformanceFrequency(&freq);
    start_time.QuadPart = 0;  // get() before start() is well defined
  }

  /** Records the current counter value as the start of the measurement. */
  void start() { QueryPerformanceCounter(&start_time); }

  /** Returns the seconds elapsed since the last call to start(). */
  double get() const
  {
    LARGE_INTEGER end_time;
    QueryPerformanceCounter(&end_time);
    // Subtract in 64-bit integer ticks first so no precision is lost before
    // the conversion to double.
    return static_cast<double>(end_time.QuadPart - start_time.QuadPart)
         / static_cast<double>(freq.QuadPart);
  }

private:
  LARGE_INTEGER freq;        // counter ticks per second
  LARGE_INTEGER start_time;  // counter value captured by start()
};

#else

#include <sys/time.h>

/**
 * Wall-clock stopwatch based on gettimeofday().
 *
 * Usage: call start(), run the code to be measured, then call get() to obtain
 * the elapsed time in seconds. get() may be called repeatedly; every call
 * measures from the most recent start(). If start() was never called, get()
 * returns the seconds elapsed since the epoch (start stamp is zero).
 */
class Timer
{
public:

  /** Creates a timer whose start stamp is zero. */
  Timer()
  {
    start_time.tv_sec  = 0;
    start_time.tv_usec = 0;
  }

  /** Records the current time of day as the start of the measurement. */
  void start()
  {
    gettimeofday(&start_time, NULL);
  }

  /** Returns the seconds elapsed since the last call to start(). */
  double get() const
  {
    struct timeval now;
    gettimeofday(&now, NULL);

    // Take the differences per field and convert to double *before* scaling.
    // The former "tv_sec * 1000000" was evaluated in time_t arithmetic and
    // overflows on platforms where time_t is a 32-bit integer.
    const double seconds      = static_cast<double>(now.tv_sec)  - static_cast<double>(start_time.tv_sec);
    const double microseconds = static_cast<double>(now.tv_usec) - static_cast<double>(start_time.tv_usec);

    return seconds + microseconds / 1000000.0;
  }

private:
  struct timeval start_time;  // time of day captured by start()
};


#endif

#endif
13 | 14 | Under the following terms: 15 | 16 | Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. 17 | 18 | No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits. 19 | 20 | Notices: 21 | 22 | You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation. 23 | No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material. 24 | 25 | -------------------------------------------------------------------------------- /strided-access/results/k20m.txt: -------------------------------------------------------------------------------- 1 | # Using device: Tesla K20m 2 | # stride time GB/sec 3 | 1 0.00186 129.032 4 | 2 0.005073 47.3093 5 | 3 0.006904 34.7625 6 | 4 0.009935 24.157 7 | 5 0.012335 19.4568 8 | 6 0.013922 17.2389 9 | 7 0.017785 13.4945 10 | 8 0.018577 12.9192 11 | 9 0.019592 12.2499 12 | 10 0.01994 12.0361 13 | 11 0.020641 11.6273 14 | 12 0.02157 11.1266 15 | 13 0.022427 10.7014 16 | 14 0.02317 10.3582 17 | 15 0.024805 9.67547 18 | 16 0.025235 9.5106 19 | 17 0.026862 8.93455 20 | 18 0.028229 8.5019 21 | 19 0.029781 8.05883 22 | 20 0.032167 7.46106 23 | 21 0.032969 7.27957 24 | 22 0.034717 6.91304 25 | 23 0.036391 6.59504 26 | 24 0.037984 6.31845 27 | 25 0.040361 5.94633 28 | 26 0.041379 5.80004 29 | 27 0.043162 5.56045 30 | 28 0.044556 5.38648 31 | 29 0.046384 5.1742 32 | 30 0.048085 4.99116 33 | 31 0.049665 4.83238 34 | 32 0.050811 4.72339 35 | 36 | -------------------------------------------------------------------------------- 
/strided-access/results/plot.gnuplot: -------------------------------------------------------------------------------- 1 | set terminal postscript enhanced color eps 2 | 3 | set style data lines 4 | set style line 1 linetype -1 linewidth 3 lc rgb "#AA0000" 5 | set style line 2 linetype -1 linewidth 3 lc rgb "#0000AA" 6 | set style line 3 linetype -1 linewidth 3 lc rgb "#000000" 7 | set style line 4 linetype -1 linewidth 3 lc rgb "#00AA00" 8 | set style line 5 linetype 2 linewidth 3 lc rgb "#00AA00" 9 | set style line 6 linetype -1 linewidth 3 lc rgb "#00AA00" 10 | set style line 7 linetype 2 linewidth 3 lc rgb "#000000" 11 | set style line 8 linetype -1 linewidth 3 lc rgb "#000000" 12 | set style increment user 13 | 14 | set size 0.75,0.75 15 | #set size ratio 0.66 16 | set border lw 2 17 | 18 | set key top right Right 19 | set grid 20 | set logscale y 21 | set xrange [1:16] 22 | 23 | ####### 24 | 25 | set output "strided-access.eps" 26 | set title "Memory Bandwidth for Strided Array Access\n{/*0.7 x[i*stride] = y[i*stride] + z[i*stride]}" 27 | set ylabel "Memory Bandwidth (GB/sec)" 28 | set xlabel "Stride (4 Bytes per Element)" 29 | plot 'w9100.txt' using 1:3 with linesp ls 1 pt 5 ps 1.5 title "AMD FirePro W9100", \ 30 | 'xeon-e5-2670v3.txt' using 1:3 with linesp ls 2 pt 7 ps 1.5 title "1x INTEL Xeon E5-2670v3", \ 31 | 'xeon-phi-7120.txt' using 1:3 with linesp ls 3 pt 9 ps 2 title "INTEL Xeon Phi 7120", \ 32 | 'k20m.txt' using 1:3 with linesp ls 4 pt 11 ps 2 title "NVIDIA Tesla K20m" 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /strided-access/results/strided-access.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/strided-access/results/strided-access.pdf -------------------------------------------------------------------------------- /strided-access/results/strided-access.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/strided-access/results/strided-access.png -------------------------------------------------------------------------------- /strided-access/results/w9100.txt: -------------------------------------------------------------------------------- 1 | # Platforms found: 1 2 | # (0) Advanced Micro Devices, Inc.: OpenCL 1.2 AMD-APP (1573.4) 3 | # Devices found: 2 4 | # (0) Hawaii 5 | # (1) AMD Phenom(tm) II X4 955 Processor 6 | # Enter index of device to use: 0 7 | # 8 | # Using device: Hawaii 9 | # stride time GB/sec 10 | 1 0.00109 220.183 11 | 2 0.002025 118.519 12 | 3 0.003026 79.3126 13 | 4 0.003911 61.3654 14 | 5 0.00515 46.6019 15 | 6 0.006224 38.5604 16 | 7 0.007256 33.0761 17 | 8 0.007685 31.2297 18 | 9 0.009201 26.0841 19 | 10 0.010038 23.9091 20 | 11 0.010782 22.2593 21 | 12 0.011546 20.7864 22 | 13 0.012666 18.9484 23 | 14 0.013456 17.8359 24 | 15 0.014238 16.8563 25 | 16 0.014342 16.7341 26 | 17 0.015833 15.1582 27 | 18 0.016072 14.9328 28 | 19 0.016318 14.7077 29 | 20 0.016113 14.8948 30 | 21 0.016611 14.4483 31 | 22 0.016743 14.3343 32 | 23 0.016828 14.2619 33 | 24 0.016698 14.373 34 | 25 0.01691 14.1928 35 | 26 0.017249 13.9139 36 | 27 0.017067 14.0622 37 | 28 0.016928 14.1777 38 | 29 0.017321 13.856 39 | 30 0.016969 14.1434 40 | 31 0.016771 14.3104 41 | 32 0.015675 15.311 42 | 43 | -------------------------------------------------------------------------------- /strided-access/results/xeon-e5-2670v3.txt: -------------------------------------------------------------------------------- 1 | # stride time GB/sec 2 | 1 0.042428 28.2832 3 | 2 0.120377 9.96868 4 | 3 0.168107 7.13831 5 | 4 0.235153 5.10306 6 | 5 0.311109 3.85717 7 | 6 0.355833 3.37237 8 | 7 0.431296 2.78231 9 | 8 0.502569 2.38773 10 | 9 0.549516 2.18374 11 | 10 0.624792 1.92064 12 | 11 0.673498 1.78174 13 | 12 0.746097 1.60837 14 
| 13 0.820617 1.46231 15 | 14 0.861678 1.39263 16 | 15 0.936427 1.28147 17 | 16 1.00677 1.19193 18 | 17 1.03629 1.15798 19 | 18 1.09355 1.09734 20 | 19 1.12504 1.06663 21 | 20 1.17261 1.02336 22 | 21 1.22663 0.978291 23 | 22 1.249 0.960766 24 | 23 1.3017 0.921874 25 | 24 1.35292 0.88697 26 | 25 1.37572 0.872272 27 | 26 1.41969 0.845256 28 | 27 1.45273 0.826029 29 | 28 1.47944 0.811119 30 | 29 1.52076 0.789082 31 | 30 1.50888 0.795293 32 | 31 1.53889 0.779783 33 | 32 1.67142 0.71795 34 | 35 | -------------------------------------------------------------------------------- /strided-access/results/xeon-phi-7120.txt: -------------------------------------------------------------------------------- 1 | # stride time GB/sec 2 | 1 0.013312 90.1442 3 | 2 0.033645 35.6665 4 | 3 0.049236 24.3724 5 | 4 0.055954 21.4462 6 | 5 0.063018 19.0422 7 | 6 0.073359 16.3579 8 | 7 0.083043 14.4503 9 | 8 0.092531 12.9686 10 | 9 0.103568 11.5866 11 | 10 0.116298 10.3183 12 | 11 0.128292 9.35366 13 | 12 0.139058 8.62949 14 | 13 0.150226 7.98796 15 | 14 0.161199 7.44421 16 | 15 0.170664 7.03136 17 | 16 0.180439 6.65045 18 | 17 0.18679 6.42433 19 | 18 0.193852 6.19029 20 | 19 0.200235 5.99296 21 | 20 0.207103 5.79422 22 | 21 0.211277 5.67975 23 | 22 0.217728 5.51146 24 | 23 0.223153 5.37748 25 | 24 0.231584 5.18171 26 | 25 0.23512 5.10378 27 | 26 0.241932 4.96007 28 | 27 0.24567 4.8846 29 | 28 0.251481 4.77173 30 | 29 0.256766 4.67352 31 | 30 0.261103 4.59589 32 | 31 0.268795 4.46437 33 | 32 0.270861 4.43032 34 | 35 | -------------------------------------------------------------------------------- /test_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | : ${HIP_PLATFORM:="hcc"} 4 | 5 | # vector_add 6 | echo 7 | echo "==== vectorAdd ====" 8 | cd vectorAdd 9 | make clean 10 | make 11 | cd .. 
12 | 13 | # gpu-burn 14 | echo 15 | echo "==== gpu-burn ====" 16 | cd gpu-burn 17 | make clean 18 | make 19 | ./build/gpuburn-hip -t 5 20 | cd .. 21 | 22 | # strided-access 23 | echo 24 | echo "==== strided-access ====" 25 | cd strided-access 26 | make clean 27 | make 28 | ./strided-access 29 | cd .. 30 | 31 | 32 | # rtm8 33 | echo 34 | echo "==== rtm8 ====" 35 | cd rtm8 36 | ./build_hip.sh 37 | ./rtm8_hip 38 | cd .. 39 | 40 | # reduction 41 | echo 42 | echo "==== reduction ====" 43 | cd reduction 44 | make clean 45 | make 46 | ./run.sh 47 | cd .. 48 | 49 | # mini-nbody 50 | echo 51 | echo "==== mini-nbody ====" 52 | cd mini-nbody/hip 53 | ./HIP-nbody-orig.sh 54 | ./HIP-nbody-soa.sh 55 | ./HIP-nbody-block.sh 56 | cd ../.. 57 | 58 | # add4 59 | echo 60 | echo "==== add4 ====" 61 | cd add4 62 | ./buildit.sh 63 | ./runhip.sh 64 | cd .. 65 | 66 | # cuda-stream 67 | echo 68 | echo "==== cuda-stream ====" 69 | cd cuda-stream 70 | make clean 71 | make 72 | ./stream 73 | cd .. 74 | 75 | # openmp-helloworld 76 | echo 77 | echo "==== OpenMP Hello World ====" 78 | cd openmp-helloworld 79 | mkdir -p build 80 | cd build 81 | cmake .. 82 | make 83 | ./test_openmp_helloworld 84 | cd ../.. 
85 | 86 | -------------------------------------------------------------------------------- /vectorAdd/Makefile: -------------------------------------------------------------------------------- 1 | HIP_PATH?= $(wildcard /opt/rocm) 2 | HIPCC=$(HIP_PATH)/bin/hipcc 3 | 4 | SOURCES = vectoradd_hip.cpp 5 | OBJECTS = $(SOURCES:.cpp=.o) 6 | 7 | EXECUTABLE=./vectoradd_hip.exe 8 | 9 | .PHONY: test 10 | 11 | 12 | all: $(EXECUTABLE) test 13 | 14 | CXXFLAGS =-g 15 | 16 | CXX=$(HIPCC) 17 | 18 | 19 | $(EXECUTABLE): $(OBJECTS) 20 | $(HIPCC) $(OBJECTS) -o $@ 21 | 22 | 23 | test: $(EXECUTABLE) 24 | $(EXECUTABLE) 25 | 26 | 27 | clean: 28 | rm -f $(EXECUTABLE) 29 | rm -f $(OBJECTS) 30 | rm -f $(HIP_PATH)/src/*.o 31 | -------------------------------------------------------------------------------- /vectorAdd/README: -------------------------------------------------------------------------------- 1 | Simple vectorAdd example written directly to the HIP interface. 2 | -------------------------------------------------------------------------------- /vectorAdd/vectoradd_hip.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "hip/hip_runtime.h" 28 | 29 | 30 | #ifdef NDEBUG 31 | #define HIP_ASSERT(x) x 32 | #else 33 | #define HIP_ASSERT(x) (assert((x)==hipSuccess)) 34 | #endif 35 | 36 | 37 | #define WIDTH 1024 38 | #define HEIGHT 1024 39 | 40 | #define NUM (WIDTH*HEIGHT) 41 | 42 | #define THREADS_PER_BLOCK_X 16 43 | #define THREADS_PER_BLOCK_Y 16 44 | #define THREADS_PER_BLOCK_Z 1 45 | 46 | __global__ void 47 | vectoradd_float(float* __restrict__ a, const float* __restrict__ b, const float* __restrict__ c, int width, int height) 48 | 49 | { 50 | 51 | int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; 52 | int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; 53 | 54 | int i = y * width + x; 55 | if ( i < (width * height)) { 56 | a[i] = b[i] + c[i]; 57 | } 58 | 59 | 60 | 61 | } 62 | 63 | #if 0 64 | __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) { 65 | 66 | 67 | int x = blockDimX * blockIdx.x + threadIdx.x; 68 | int y = blockDimY * blockIdy.y + threadIdx.y; 69 | 70 | int i = y * width + x; 71 | if ( i < (width * height)) { 72 | a[i] = b[i] + c[i]; 73 | } 74 | } 75 | #endif 76 | 77 | using namespace std; 78 | 79 | int main() { 80 | 81 | float* hostA; 82 | float* hostB; 83 | float* hostC; 84 | 85 | float* deviceA; 86 | float* deviceB; 87 | float* deviceC; 88 | 89 | hipDeviceProp_t devProp; 90 | hipGetDeviceProperties(&devProp, 0); 91 | cout << " System 
minor " << devProp.minor << endl; 92 | cout << " System major " << devProp.major << endl; 93 | cout << " agent prop name " << devProp.name << endl; 94 | 95 | 96 | 97 | cout << "hip Device prop succeeded " << endl ; 98 | 99 | 100 | int i; 101 | int errors; 102 | 103 | hostA = (float*)malloc(NUM * sizeof(float)); 104 | hostB = (float*)malloc(NUM * sizeof(float)); 105 | hostC = (float*)malloc(NUM * sizeof(float)); 106 | 107 | // initialize the input data 108 | for (i = 0; i < NUM; i++) { 109 | hostB[i] = (float)i; 110 | hostC[i] = (float)i*100.0f; 111 | } 112 | 113 | HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(float))); 114 | HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(float))); 115 | HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(float))); 116 | 117 | HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(float), hipMemcpyHostToDevice)); 118 | HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(float), hipMemcpyHostToDevice)); 119 | 120 | 121 | hipLaunchKernelGGL(vectoradd_float, 122 | dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), 123 | dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 124 | 0, 0, 125 | deviceA ,deviceB ,deviceC ,WIDTH ,HEIGHT); 126 | 127 | 128 | HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(float), hipMemcpyDeviceToHost)); 129 | 130 | // verify the results 131 | errors = 0; 132 | for (i = 0; i < NUM; i++) { 133 | if (hostA[i] != (hostB[i] + hostC[i])) { 134 | errors++; 135 | } 136 | } 137 | if (errors!=0) { 138 | printf("FAILED: %d errors\n",errors); 139 | } else { 140 | printf ("PASSED!\n"); 141 | } 142 | 143 | HIP_ASSERT(hipFree(deviceA)); 144 | HIP_ASSERT(hipFree(deviceB)); 145 | HIP_ASSERT(hipFree(deviceC)); 146 | 147 | free(hostA); 148 | free(hostB); 149 | free(hostC); 150 | 151 | //hipResetDefaultAccelerator(); 152 | 153 | return errors; 154 | } 155 | --------------------------------------------------------------------------------