├── .gitignore
├── .gitmodules
├── HIP-Examples-Applications
    ├── BinomialOption
    │   ├── BinomialOption.cpp
    │   └── Makefile
    ├── BitonicSort
    │   ├── BitonicSort.cpp
    │   └── Makefile
    ├── FastWalshTransform
    │   ├── FastWalshTransform.cpp
    │   └── Makefile
    ├── FloydWarshall
    │   ├── FloydWarshall.cpp
    │   └── Makefile
    ├── HelloWorld
    │   ├── HelloWorld.cpp
    │   └── Makefile
    ├── Histogram
    │   ├── Histogram.cpp
    │   ├── Histogram.hpp
    │   └── Makefile
    ├── MatrixMultiplication
    │   ├── Makefile
    │   └── MatrixMultiplication.cpp
    ├── PrefixSum
    │   ├── Makefile
    │   └── PrefixSum.cpp
    ├── RecursiveGaussian
    │   ├── Makefile
    │   ├── RecursiveGaussian.cpp
    │   ├── RecursiveGaussian.hpp
    │   ├── RecursiveGaussian_Input.bmp
    │   └── RecursiveGaussian_Output.bmp
    ├── SimpleConvolution
    │   ├── FilterCoeff.h
    │   ├── Makefile
    │   ├── SimpleConvolution.cpp
    │   └── SimpleConvolution.hpp
    ├── dct
    │   ├── Makefile
    │   └── dct.cpp
    ├── dwtHaar1D
    │   ├── Makefile
    │   └── dwtHaar1D.cpp
    └── include
    │   ├── HIPUtil.hpp
    │   ├── SDKBitMap.hpp
    │   ├── SDKFile.hpp
    │   ├── SDKThread.hpp
    │   └── SDKUtil.hpp
├── README.md
├── add4
    ├── LICENSE
    ├── Makefile
    ├── README.md
    ├── buildit.sh
    ├── common.cpp
    ├── common.h
    ├── hip-stream.cpp
    ├── run_sweep.pl
    └── runhip.sh
├── common
    ├── hip.all.make
    └── hip.prologue.make
├── cuda-stream
    ├── Makefile
    ├── Makefile.titan
    ├── README.md
    └── stream.cpp
├── gpu-burn
    ├── AmdGpuMonitor.cpp
    ├── AmdGpuMonitor.h
    ├── BurnKernel.cpp
    ├── BurnKernel.h
    ├── GpuMonitor.h
    ├── Makefile
    ├── common.cpp
    ├── common.h
    └── gpuburn.cpp
├── mini-nbody
    ├── LICENSE
    ├── README.md
    ├── cuda
    │   ├── nbody-block.cu
    │   ├── nbody-orig.cu
    │   ├── nbody-soa.cu
    │   ├── nbody-unroll.cu
    │   ├── shmoo-cuda-nbody-block.sh
    │   ├── shmoo-cuda-nbody-ftz.sh
    │   ├── shmoo-cuda-nbody-orig.sh
    │   ├── shmoo-cuda-nbody-soa.sh
    │   └── shmoo-cuda-nbody-unroll.sh
    ├── hip
    │   ├── HIP-nbody-block.sh
    │   ├── HIP-nbody-orig.sh
    │   ├── HIP-nbody-soa.sh
    │   ├── nbody-block.cpp
    │   ├── nbody-orig.cpp
    │   └── nbody-soa.cpp
    ├── mic
    │   ├── nbody-align.c
    │   ├── nbody-block.c
    │   ├── nbody-soa.c
    │   ├── shmoo-mic-nbody-align.sh
    │   ├── shmoo-mic-nbody-block.sh
    │   ├── shmoo-mic-nbody-ftz.sh
    │   ├── shmoo-mic-nbody-orig.sh
    │   └── shmoo-mic-nbody-soa.sh
    ├── nbody.c
    ├── shmoo-cpu-nbody.sh
    └── timer.h
├── openmp-helloworld
    ├── CMakeLists.txt
    ├── Makefile
    ├── README.md
    └── openmp_helloworld.cpp
├── reduction
    ├── Makefile
    ├── README.md
    ├── reduction.cpp
    └── run.sh
├── rtm8
    ├── Makefile
    ├── README.md
    ├── build_cuda.sh
    ├── build_fortran.sh
    ├── build_hip.sh
    ├── mysecond.c
    ├── rtm8.cpp
    ├── rtm8.cu
    └── rtm8.f
├── strided-access
    ├── CL
    │   ├── cl.h
    │   ├── cl.hpp
    │   ├── cl_d3d10.h
    │   ├── cl_ext.h
    │   ├── cl_gl.h
    │   ├── cl_gl_ext.h
    │   ├── cl_platform.h
    │   └── opencl.h
    ├── LICENSE.txt
    ├── Makefile
    ├── README.txt
    ├── benchmark-cuda.cu
    ├── benchmark-hip.cpp
    ├── benchmark-hip.cu
    ├── benchmark-opencl.cpp
    ├── benchmark-openmp.cpp
    ├── benchmark-openmp2.cpp
    ├── benchmark-utils.hpp
    └── results
    │   ├── LICENSE.txt
    │   ├── k20m.txt
    │   ├── plot.gnuplot
    │   ├── strided-access.eps
    │   ├── strided-access.pdf
    │   ├── strided-access.png
    │   ├── w9100.txt
    │   ├── xeon-e5-2670v3.txt
    │   └── xeon-phi-7120.txt
├── test_all.sh
└── vectorAdd
    ├── Makefile
    ├── README
    └── vectoradd_hip.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | 
3 | vectorAdd/vectoradd_hip.exe
4 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "mixbench"]
2 | 	path = mixbench
3 | 	url = https://github.com/ekondis/mixbench.git
4 | [submodule "GPU-STREAM"]
5 | 	path = GPU-STREAM
6 | 	url = https://github.com/UoB-HPC/GPU-STREAM.git
7 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/BinomialOption/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = BinomialOption.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./BinomialOption
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/BitonicSort/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = BitonicSort.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./BitonicSort
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/FastWalshTransform/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = FastWalshTransform.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./FastWalshTransform
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/FloydWarshall/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = FloydWarshall.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./FloydWarshall
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/HelloWorld/HelloWorld.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | 
23 | #include <hip/hip_runtime.h>
24 | #include <string.h>
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <iostream>
28 | #include <string>
29 | #include <fstream>
30 | 
31 | #define SAMPLE_VERSION "HIP-Examples-Application-v1.0"
32 | #define SUCCESS 0
33 | #define FAILURE 1
34 | 
35 | using namespace std;
36 | 
37 | __global__ void helloworld(char* in, char* out)
38 | {
39 | 	const int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;  // global thread index
40 | 	out[idx] = static_cast<char>(in[idx] + 1);  // each thread bumps one character by one
41 | }
42 | 
43 | int main(int argc, char* argv[])
44 | {
45 |     // Log a failed HIP call; evaluates to true when the call succeeded.
46 |     auto ok = [](hipError_t status, const char* what) {
47 |         if (status != hipSuccess)
48 |             cerr << what << " failed: " << hipGetErrorString(status) << endl;
49 |         return status == hipSuccess;
50 |     };
51 |     hipDeviceProp_t devProp;
52 |     if (!ok(hipGetDeviceProperties(&devProp, 0), "hipGetDeviceProperties"))
53 |         return FAILURE;
54 |     cout << " System minor " << devProp.minor << endl;
55 |     cout << " System major " << devProp.major << endl;
56 |     cout << " agent prop name " << devProp.name << endl;
57 | 
58 |     /* Host input/output; the kernel adds one to every input character. */
59 |     const char* input = "GdkknVnqkc";
60 |     const size_t strlength = strlen(input);
61 |     const size_t bytes = (strlength + 1) * sizeof(char);
62 |     cout << "input string:" << endl << input << endl;
63 |     char* output = (char*) calloc(strlength + 1, sizeof(char));
64 |     if (output == NULL) { cerr << "host allocation failed" << endl; return FAILURE; }
65 | 
66 |     char* inputBuffer = NULL;
67 |     char* outputBuffer = NULL;
68 |     bool passed = ok(hipMalloc((void**)&inputBuffer, bytes), "hipMalloc(input)")
69 |                && ok(hipMalloc((void**)&outputBuffer, bytes), "hipMalloc(output)")
70 |                && ok(hipMemcpy(inputBuffer, input, bytes, hipMemcpyHostToDevice), "hipMemcpy(H2D)");
71 |     if (passed) {
72 |         hipLaunchKernelGGL(helloworld, dim3(1), dim3(strlength), 0, 0, inputBuffer, outputBuffer);
73 |         passed = ok(hipGetLastError(), "helloworld launch")
74 |               && ok(hipMemcpy(output, outputBuffer, bytes, hipMemcpyDeviceToHost), "hipMemcpy(D2H)");
75 |     }
76 |     (void) hipFree(inputBuffer);   // still NULL if its hipMalloc failed; hipFree accepts that
77 |     (void) hipFree(outputBuffer);
78 |     output[strlength] = '\0';      // kernel writes only strlength chars; terminate on the host
79 |     for (size_t i = 0; passed && i < strlength; ++i)
80 |         passed = (output[i] == input[i] + 1);   // check against the kernel's +1 transform
81 |     cout << "\noutput string:" << endl << output << endl;
82 |     free(output);
83 |     std::cout << (passed ? "Passed!\n" : "Failed!\n");
84 |     return passed ? SUCCESS : FAILURE;
85 | }
86 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/HelloWorld/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = HelloWorld.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./HelloWorld
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/Histogram/Histogram.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/Histogram/Histogram.cpp


--------------------------------------------------------------------------------
/HIP-Examples-Applications/Histogram/Histogram.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/Histogram/Histogram.hpp


--------------------------------------------------------------------------------
/HIP-Examples-Applications/Histogram/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = Histogram.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./Histogram
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:  # delete build outputs
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/MatrixMultiplication/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = MatrixMultiplication.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./MatrixMultiplication
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/PrefixSum/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = PrefixSum.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./PrefixSum
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/RecursiveGaussian/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = RecursiveGaussian.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./RecursiveGaussian
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
  3 | 
  4 | Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | of this software and associated documentation files (the "Software"), to deal
  6 | in the Software without restriction, including without limitation the rights
  7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | copies of the Software, and to permit persons to whom the Software is
  9 | furnished to do so, subject to the following conditions:
 10 | 
 11 | The above copyright notice and this permission notice shall be included in
 12 | all copies or substantial portions of the Software.
 13 | 
 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | THE SOFTWARE.
 21 | */
 22 | 
 23 | #ifndef RECURSIVE_GAUSSIAN_H_
 24 | #define RECURSIVE_GAUSSIAN_H_
 25 | 
 26 | #include <hip/hip_runtime.h>
 27 | #include <stdio.h>
 28 | #include <stdlib.h>
 29 | #include <assert.h>
 30 | #include <string.h>
 31 | #include "../include/HIPUtil.hpp"
 32 | #include "../include/SDKBitMap.hpp"
 33 | 
 34 | using namespace appsdk;
 35 | using namespace std;
 36 | 
 37 | #define INPUT_IMAGE "RecursiveGaussian_Input.bmp"
 38 | #define OUTPUT_IMAGE "RecursiveGaussian_Output.bmp"
 39 | 
 40 | #define SAMPLE_VERSION "HIP-Examples-Applications-v1.0"
 41 | 
 42 | #define GROUP_SIZE 256
 43 | 
 44 | /**
 45 | * Custom type holding the precomputed gaussian parameters.
 46 | * NOTE(review): presumably filled in by computeGaussParms() — confirm in the .cpp
 47 | */
 48 | typedef struct _GaussParms
 49 | {
 50 |     float nsigma;
 51 |     float alpha;
 52 |     float ema;
 53 |     float ema2;
 54 |     float b1;
 55 |     float b2;
 56 |     float a0;
 57 |     float a1;
 58 |     float a2;
 59 |     float a3;
 60 |     float coefp;
 61 |     float coefn;
 62 | } GaussParms, *pGaussParms;
 63 | 
 64 | 
 65 | 
 66 | /**
 67 | * Recursive Gaussian
 68 | * Class implements the HIP Recursive Gaussian sample
 69 | */
 70 | 
 71 | class RecursiveGaussian
 72 | {
 73 |         double setupTime;                /**< time taken to set up HIP resources and build the kernel */
 74 |         double kernelTime;               /**< time taken to run kernel and read result back */
 75 | 
 76 |         uchar4* inputImageData;          /**< Input bitmap data to device */
 77 |         uchar4* outputImageData;         /**< Output from device */
 78 | 
 79 |         uchar4* inputImageBuffer;        /**< memory buffer for input Image*/
 80 |         uchar4* tempImageBuffer;         /**< memory buffer for storing the transpose of the image*/
 81 |         uchar4* outputImageBuffer;       /**< memory buffer for Output Image*/
 82 |         uchar4* verificationInput;       /**< Input array for reference implementation */
 83 |         uchar4* verificationOutput;      /**< Output array for reference implementation */
 84 | 
 85 |         SDKBitMap inputBitmap;           /**< Bitmap class object */
 86 |         uchar4* pixelData;               /**< Pointer to image data */
 87 |         unsigned int pixelSize;          /**< Size of a pixel in BMP format */
 88 |         GaussParms oclGP;                /**< instance of struct to hold gaussian parameters */
 89 |         unsigned int width;              /**< Width of image */
 90 |         unsigned int height;             /**< Height of image */
 91 |         size_t blockSizeX;               /**< Work-group size in x-direction */
 92 |         size_t blockSizeY;               /**< Work-group size in y-direction */
 93 |         size_t blockSize;                /**< block size for transpose kernel */
 94 |         int iterations;                  /**< Number of iterations for kernel execution */
 95 | 
 96 |         SDKTimer *sampleTimer;           /**< SDKTimer object */
 97 | 
 98 |     public:
 99 | 
100 |         HIPCommandArgs   *sampleArgs;   /**< HIPCommand argument class */
101 | 
102 |         /**
103 |         * Read bitmap image and allocate host memory
104 |         * @param inputImageName name of the input file
105 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
106 |         */
107 |         int readInputImage(std::string inputImageName);
108 | 
109 |         /**
110 |         * Write output to an image file
111 |         * @param outputImageName name of the output file
112 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
113 |         */
114 |         int writeOutputImage(std::string outputImageName);
115 | 
116 |         /**
117 |         * Preprocess gaussian parameters
118 |         * @param fSigma sigma value
119 |         * @param iOrder order
120 |         * @param pGP pointer to gaussian parameter object
121 |         */
122 |         void computeGaussParms(float fSigma, int iOrder, GaussParms* pGP);
123 | 
124 |         /**
125 |         * RecursiveGaussian on CPU (for verification)
126 |         * @param input input image
127 |         * @param output output image
128 |         * @param width width of image
129 |         * @param height height of image
130 |         * @param a0..a3, b1, b2, coefp, coefn gaussian parameters
131 |         */
132 |         void recursiveGaussianCPU(uchar4* input, uchar4* output,
133 |                                   const int width, const int height,
134 |                                   const float a0, const float a1,
135 |                                   const float a2, const float a3,
136 |                                   const float b1, const float b2,
137 |                                   const float coefp, const float coefn);
138 | 
139 |         /**
140 |         * Transpose on CPU (for verification)
141 |         * @param input input image
142 |         * @param output output image
143 |         * @param width width of input image
144 |         * @param height height of input image
145 |         */
146 |         void transposeCPU(uchar4* input, uchar4* output,
147 |                           const int width, const int height);
148 | 
149 |         /**
150 |         * Constructor
151 |         * Initialize every member so no pointer, size or timer starts indeterminate
152 |         */
153 |         RecursiveGaussian()
154 |             : setupTime(0.0), kernelTime(0.0),
155 |               inputImageData(NULL), outputImageData(NULL),
156 |               inputImageBuffer(NULL), tempImageBuffer(NULL), outputImageBuffer(NULL),
157 |               verificationInput(NULL), verificationOutput(NULL),
158 |               pixelData(NULL), pixelSize(sizeof(uchar4)), oclGP(),
159 |               width(0), height(0),
160 |               blockSizeX(GROUP_SIZE), blockSizeY(1), blockSize(1), iterations(1),
161 |               sampleTimer(NULL), sampleArgs(NULL)
162 |         {
163 |             // Allocation order is unchanged: command-line arguments first, then the timer.
164 |             sampleArgs = new HIPCommandArgs();
165 |             sampleTimer = new SDKTimer();
166 |             sampleArgs->sampleVerStr = SAMPLE_VERSION;
167 |         }
168 | 
169 |         // NOTE(review): sampleArgs and sampleTimer are not released here;
170 |         // confirm cleanup() (defined outside this header) owns their lifetime.
171 |         ~RecursiveGaussian() {}
172 | 
173 |         /**
174 |         * @return gettimeofday() wall-clock time expressed in microseconds
175 |         */
176 |         inline long long get_time()
177 |         {
178 |           struct timeval tv;
179 |           gettimeofday(&tv, 0);
180 |           // Widen before scaling: tv_sec * 1000000 overflows where time_t/long is 32-bit
181 |           return (static_cast<long long>(tv.tv_sec) * 1000000LL) + tv.tv_usec;
182 |         }
183 | 
184 |         /**
185 |         * Allocate image memory and Load bitmap file
186 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
187 |         */
188 |         int setupRecursiveGaussian();
189 | 
190 |         /**
191 |         * HIP related initialisations.
192 |         * Set up Context, Device list, Command Queue, Memory buffers
193 |         * Build kernel program executable
194 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
195 |         */
196 |         int setupHIP();
197 | 
198 |         /**
199 |         * Set values for kernels' arguments, enqueue calls to the kernels
200 |         * on to the command queue, wait till end of kernel execution.
201 |         * Get kernel start and end time if timing is enabled
202 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
203 |         */
204 |         int runKernels();
205 | 
206 |         /**
207 |         * Reference CPU implementation of Recursive Gaussian
208 |         * for performance comparison
209 |         * (returns nothing)
210 |         */
211 |         void recursiveGaussianCPUReference();
212 | 
213 |         /**
214 |         * Override from SDKSample. Print sample stats.
215 |         */
216 |         void printStats();
217 | 
218 |         /**
219 |         * Override from SDKSample. Initialize
220 |         * command line parser, add custom options
221 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
222 |         */
223 |         int initialize();
224 | 
225 |         /**
226 |         * Override from SDKSample, adjust width and height
227 |         * of execution domain, perform all sample setup
228 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
229 |         */
230 |         int setup();
231 | 
232 |         /**
233 |         * Override from SDKSample
234 |         * Run the Recursive Gaussian filter
235 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
236 |         */
237 |         int run();
238 | 
239 |         /**
240 |         * Override from SDKSample
241 |         * Cleanup memory allocations
242 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
243 |         */
244 |         int cleanup();
245 | 
246 |         /**
247 |         * Override from SDKSample
248 |         * Verify against reference implementation
249 |         * @return SDK_SUCCESS on success and SDK_FAILURE on failure
250 |         */
251 |         int verifyResults();
252 | };
253 | 
254 | #endif // RECURSIVE_GAUSSIAN_H_
255 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian_Input.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian_Input.bmp


--------------------------------------------------------------------------------
/HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian_Output.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/RecursiveGaussian/RecursiveGaussian_Output.bmp


--------------------------------------------------------------------------------
/HIP-Examples-Applications/SimpleConvolution/FilterCoeff.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/SimpleConvolution/FilterCoeff.h


--------------------------------------------------------------------------------
/HIP-Examples-Applications/SimpleConvolution/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | SOURCES = SimpleConvolution.cpp
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./SimpleConvolution
 8 | 
 9 | .PHONY: all test clean  # none of these targets names a real file
10 | 
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | CXX=$(HIPCC)
16 | 
17 | 
18 | $(EXECUTABLE): $(OBJECTS)
19 | 	$(HIPCC) $(OBJECTS) -o $@
20 | 
21 | 
22 | test: $(EXECUTABLE)
23 | 	$(EXECUTABLE)
24 | 
25 | 
26 | clean:
27 | 	rm -f $(EXECUTABLE)
28 | 	rm -f $(OBJECTS)
29 | 	rm -f $(HIP_PATH)/src/*.o
30 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/SimpleConvolution/SimpleConvolution.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/SimpleConvolution/SimpleConvolution.cpp


--------------------------------------------------------------------------------
/HIP-Examples-Applications/SimpleConvolution/SimpleConvolution.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/SimpleConvolution/SimpleConvolution.hpp


--------------------------------------------------------------------------------
/HIP-Examples-Applications/dct/Makefile:
--------------------------------------------------------------------------------
 1 | HIPCC=/opt/rocm/bin/hipcc
 2 | 
 3 | SOURCES = dct.cpp
 4 | OBJECTS = $(SOURCES:.cpp=.o)
 5 | 
 6 | EXECUTABLE=./dct
 7 | 
 8 | .PHONY: all test clean  # none of these targets names a real file
 9 | 
10 | 
11 | all: $(EXECUTABLE) test
12 | 
13 | CXXFLAGS =-g
14 | CXX=$(HIPCC)
15 | 
16 | 
17 | $(EXECUTABLE): $(OBJECTS)
18 | 	$(HIPCC) $(OBJECTS) -o $@
19 | 
20 | 
21 | test: $(EXECUTABLE)
22 | 	$(EXECUTABLE)
23 | 
24 | 
25 | clean:
26 | 	rm -f $(EXECUTABLE)
27 | 	rm -f $(OBJECTS)
28 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/dwtHaar1D/Makefile:
--------------------------------------------------------------------------------
 1 | HIPCC=/opt/rocm/bin/hipcc
 2 | 
 3 | SOURCES = dwtHaar1D.cpp
 4 | OBJECTS = $(SOURCES:.cpp=.o)
 5 | 
 6 | EXECUTABLE=./dwtHaar1D
 7 | 
 8 | .PHONY: all test clean  # none of these targets names a real file
 9 | 
10 | 
11 | all: $(EXECUTABLE) test
12 | 
13 | CXXFLAGS =-g
14 | CXX=$(HIPCC)
15 | 
16 | 
17 | $(EXECUTABLE): $(OBJECTS)
18 | 	$(HIPCC) $(OBJECTS) -o $@
19 | 
20 | 
21 | test: $(EXECUTABLE)
22 | 	$(EXECUTABLE)
23 | 
24 | 
25 | clean:
26 | 	rm -f $(EXECUTABLE)
27 | 	rm -f $(OBJECTS)
28 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/include/HIPUtil.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
  3 | 
  4 | Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | of this software and associated documentation files (the "Software"), to deal
  6 | in the Software without restriction, including without limitation the rights
  7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | copies of the Software, and to permit persons to whom the Software is
  9 | furnished to do so, subject to the following conditions:
 10 | 
 11 | The above copyright notice and this permission notice shall be included in
 12 | all copies or substantial portions of the Software.
 13 | 
 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | THE SOFTWARE.
 21 | */
 22 | 
 23 | #ifndef HIPSAMPLE_H_
 24 | #define HIPSAMPLE_H_
 25 | 
 26 | /******************************************************************************
 27 | * Included header files                                                       *
 28 | ******************************************************************************/
 29 | 
 30 | #include "SDKUtil.hpp"
 31 | 
 32 | namespace appsdk
 33 | {
 34 | 
 35 | class HIPCommandArgs: public SDKCmdArgsParser
 36 | {
 37 |     public:
 38 |         unsigned int deviceId;       /**< Cmd Line Option- if deviceId */
 39 |         bool enableDeviceId;         /**< Cmd Line Option- if enableDeviceId */
 40 | 
 41 |         /**
 42 |         ***********************************************************************
 43 |         * @fn initialize
 44 |         * @brief Initialize the resources used by tests
 45 |         * @return 0 on success Positive if expected and Non-zero on failure
 46 |         **********************************************************************/
 47 |         int initialize()
 48 |         {
 49 |             int defaultOptions = 5;
 50 |             Option *optionList = new Option[defaultOptions];
 51 |             CHECK_ALLOCATION(optionList, "Error. Failed to allocate memory (optionList)\n");
 52 |             optionList[0]._sVersion = "q";
 53 |             optionList[0]._lVersion = "quiet";
 54 |             optionList[0]._description = "Quiet mode. Suppress all text output.";
 55 |             optionList[0]._type = CA_NO_ARGUMENT;
 56 |             optionList[0]._value = &quiet;
 57 |             optionList[1]._sVersion = "e";
 58 |             optionList[1]._lVersion = "verify";
 59 |             optionList[1]._description = "Verify results against reference implementation.";
 60 |             optionList[1]._type = CA_NO_ARGUMENT;
 61 |             optionList[1]._value = &verify;
 62 |             optionList[2]._sVersion = "t";
 63 |             optionList[2]._lVersion = "timing";
 64 |             optionList[2]._description = "Print timing.";
 65 |             optionList[2]._type = CA_NO_ARGUMENT;
 66 |             optionList[2]._value = &timing;
 67 |             optionList[3]._sVersion = "v";
 68 |             optionList[3]._lVersion = "version";
 69 |             optionList[3]._description = "AMD APP SDK version string.";
 70 |             optionList[3]._type = CA_NO_ARGUMENT;
 71 |             optionList[3]._value = &version;
 72 |             optionList[4]._sVersion = "d";
 73 |             optionList[4]._lVersion = "deviceId";
 74 |             optionList[4]._description =
 75 |                 "Select deviceId to be used[0 to N-1 where N is number devices available].";
 76 |             optionList[4]._type = CA_ARG_INT;
 77 |             optionList[4]._value = &deviceId;
 78 |             _numArgs = defaultOptions;
 79 |             _options = optionList;
 80 |             return SDK_SUCCESS;
 81 |         }
 82 | 
 83 |         /**
 84 |         ***********************************************************************
 85 |         * @brief Destroy the resources used by tests
 86 |         **********************************************************************/
 87 |         virtual ~HIPCommandArgs()
 88 |         {
 89 |         }
 90 | 
 91 |         /**
 92 |         ***********************************************************************
 93 |         * @brief Constructor, initialize the resources used by tests
 94 |         * @param sampleName Name of the Sample
 95 |         **********************************************************************/
 96 |         HIPCommandArgs()
 97 |         {
 98 |             deviceId = 0;
 99 |             enableDeviceId = false;
100 |         }
101 | 
102 |         /**
103 |         ***********************************************************************
104 |         * @brief parseCommandLine parses the command line options given by user
105 |         * @param argc Number of elements in cmd line input
106 |         * @param argv array of char* storing the CmdLine Options
107 |         * @return 0 on success Positive if expected and Non-zero on failure
108 |         **********************************************************************/
109 |         int parseCommandLine(int argc, char **argv)
110 |         {
111 |             if(!parse(argv,argc))
112 |             {
113 |                 usage();
114 |                 if(isArgSet("h",true) == true)
115 |                 {
116 |                     exit(SDK_SUCCESS);
117 |                 }
118 |                 return SDK_FAILURE;
119 |             }
120 |             if(isArgSet("h",true) == true)
121 |             {
122 |                 usage();
123 |                 exit(SDK_SUCCESS);
124 |             }
125 |             if(isArgSet("v", true)
126 |                     || isArgSet("version", false))
127 |             {
128 |                 std::cout << "SDK version : " << sampleVerStr.c_str()
129 |                           << std::endl;
130 |                 exit(0);
131 |             }
132 |             if(isArgSet("d",true)
133 |                     || isArgSet("deviceId",false))
134 |             {
135 |                 enableDeviceId = true;
136 |             }
137 |             return SDK_SUCCESS;
138 |         }
139 | };
140 | 
141 | }
142 | #endif
143 | 


--------------------------------------------------------------------------------
/HIP-Examples-Applications/include/SDKBitMap.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/include/SDKBitMap.hpp


--------------------------------------------------------------------------------
/HIP-Examples-Applications/include/SDKFile.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/include/SDKFile.hpp


--------------------------------------------------------------------------------
/HIP-Examples-Applications/include/SDKThread.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/HIP-Examples-Applications/include/SDKThread.hpp


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # HIP-Examples
 2 | 
 3 | ## Deprecation Notice
 4 | Please note that AMD will deprecate and archive the `hip-examples` repository. Please visit [rocm-examples](https://github.com/ROCm/rocm-examples), the new home for ROCm examples.
 5 | 
 6 | ## Examples for HIP.
 7 | This depot should be extracted into the root directory of an existing HIP depot.
 8 | 
 9 | HIP ports of the following benchmarks have been upstreamed to their GitHub repositories:
10 | 
11 | * mixbench: <https://github.com/ekondis/mixbench>
12 | * GPU-Stream: <https://github.com/UoB-HPC/GPU-STREAM>
13 | 
14 | mixbench and GPU-Stream have been added as submodules of this repository. To fetch the submodule contents, run:
15 | 
16 | ```bash
17 |     git submodule init
18 |     git submodule update
19 | ```
20 | 


--------------------------------------------------------------------------------
/add4/LICENSE:
--------------------------------------------------------------------------------
 1 | *==============================================================================
 2 | *------------------------------------------------------------------------------
 3 | * Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
 4 | * Based on John D. McCalpin’s original STREAM benchmark for CPUs
 5 | *------------------------------------------------------------------------------
 6 | * License:
 7 | *  1. You are free to use this program and/or to redistribute
 8 | *     this program.
 9 | *  2. You are free to modify this program for your own use,
10 | *     including commercial use, subject to the publication
11 | *     restrictions in item 3.
12 | *  3. You are free to publish results obtained from running this
13 | *     program, or from works that you derive from this program,
14 | *     with the following limitations:
15 | *     3a. In order to be referred to as "GPU-STREAM benchmark results",
16 | *         published results must be in conformance to the GPU-STREAM
17 | *         Run Rules published at
18 | *         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
19 | *         and incorporated herein by reference.
20 | *         The copyright holders retain the
21 | *         right to determine conformity with the Run Rules.
22 | *     3b. Results based on modified source code or on runs not in
23 | *         accordance with the GPU-STREAM Run Rules must be clearly
24 | *         labelled whenever they are published.  Examples of
25 | *         proper labelling include:
26 | *         "tuned GPU-STREAM benchmark results"
27 | *         "based on a variant of the GPU-STREAM benchmark code"
28 | *         Other comparable, clear and reasonable labelling is
29 | *         acceptable.
30 | *     3c. Submission of results to the GPU-STREAM benchmark web site
31 | *         is encouraged, but not required.
32 | *  4. Use of this program or creation of derived works based on this
33 | *     program constitutes acceptance of these licensing restrictions.
34 | *  5. Absolutely no warranty is expressed or implied.
35 | *———————————————————————————————————-------------------------------------------
36 | 


--------------------------------------------------------------------------------
/add4/Makefile:
--------------------------------------------------------------------------------
 1 | CXXFLAGS += -std=c++11 -O3
 2 | 
 3 | all:  gpu-stream-hip
 4 | 
 5 | common.o: common.cpp common.h Makefile
 6 | 
 7 | HIP_PATH?= $(wildcard /opt/rocm)
 8 | 
 9 | HIPCC=$(HIP_PATH)/bin/hipcc
10 | 
11 | hip-stream.o : hip-stream.cpp
12 | 	$(HIPCC) $(CXXFLAGS) -c $< -o $@
13 | 
14 | gpu-stream-hip: hip-stream.o common.o Makefile
15 | ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
16 | 	$(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@
17 | else
18 | 	$(error "Cannot find $(HIPCC), please install HIP toolkit")
19 | endif
20 | 
21 | 
22 | .PHONY: clean
23 | 
24 | clean:
25 | 	rm -f   gpu-stream-hip *.o
26 | 
27 | 


--------------------------------------------------------------------------------
/add4/README.md:
--------------------------------------------------------------------------------
 1 | Add4
 2 | =========
 3 | This benchmark is derived from the GPU-STREAM benchmark. 
 4 | To increase the proportion of reads in the benchmark's "add" kernel, the number of input arrays read by "add" is increased from two to four.
 5 | With this modification, the benchmark achieves 90% efficiency on the FIJI Nano GPU.
 6 | 
 7 | 
 8 | GPU-STREAM
 9 | ==========
10 | 
11 | Measure memory transfer rates to/from global device memory on GPUs.
12 | This benchmark is similar in spirit, and based on, the STREAM benchmark [1] for CPUs.
13 | 
14 | Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time.
15 | 
16 | Usage
17 | -----
18 | 
19 | Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6.5)
20 | 
21 | Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda`
22 | 
23 | For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
24 | http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
25 | Install the HCC compiler:
26 | https://bitbucket.org/multicoreware/hcc/wiki/Home
27 | Install HIP:
28 | https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
29 | 
30 | Build the HIP binary with `make gpu-stream-hip`, then run it with `./gpu-stream-hip`.
31 | 
32 | Android
33 | -------
34 | 
35 | Assuming you have a recent Android NDK available, you can use the
36 | toolchain that it provides to build GPU-STREAM. You should first
37 | use the NDK to generate a standalone toolchain:
38 | 
39 |     # Select a directory to install the toolchain to
40 |     ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
41 | 
42 |     ${NDK}/build/tools/make-standalone-toolchain.sh \
43 |       --platform=android-14 \
44 |       --toolchain=arm-linux-androideabi-4.8 \
45 |       --install-dir=${ANDROID_NATIVE_TOOLCHAIN}
46 | 
47 | Make sure that the OpenCL headers and library (libOpenCL.so) are
48 | available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
49 | 
50 | You should then be able to build GPU-STREAM:
51 | 
52 |     make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
53 | 
54 | Copy the executable and OpenCL kernels to the device:
55 | 
56 |     adb push gpu-stream-ocl /data/local/tmp
57 |     adb push ocl-stream-kernels.cl /data/local/tmp
58 | 
59 | Run GPU-STREAM from an adb shell:
60 | 
61 |     adb shell
62 |     cd /data/local/tmp
63 | 
64 |     # Use float if device doesn't support double, and reduce array size
65 |     ./gpu-stream-ocl --float -n 6 -s 10000000
66 | 
67 | Results
68 | -------
69 | 
70 | Sample results can be found in the `results` subdirectory. If you would like to submit updated results, please submit a Pull Request.
71 | 
72 | [1]: McCalpin, John D., 1995: "Memory Bandwidth and Machine Balance in Current High Performance Computers", IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter, December 1995.
73 | 


--------------------------------------------------------------------------------
/add4/buildit.sh:
--------------------------------------------------------------------------------
1 | make clean
2 | make gpu-stream-hip
3 | 


--------------------------------------------------------------------------------
/add4/common.cpp:
--------------------------------------------------------------------------------
  1 | /*=============================================================================
  2 | *------------------------------------------------------------------------------
  3 | * Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
  4 | * Based on John D. McCalpin’s original STREAM benchmark for CPUs
  5 | *------------------------------------------------------------------------------
  6 | * License:
  7 | *  1. You are free to use this program and/or to redistribute
  8 | *     this program.
  9 | *  2. You are free to modify this program for your own use,
 10 | *     including commercial use, subject to the publication
 11 | *     restrictions in item 3.
 12 | *  3. You are free to publish results obtained from running this
 13 | *     program, or from works that you derive from this program,
 14 | *     with the following limitations:
 15 | *     3a. In order to be referred to as "GPU-STREAM benchmark results",
 16 | *         published results must be in conformance to the GPU-STREAM
 17 | *         Run Rules published at
 18 | *         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
 19 | *         and incorporated herein by reference.
 20 | *         The copyright holders retain the
 21 | *         right to determine conformity with the Run Rules.
 22 | *     3b. Results based on modified source code or on runs not in
 23 | *         accordance with the GPU-STREAM Run Rules must be clearly
 24 | *         labelled whenever they are published.  Examples of
 25 | *         proper labelling include:
 26 | *         "tuned GPU-STREAM benchmark results"
 27 | *         "based on a variant of the GPU-STREAM benchmark code"
 28 | *         Other comparable, clear and reasonable labelling is
 29 | *         acceptable.
 30 | *     3c. Submission of results to the GPU-STREAM benchmark web site
 31 | *         is encouraged, but not required.
 32 | *  4. Use of this program or creation of derived works based on this
 33 | *     program constitutes acceptance of these licensing restrictions.
 34 | *  5. Absolutely no warranty is expressed or implied.
 35 | *———————————————————————————————————-----------------------------------------*/
 36 | 
 37 | #include "common.h"
 38 | 
// Run-time configuration shared through common.h. parseArguments() below
// overwrites these defaults from the command line.

// Default array size: 25 * 2^20 = 26214400 elements (--arraysize / -s).
// The previous default of 50 * 2^20 is kept commented out for reference.
// Use binary powers of two so the size is divisible by 1024.
//unsigned int ARRAY_SIZE = 52428800;
unsigned int ARRAY_SIZE = 26214400;
// Extra padding in bytes requested with --pad.
size_t   ARRAY_PAD_BYTES  = 0;

// Number of repetitions (--numtimes / -n).
unsigned int NTIMES = 10;

// Set by --float; false selects double precision.
bool useFloat = false;
// Set by --groups and --groupSize respectively.
unsigned int  groups   = 0;
unsigned int  groupSize   = 1024;

// Device index chosen with --device.
unsigned int deviceIndex = 0;
 52 | 
 53 | int parseUInt(const char *str, unsigned int *output)
 54 | {
 55 |     char *next;
 56 |     *output = strtoul(str, &next, 10);
 57 |     return !strlen(next);
 58 | }
 59 | 
 60 | int parseSize(const char *str, size_t *output)
 61 | {
 62 |     char *next;
 63 |     *output = strtoull(str, &next, 0);
 64 | 	int l = strlen(str);
 65 | 	if (l) {
 66 | 		char c = str[l-1]; // last char.
 67 | 		if ((c == 'k') || (c == 'K')) {
 68 | 			*output *= 1024;
 69 | 		}
 70 | 		if ((c == 'm') || (c == 'M')) {
 71 | 			*output *= (1024*1024);
 72 | 		}
 73 | 
 74 | 	}
 75 |     return !strlen(next);
 76 | }
 77 | 
 78 | 
 79 | void parseArguments(int argc, char *argv[])
 80 | {
 81 |     for (int i = 1; i < argc; i++)
 82 |     {
 83 |         if (!strcmp(argv[i], "--list"))
 84 |         {
 85 |             listDevices();
 86 |             exit(0);
 87 |         }
 88 |         else if (!strcmp(argv[i], "--device"))
 89 |         {
 90 |             if (++i >= argc || !parseUInt(argv[i], &deviceIndex))
 91 |             {
 92 |                 std::cout << "Invalid device index" << std::endl;
 93 |                 exit(1);
 94 |             }
 95 |         }
 96 |         else if (!strcmp(argv[i], "--arraysize") || !strcmp(argv[i], "-s"))
 97 |         {
 98 |             if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE))
 99 |             {
100 |                 std::cout << "Invalid array size" << std::endl;
101 |                 exit(1);
102 |             }
103 |         }
104 |         else if (!strcmp(argv[i], "--numtimes") || !strcmp(argv[i], "-n"))
105 |         {
106 |             if (++i >= argc || !parseUInt(argv[i], &NTIMES))
107 |             {
108 |                 std::cout << "Invalid number of times" << std::endl;
109 |                 exit(1);
110 |             }
111 |         }
112 |         else if (!strcmp(argv[i], "--groups"))
113 |         {
114 |             if (++i >= argc || !parseUInt(argv[i], &groups))
115 |             {
116 |                 std::cout << "Invalid group number" << std::endl;
117 |                 exit(1);
118 |             }
119 |         }
120 |         else if (!strcmp(argv[i], "--groupSize"))
121 |         {
122 |             if (++i >= argc || !parseUInt(argv[i], &groupSize))
123 |             {
124 |                 std::cout << "Invalid group size" << std::endl;
125 |                 exit(1);
126 |             }
127 |         }
128 |         else if (!strcmp(argv[i], "--pad"))
129 |         {
130 |             if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
131 |             {
132 |                 std::cout << "Invalid size" << std::endl;
133 |                 exit(1);
134 |             }
135 | 
136 |         }
137 |         else if (!strcmp(argv[i], "--float"))
138 |         {
139 |             useFloat = true;
140 |             std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl;
141 |         }
142 |         else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
143 |         {
144 |             std::cout << std::endl;
145 |             std::cout << "Usage: ./gpu-stream-cuda [OPTIONS]" << std::endl << std::endl;
146 |             std::cout << "Options:" << std::endl;
147 |             std::cout << "  -h  --help               Print the message" << std::endl;
148 |             std::cout << "      --list               List available devices" << std::endl;
149 |             std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
150 |             std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
151 |             std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
152 |             std::cout << "      --groups             Set number of groups to launch -  each work-item proceses multiple array items" << std::endl;
153 |             std::cout << "      --groupSize          Set size of each group (default 1024)" << std::endl;
154 |             std::cout << "      --pad                Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl;
155 |             std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
156 |             std::cout << std::endl;
157 |             exit(0);
158 |         }
159 |         else
160 |         {
161 |             std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')"
162 |                 << std::endl;
163 |             exit(1);
164 |         }
165 |     }
166 | }
167 | 


--------------------------------------------------------------------------------
/add4/common.h:
--------------------------------------------------------------------------------
  1 | /*=============================================================================
  2 | *------------------------------------------------------------------------------
  3 | * Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
  4 | * Based on John D. McCalpin’s original STREAM benchmark for CPUs
  5 | *------------------------------------------------------------------------------
  6 | * License:
  7 | *  1. You are free to use this program and/or to redistribute
  8 | *     this program.
  9 | *  2. You are free to modify this program for your own use,
 10 | *     including commercial use, subject to the publication
 11 | *     restrictions in item 3.
 12 | *  3. You are free to publish results obtained from running this
 13 | *     program, or from works that you derive from this program,
 14 | *     with the following limitations:
 15 | *     3a. In order to be referred to as "GPU-STREAM benchmark results",
 16 | *         published results must be in conformance to the GPU-STREAM
 17 | *         Run Rules published at
 18 | *         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
 19 | *         and incorporated herein by reference.
 20 | *         The copyright holders retain the
 21 | *         right to determine conformity with the Run Rules.
 22 | *     3b. Results based on modified source code or on runs not in
 23 | *         accordance with the GPU-STREAM Run Rules must be clearly
 24 | *         labelled whenever they are published.  Examples of
 25 | *         proper labelling include:
 26 | *         "tuned GPU-STREAM benchmark results"
 27 | *         "based on a variant of the GPU-STREAM benchmark code"
 28 | *         Other comparable, clear and reasonable labelling is
 29 | *         acceptable.
 30 | *     3c. Submission of results to the GPU-STREAM benchmark web site
 31 | *         is encouraged, but not required.
 32 | *  4. Use of this program or creation of derived works based on this
 33 | *     program constitutes acceptance of these licensing restrictions.
 34 | *  5. Absolutely no warranty is expressed or implied.
 35 | *———————————————————————————————————-----------------------------------------*/
 36 | 
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <limits>
#include <stdexcept>
 43 | 
// Version of this benchmark source.
#define VERSION_STRING "1.0"

// Parses the command line into the globals declared below; defined in
// common.cpp.
extern void parseArguments(int argc, char *argv[]);

// Called by parseArguments() for --list. It is not defined in common.cpp,
// so another translation unit has to provide it.
extern void listDevices(void);

// Number of elements per array (--arraysize / -s).
extern unsigned int ARRAY_SIZE;
// Extra array padding in bytes (--pad).
extern size_t       ARRAY_PAD_BYTES;
// Number of repetitions (--numtimes / -n).
extern unsigned int NTIMES;

// Launch configuration (--groups, --groupSize) and precision (--float).
extern unsigned int  groups;
extern unsigned int  groupSize;
extern bool useFloat;

// Device selected with --device.
extern unsigned int deviceIndex;
 59 | 
 60 | 
 61 | template < typename T >
 62 | void check_solution(void* a_in, void* b_in, void* c_in)
 63 | {
 64 |     // Generate correct solution
 65 |     T golda = 1.0;
 66 |     T goldb = 2.0;
 67 |     T goldc = 0.0;
 68 |     T goldd = 1.0;
 69 |     T golde = 1.0;
 70 |     T * a = static_cast<T*>(a_in);
 71 |     T * b = static_cast<T*>(b_in);
 72 |     T * c = static_cast<T*>(c_in);
 73 | 
 74 |     const T scalar = 3.0;
 75 | 
 76 |     for (unsigned int i = 0; i < NTIMES; i++)
 77 |     {
 78 |         // Double
 79 |         goldc = golda;
 80 |         goldb = scalar * goldc;
 81 |         goldc = golda + goldb + goldd + golde;
 82 |         golda = goldb + scalar * goldc;
 83 |     }
 84 | 
 85 |     // Calculate average error
 86 |     double erra = 0.0;
 87 |     double errb = 0.0;
 88 |     double errc = 0.0;
 89 | 
 90 |     for (unsigned int i = 0; i < ARRAY_SIZE; i++)
 91 |     {
 92 |         erra += fabs(a[i] - golda);
 93 |         errb += fabs(b[i] - goldb);
 94 |         errc += fabs(c[i] - goldc);
 95 |     }
 96 | 
 97 |     erra /= ARRAY_SIZE;
 98 |     errb /= ARRAY_SIZE;
 99 |     errc /= ARRAY_SIZE;
100 | 
101 |     double epsi = std::numeric_limits<T>::epsilon() * 100;
102 | 
103 |     if (erra > epsi)
104 |         std::cout
105 |             << "Validation failed on a[]. Average error " << erra
106 |             << std::endl;
107 |     if (errb > epsi)
108 |         std::cout
109 |             << "Validation failed on b[]. Average error " << errb
110 |             << std::endl;
111 |     if (errc > epsi)
112 |         std::cout
113 |             << "Validation failed on c[]. Average error " << errc
114 |             << std::endl;
115 | }
116 | 
117 | 


--------------------------------------------------------------------------------
/add4/run_sweep.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | #@groups=(0, 64, 128, 192, 256, 384, 512);
 4 | @groups=(64, 128, 192, 256, 384, 512);
 5 | @groupSize=(64, 128, 192, 256, 384, 512, 1024);
 6 | 
 7 | 
 8 | foreach $g (@groups) {
 9 |     foreach $gs (@groupSize) {
10 |         $f = "hipstream.float.$g.$gs";
11 |         $cmd = "./gpu-stream-hip --float --groups $g --groupSize $gs";
12 |         print "Run $f : $cmd\n";
13 |          
14 |         system "$cmd > $f";
15 | 
16 |     }
17 | }
18 | 
19 | foreach $g (@groups) {
20 |     foreach $gs (@groupSize) {
21 |         $f = "hipstream.double.$g.$gs";
22 |         $cmd = "./gpu-stream-hip --groups $g --groupSize $gs";
23 |         print "Run $f : $cmd\n";
24 |          
25 |         system "$cmd > $f";
26 | 
27 |     }
28 | }
29 | 


--------------------------------------------------------------------------------
/add4/runhip.sh:
--------------------------------------------------------------------------------
# Run ./gpu-stream-hip in four configurations (double and float, each with
# default and explicit 256x256 launch settings), echoing every command line
# before executing it.

run_case() {
    echo ./gpu-stream-hip "$@"
    ./gpu-stream-hip "$@"
}

run_case
run_case --groups 256 --groupSize 256
run_case --float
run_case --float --groups 256 --groupSize 256
9 | 


--------------------------------------------------------------------------------
/common/hip.all.make:
--------------------------------------------------------------------------------
# Convenience wrapper: includes the shared HIP prologue and epilogue
# makefiles from $(HIP_PATH)/examples/common. HIP_PATH is expanded when
# these lines are read, so it has to be set before this file is included.
include $(HIP_PATH)/examples/common/hip.prologue.make
include $(HIP_PATH)/examples/common/hip.epilogue.make
3 | 


--------------------------------------------------------------------------------
/common/hip.prologue.make:
--------------------------------------------------------------------------------
# This file is designed to be included at the beginning of a Makefile, right after setting HIP_PATH.
# Note: define $HIP_PATH before including this file.
# HIP_PATH should be relative to the parent makefile.
#
# It should not include any concrete makefile steps, so "make" still runs the first step in the Makefile.
# (The pattern rules below do not count as a default goal.)
#

#------
## Provide default if not already set:
HIP_PATH?=../..
# Recursively expanded ('='): hipconfig is re-run each time HIP_PLATFORM is referenced.
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)

# CUDA toolkit installation path
CUDA_DIR?=/usr/local/cuda-7.5
# CUDA toolkit libraries: use lib64 on x86_64 when that directory exists, otherwise lib.
CUDA_LIB_DIR := $(CUDA_DIR)/lib
ifeq ($(shell uname -m), x86_64)
  ifeq ($(shell if test -d $(CUDA_DIR)/lib64; then echo T; else echo F; fi), T)
    CUDA_LIB_DIR := $(CUDA_DIR)/lib64
  endif
endif

# Some samples mix openmp with gpu acceleration.
# Those unfortunately have to be compiled with gcc, not clang.
# nvcc (7.5) can handle openmp though.
# Use OMPCC and OMP_FLAGS for those samples.

# Compiler and linker driver are both hipcc.
HIPCC=$(HIP_PATH)/bin/hipcc
HIPLD=$(HIP_PATH)/bin/hipcc

#--
# Set up automatic make of HIP cpp dependencies
# TODO - this can be removed when HIP has a proper make structure.
#HIP_SOURCES = $(HIP_PATH)/src/hip_hcc.cpp

HIPCC_FLAGS += -I../../common
# Optimisation level, first match wins:
#   'make dbg=1' adds -g and no optimisation switch
#   'make opt=0' adds -O0, 'make opt=3' adds -O3
#   anything else adds -O2
ifeq ($(dbg),1)
	HIPCC_FLAGS += -g
	OMP_FLAGS += -g
else ifeq ($(opt),0)
	HIPCC_FLAGS += -O0
	OMP_FLAGS += -O0
else ifeq ($(opt),3)
	HIPCC_FLAGS += -O3
	OMP_FLAGS += -O3
else
	HIPCC_FLAGS += -O2
	OMP_FLAGS += -O2
endif

# Platform-specific settings, keyed on the compiler reported by hipconfig.
# nvcc: OMP_FLAGS is reassigned with '=', replacing what was appended above; it then tracks HIPCC_FLAGS.
# hcc:  OMP_FLAGS gains -fopenmp, HSA_PATH gets a default, and the %.o pattern rule is defined.
ifeq ($(HIP_PLATFORM), nvcc)
OMPCC = gcc
OMP_FLAGS = $(HIPCC_FLAGS)
HIP_DEPS = 

else ifeq ($(HIP_PLATFORM), hcc)
#HIP_DEPS = $(HIP_SOURCES:.cpp=.o)
OMPCC = gcc
OMP_FLAGS += -fopenmp

# Add dependencies to make hip_cc.o and other support files.
HSA_PATH ?= /opt/hsa
#HIP_SOURCES = $(HIP_PATH)/src/hip_hcc.cpp
#HIP_DEPS = $(HIP_SOURCES:.cpp=.o)
#$(HIP_DEPS): HIPCC_FLAGS += -I$(HSA_PATH)/include
# Same target, prerequisite and command as the pattern rule at the end of this file.
%.o:: %.cpp
	    $(HIPCC) $(HIPCC_FLAGS) $< -c -o $@
endif








#------
#
#---
# Rule for automatic HIPIFY call - assumes original cuda files are stored in local 'cusrc' directory.  See kmeans.
#%.cu : cusrc/%.cu
#	$(HIPIFY)  $< > $@
#%.cuh : cusrc/%.cuh
#	$(HIPIFY)  $< > $@


# OPT, GRID_LAUNCH_PATH and AM_PATH are not set in this file, and HSA_PATH is only
# defaulted in the hcc branch above; unset ones expand to empty strings here.
KCFLAGS += $(OPT) -I$(HSA_PATH)/include  -I$(HIP_PATH)/include -I$(GRID_LAUNCH_PATH) -I$(AM_PATH)/include

# Compile any .cpp file to an object with hipcc (applies on every platform).
%.o:: %.cpp
	$(HIPCC) $(HIPCC_FLAGS) $< -c -o $@
92 | 
93 | 
94 | 


--------------------------------------------------------------------------------
/cuda-stream/Makefile:
--------------------------------------------------------------------------------
# Build file for the HIP port of the STREAM benchmark.
#   make          build ./stream from stream.cpp
#   make clean    remove the binary and any object files

# Defaults to /opt/rocm when that directory exists (empty otherwise);
# may be overridden from the environment or the command line.
HIP_PATH?= $(wildcard /opt/rocm)

HIPCC=$(HIP_PATH)/bin/hipcc

CXXFLAGS += -std=c++11 -O3

# The ifeq is evaluated while the makefile is read and selects the recipe:
# compile with hipcc when it is found, otherwise fail with an error when
# the target is built.
stream: stream.cpp
ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
	${HIPCC} ${CXXFLAGS} -o $@ $^ 
else
	$(error "Cannot find $(HIPCC), please install HIP toolkit")
endif

.PHONY: clean

clean:
	rm -f stream *.o
18 | 
19 | 


--------------------------------------------------------------------------------
/cuda-stream/Makefile.titan:
--------------------------------------------------------------------------------
 1 | CC=gcc
 2 | ARCH=sm_35
 3 | 
 4 | stream : stream.cpp
 5 | 	hipcc -std=c++11 -ccbin=$(CC)  stream.cpp -arch=$(ARCH) -o stream
 6 | 
 7 | 
 8 | clean :
 9 | 	rm -f stream
10 | 


--------------------------------------------------------------------------------
/cuda-stream/README.md:
--------------------------------------------------------------------------------
 1 | This benchmark is adapted from the STREAM benchmark and implements the following kernels:
 2 |     COPY:       a(i) = b(i)
 3 |     SCALE:      a(i) = q*b(i)
 4 |     SUM:        a(i) = b(i) + c(i)
 5 |     TRIAD:      a(i) = b(i) + q*c(i)
 6 | 
 7 | To compile HIP version:
 8 |     make
 9 | To execute:
10 |     ./stream
11 | 
12 | To compile on NV node, use Makefile.titan.
13 | 


--------------------------------------------------------------------------------
/cuda-stream/stream.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |   STREAM benchmark implementation in CUDA.
  3 | 
  4 |     COPY:       a(i) = b(i)
  5 |     SCALE:      a(i) = q*b(i)
  6 |     SUM:        a(i) = b(i) + c(i)
  7 |     TRIAD:      a(i) = b(i) + q*c(i)
  8 | 
  9 |   It measures the memory system on the device.
  10 |   Arithmetic uses the `real` typedef, which is currently `double` (double precision).
 11 | 
 12 |   Code based on the code developed by John D. McCalpin
 13 |   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
 14 | 
 15 |   Written by: Massimiliano Fatica, NVIDIA Corporation
 16 | 
 17 |   Further modifications by: Ben Cumming, CSCS
 18 | 
 19 |   Ported to HIP by: Peng Sun, AMD
 20 | */
 21 | 
 22 | #include "hip/hip_runtime.h"
 23 | #define NTIMES  20
 24 | 
 25 | #include <string>
 26 | #include <vector>
 27 | 
 28 | #include <stdio.h>
 29 | #include <float.h>
 30 | #include <limits.h>
 31 | #include <unistd.h>
 32 | #include <sys/time.h>
 33 | 
 34 | #include <sys/time.h>
 35 | 
 36 | # ifndef MIN
 37 | # define MIN(x,y) ((x)<(y)?(x):(y))
 38 | # endif
 39 | # ifndef MAX
 40 | # define MAX(x,y) ((x)>(y)?(x):(y))
 41 | # endif
 42 | 
 43 | typedef double real;
 44 | 
 45 | static double   avgtime[4] = {0}, maxtime[4] = {0},
 46 |         mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
 47 | 
 48 | 
 49 | void print_help()
 50 | {
 51 |     printf(
 52 |         "Usage: stream [-s] [-n <elements>] [-b <blocksize>]\n\n"
 53 |         "  -s\n"
 54 |         "        Print results in SI units (by default IEC units are used)\n\n"
 55 |         "  -n <elements>\n"
 56 |         "        Put <elements> values in the arrays\n"
 57 |         "        (defaults to 1<<26)\n\n"
 58 |         "  -b <blocksize>\n"
 59 |         "        Use <blocksize> as the number of threads in each block\n"
 60 |         "        (defaults to 192)\n"
 61 |     );
 62 | }
 63 | 
 64 | void parse_options(int argc, char** argv, bool& SI, int& N, int& blockSize)
 65 | {
 66 |     // Default values
 67 |     SI = false;
 68 |     N = 1<<26;
 69 |     blockSize = 192;
 70 | 
 71 |     int c;
 72 | 
 73 |     while ((c = getopt (argc, argv, "sn:b:h")) != -1)
 74 |         switch (c)
 75 |         {
 76 |             case 's':
 77 |                 SI = true;
 78 |                 break;
 79 |             case 'n':
 80 |                 N = std::atoi(optarg);
 81 |                 break;
 82 |             case 'b':
 83 |                 blockSize = std::atoi(optarg);
 84 |                 break;
 85 |             case 'h':
 86 |                 print_help();
 87 |                 std::exit(0);
 88 |                 break;
 89 |             default:
 90 |                 print_help();
 91 |                 std::exit(1);
 92 |         }
 93 | }
 94 | 
 95 | /* A gettimeofday routine to give access to the wall
 96 |    clock timer on most UNIX-like systems.  */
 97 | 
 98 | 
 99 | double mysecond()
100 | {
101 |     struct timeval tp;
102 |     struct timezone tzp;
103 |     int i = gettimeofday(&tp,&tzp);
104 |     return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
105 | }
106 | 
107 | 
108 | template <typename T>
109 | __global__ void set_array(T *a,  T value, int len)
110 | {
111 |     int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
112 |     if (idx < len)
113 |         a[idx] = value;
114 | }
115 | 
116 | template <typename T>
117 | __global__ void STREAM_Copy(T *a, T *b, int len)
118 | {
119 |     int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
120 |     if (idx < len)
121 |         b[idx] = a[idx];
122 | }
123 | 
124 | template <typename T>
125 | __global__ void STREAM_Scale(T *a, T *b, T scale,  int len)
126 | {
127 |     int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
128 |     if (idx < len)
129 |         b[idx] = scale* a[idx];
130 | }
131 | 
132 | template <typename T>
133 | __global__ void STREAM_Add(T *a, T *b, T *c,  int len)
134 | {
135 |     int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
136 |     if (idx < len)
137 |         c[idx] = a[idx]+b[idx];
138 | }
139 | 
140 | template <typename T>
141 | __global__ void STREAM_Triad(T *a, T *b, T *c, T scalar, int len)
142 | {
143 |     int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
144 |     if (idx < len)
145 |         c[idx] = a[idx]+scalar*b[idx];
146 | }
147 | 
148 | int main(int argc, char** argv)
149 | {
150 |     real *d_a, *d_b, *d_c;
151 |     int j,k;
152 |     double times[4][NTIMES];
153 |     real scalar;
154 |     std::vector<std::string> label{"Copy:      ", "Scale:     ", "Add:       ", "Triad:     "};
155 | 
156 |     // Parse arguments
157 |     bool SI;
158 |     int N, blockSize;
159 |     parse_options(argc, argv, SI, N, blockSize);
160 | 
161 |     printf(" STREAM Benchmark implementation in HIP\n");
162 |     printf(" Array size (%s precision) =%7.2f MB\n", sizeof(double)==sizeof(real)?"double":"single", double(N)*double(sizeof(real))/1.e6);
163 | 
164 |     /* Allocate memory on device */
165 |     hipMalloc((void**)&d_a, sizeof(real)*N);
166 |     hipMalloc((void**)&d_b, sizeof(real)*N);
167 |     hipMalloc((void**)&d_c, sizeof(real)*N);
168 | 
169 |     /* Compute execution configuration */
170 |     dim3 dimBlock(blockSize);
171 |     dim3 dimGrid(N/dimBlock.x );
172 |     if( N % dimBlock.x != 0 ) dimGrid.x+=1;
173 | 
174 |     printf(" using %d threads per block, %d blocks\n",dimBlock.x,dimGrid.x);
175 | 
176 |     if (SI)
177 |         printf(" output in SI units (KB = 1000 B)\n");
178 |     else
179 |         printf(" output in IEC units (KiB = 1024 B)\n");
180 | 
181 |     /* Initialize memory on the device */
182 |     hipLaunchKernelGGL(set_array<real>, dim3(dimGrid), dim3(dimBlock), 0, 0, d_a, 2.f, N);
183 |     hipLaunchKernelGGL(set_array<real>, dim3(dimGrid), dim3(dimBlock), 0, 0, d_b, .5f, N);
184 |     hipLaunchKernelGGL(set_array<real>, dim3(dimGrid), dim3(dimBlock), 0, 0, d_c, .5f, N);
185 | 
186 |     /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */
187 | 
188 |     scalar=3.0f;
189 |     for (k=0; k<NTIMES; k++)
190 |     {
191 |         times[0][k]= mysecond();
192 |         hipLaunchKernelGGL(STREAM_Copy<real>, dim3(dimGrid), dim3(dimBlock), 0, 0, d_a, d_c, N);
193 |         hipDeviceSynchronize();
194 |         times[0][k]= mysecond() -  times[0][k];
195 | 
196 |         times[1][k]= mysecond();
197 |         hipLaunchKernelGGL(STREAM_Scale<real>, dim3(dimGrid), dim3(dimBlock), 0, 0, d_b, d_c, scalar,  N);
198 |         hipDeviceSynchronize();
199 |         times[1][k]= mysecond() -  times[1][k];
200 | 
201 |         times[2][k]= mysecond();
202 |         hipLaunchKernelGGL(STREAM_Add<real>, dim3(dimGrid), dim3(dimBlock), 0, 0, d_a, d_b, d_c,  N);
203 |         hipDeviceSynchronize();
204 |         times[2][k]= mysecond() -  times[2][k];
205 | 
206 |         times[3][k]= mysecond();
207 |         hipLaunchKernelGGL(STREAM_Triad<real>, dim3(dimGrid), dim3(dimBlock), 0, 0, d_b, d_c, d_a, scalar,  N);
208 |         hipDeviceSynchronize();
209 |         times[3][k]= mysecond() -  times[3][k];
210 |     }
211 | 
212 |     /*  --- SUMMARY --- */
213 | 
214 |     for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
215 |     {
216 |         for (j=0; j<4; j++)
217 |         {
218 |             avgtime[j] = avgtime[j] + times[j][k];
219 |             mintime[j] = MIN(mintime[j], times[j][k]);
220 |             maxtime[j] = MAX(maxtime[j], times[j][k]);
221 |         }
222 |     }
223 | 
224 |     double bytes[4] = {
225 |         2 * sizeof(real) * (double)N,
226 |         2 * sizeof(real) * (double)N,
227 |         3 * sizeof(real) * (double)N,
228 |         3 * sizeof(real) * (double)N
229 |     };
230 | 
231 |     // Use right units
232 |     const double G = SI ? 1.e9 : static_cast<double>(1<<30);
233 | 
234 |     printf("\nFunction      Rate %s  Avg time(s)  Min time(s)  Max time(s)\n",
235 |            SI ? "(GB/s) " : "(GiB/s)" );
236 |     printf("-----------------------------------------------------------------\n");
237 |     for (j=0; j<4; j++) {
238 |         avgtime[j] = avgtime[j]/(double)(NTIMES-1);
239 | 
240 |         printf("%s%11.4f     %11.8f  %11.8f  %11.8f\n", label[j].c_str(),
241 |                 bytes[j]/mintime[j] / G,
242 |                 avgtime[j],
243 |                 mintime[j],
244 |                 maxtime[j]);
245 |     }
246 | 
247 | 
248 |     /* Free memory on device */
249 |     hipFree(d_a);
250 |     hipFree(d_b);
251 |     hipFree(d_c);
252 | }
253 | 
254 | 


--------------------------------------------------------------------------------
/gpu-burn/AmdGpuMonitor.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Public domain.  No warranty.
 3 |  * Ville Timonen 2013
 4 |  * edited by Timmy Liu for HIP API 01/2016
 5 |  */
 6 | 
 7 | #include <fstream>
 8 | #include <iostream>
 9 | #include "AmdGpuMonitor.h"
10 | 
11 | // ---------------------------------------------------------------------------
12 | namespace gpuburn {
13 | 
14 | AmdGpuMonitor::AmdGpuMonitor(int id, std::string hwmon)
15 |     : GpuMonitor(id), mHwmonPath(hwmon)
16 | {
17 | }
18 | 
19 | AmdGpuMonitor::~AmdGpuMonitor()
20 | {
21 | }
22 | 
23 | // ---------------------------------------------------------------------------
24 | 
25 | float AmdGpuMonitor::getTemperature()
26 | {
27 |     float gpuTemp = -1;
28 | 
29 |     std::ifstream tempFile((mHwmonPath + "/temp1_input").c_str());
30 |     if (tempFile.is_open()) {
31 |         tempFile >> gpuTemp;
32 |         tempFile.close();
33 |     }
34 | 
35 |     // Hwmon exposes temperatures in milliCelcius
36 |     return gpuTemp / 1000.0f;
37 | }
38 | 
39 | }; //namespace gpuburn
40 | 


--------------------------------------------------------------------------------
/gpu-burn/AmdGpuMonitor.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Public domain.  No warranty.
 3 |  * Ville Timonen 2013
 4 |  * edited by Timmy Liu for HIP API 01/2016
 5 |  */
 6 | 
 7 | #ifndef GPUBURN_AMDGPUMONITOR_H_
 8 | #define GPUBURN_AMDGPUMONITOR_H_
 9 | 
10 | #include <string>
11 | #include "GpuMonitor.h"
12 | 
13 | // ---------------------------------------------------------------------------
14 | namespace gpuburn {
15 | 
16 | class AmdGpuMonitor : public GpuMonitor {
17 |     public:
18 |         /**
19 |          * Initialize an AmdGpuMonitor instance
20 |          *
21 |          * @hwmonPath is the kernel hwmon resource associated to this GPU
22 |          */
23 |         AmdGpuMonitor(int id, std::string hwmonPath);
24 |         virtual ~AmdGpuMonitor();
25 | 
26 |         virtual float getTemperature();
27 | 
28 |     private:
29 |         std::string mHwmonPath;
30 | };
31 | 
32 | }; // namespace gpuburn
33 | 
34 | // ---------------------------------------------------------------------------
35 | 
36 | #endif // GPUBURN_AMDGPUMONITOR_H_
37 | 


--------------------------------------------------------------------------------
/gpu-burn/BurnKernel.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Public domain.  No warranty.
  3 |  * Ville Timonen 2013
  4 |  * edited by Timmy Liu for HIP API 01/2016
  5 |  */
  6 | 
  7 | #include <iostream>
  8 | #include <thread>
  9 | #include "hip/hip_runtime.h"
 10 | 
 11 | #include "common.h"
 12 | #include "BurnKernel.h"
 13 | 
 14 | // ---------------------------------------------------------------------------
 15 | namespace gpuburn {
 16 | 
 17 | constexpr int BurnKernel::cRandSeed;
 18 | constexpr float BurnKernel::cUseMem;
 19 | constexpr uint32_t BurnKernel::cRowSize;
 20 | constexpr uint32_t BurnKernel::cMatrixSize;
 21 | constexpr uint32_t BurnKernel::cBlockSize;
 22 | constexpr float BurnKernel::cAlpha;
 23 | constexpr float BurnKernel::cBeta;
 24 | 
 25 | BurnKernel::BurnKernel(int hipDevice)
 26 |     : mHipDevice(hipDevice), mRunKernel(false),
 27 |     mDeviceAdata(NULL), mDeviceBdata(NULL), mDeviceCdata(NULL)
 28 | {
 29 | }
 30 | 
 31 | BurnKernel::~BurnKernel()
 32 | {
 33 |     if (mBurnThread)
 34 |         mBurnThread->join();
 35 | 
 36 |     if (mDeviceAdata)
 37 |         hipFree(mDeviceAdata);
 38 | 
 39 |     if (mDeviceBdata)
 40 |         hipFree(mDeviceBdata);
 41 | 
 42 |     if (mDeviceCdata)
 43 |         hipFree(mDeviceCdata);
 44 | }
 45 | 
 46 | // ---------------------------------------------------------------------------
 47 | 
 48 | extern "C" __global__ void hip_sgemm_kernel(const int M,
 49 |                                             const int N, const int K,
 50 |                                             const float alpha,
 51 |                                             float *A, const int lda, float *B,
 52 |                                             const int ldb, const float beta,
 53 |                                             float *C, const int ldc)
 54 | {
 55 |         //column major NN
 56 |         size_t idx_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 57 |         size_t idx_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
 58 |         size_t dim_x = hipGridDim_x * hipBlockDim_x;
 59 |         size_t myIdx = idx_y * dim_x + idx_x;
 60 | 
 61 |         float local_c = beta * C[myIdx];
 62 | 
 63 |         for(int k = 0; k < K; k++) {
 64 |           local_c += alpha * A[ idx_y + k * K] * B[ idx_x * K + k];
 65 |         }
 66 | 
 67 |         C[myIdx] = local_c;
 68 | }
 69 | 
 70 | // ---------------------------------------------------------------------------
 71 | 
 72 | int BurnKernel::Init()
 73 | {
 74 |     int hipDevice = bindHipDevice();
 75 | 
 76 |     std::string msg = "Init Burn Thread for device (" + std::to_string(hipDevice) + ")\n";
 77 |     std::cout << msg;
 78 | 
 79 |     srand(cRandSeed);
 80 |     for (int i = 0; i < cMatrixSize; ++i) {
 81 |         mHostAdata[i] = (rand() % 1000000)/100000.0;
 82 |         mHostBdata[i] = (rand() % 1000000)/100000.0;
 83 |     }
 84 | 
 85 |     size_t freeMem = getAvailableMemory() * cUseMem;
 86 |     size_t matrixSizeBytes = sizeof(float)*cMatrixSize;
 87 |     mNumIterations = (freeMem - (matrixSizeBytes*2))/matrixSizeBytes;
 88 | 
 89 |     checkError(hipMalloc((void**)&mDeviceAdata, matrixSizeBytes), "Alloc A");
 90 |     checkError(hipMalloc((void**)&mDeviceBdata, matrixSizeBytes), "Alloc B");
 91 |     checkError(hipMalloc((void**)&mDeviceCdata, matrixSizeBytes*mNumIterations), "Alloc C");
 92 | 
 93 |     checkError(hipMemcpy(mDeviceAdata, mHostAdata, matrixSizeBytes, hipMemcpyHostToDevice), "A -> device");
 94 |     checkError(hipMemcpy(mDeviceBdata, mHostBdata, matrixSizeBytes, hipMemcpyHostToDevice), "B -> device");
 95 |     checkError(hipMemset(mDeviceCdata, 0, matrixSizeBytes*mNumIterations), "C memset");
 96 | 
 97 |     return 0;
 98 | }
 99 | 
100 | int BurnKernel::startBurn()
101 | {
102 |     mRunKernel = true;
103 | 
104 |     mBurnThread = make_unique<std::thread>(&BurnKernel::threadMain, this);
105 |     return 0;
106 | }
107 | 
108 | int BurnKernel::threadMain()
109 | {
110 |     int err = 0;
111 |     int hipDevice = bindHipDevice();
112 |     std::string msg = "Burn Thread using device (" + std::to_string(hipDevice) + ")\n";
113 |     std::cout << msg;
114 | 
115 |     while (mRunKernel) {
116 |         err = runComputeKernel();
117 |     }
118 | 
119 |     return err;
120 | }
121 | 
122 | int BurnKernel::stopBurn()
123 | {
124 |     int hipDevice = bindHipDevice();
125 | 
126 |     std::string msg = "Stopping burn thread on device (" + std::to_string(hipDevice) + ")\n";
127 |     std::cout << msg;
128 | 
129 |     mRunKernel = false;
130 |     return 0;
131 | }
132 | 
133 | int BurnKernel::bindHipDevice()
134 | {
135 |     int hipDevice = -1;
136 |     hipSetDevice(mHipDevice);
137 |     hipGetDevice(&hipDevice);
138 |     return hipDevice;
139 | }
140 | 
141 | int BurnKernel::runComputeKernel()
142 | {
143 |     int err = 0;
144 | 
145 |     for (int i = 0; mRunKernel && i < mNumIterations; ++i) {
146 |         hipLaunchKernelGGL(
147 |             /* Launch params */
148 |             hip_sgemm_kernel,
149 |             dim3(cRowSize/cBlockSize, cRowSize/cBlockSize, 1),
150 |             dim3(cBlockSize,cBlockSize,1), 0, 0,
151 |             /* Kernel params */
152 |             cRowSize, cRowSize, cRowSize, cAlpha,
153 |             mDeviceAdata, cRowSize,
154 |             mDeviceBdata, cRowSize,
155 |             cBeta,
156 |             mDeviceCdata + i*cMatrixSize,
157 |             cRowSize);
158 |     }
159 |     checkError(hipDeviceSynchronize(), "Sync");
160 | 
161 |     return err;
162 | }
163 | 
164 | size_t BurnKernel::getAvailableMemory()
165 | {
166 |     size_t freeMem, totalMem;
167 |     checkError(hipMemGetInfo(&freeMem, &totalMem));
168 |     return freeMem;
169 | }
170 | 
171 | }; //namespace gpuburn
172 | 


--------------------------------------------------------------------------------
/gpu-burn/BurnKernel.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Public domain.  No warranty.
 3 |  * Ville Timonen 2013
 4 |  * edited by Timmy Liu for HIP API 01/2016
 5 |  */
 6 | 
 7 | #ifndef GPUBURN_BURNKERNEL_H_
 8 | #define GPUBURN_BURNKERNEL_H_
 9 | 
10 | #include <thread>
11 | 
12 | // ---------------------------------------------------------------------------
13 | namespace gpuburn {
14 | 
/**
 * BurnKernel runs a repeated SGEMM stress workload on one HIP device from
 * a dedicated worker thread.
 */
class BurnKernel {
    public:
        /**
         * @hipDevice is the index of the HIP device this instance binds to.
         * Explicit so an int is never silently converted to a BurnKernel.
         */
        explicit BurnKernel(int hipDevice);
        ~BurnKernel();

        int mHipDevice;

        /**
         * Fill the host matrices, allocate the device buffers and upload
         * the inputs. Must be called before startBurn().
         */
        int Init();

        /**
         * Run a stress workload on mHipDevice
         */
        int startBurn();

        /**
         * Stop the stress workload
         */
        int stopBurn();

    private:
        static constexpr int cRandSeed = 10;          // seed handed to srand()
        static constexpr float cUseMem = 0.80;        // fraction of free device memory to use
        static constexpr uint32_t cRowSize = 512;     // matrices are cRowSize x cRowSize
        static constexpr uint32_t cMatrixSize = cRowSize * cRowSize;
        static constexpr uint32_t cBlockSize = 16;    // thread block is cBlockSize x cBlockSize
        static constexpr float cAlpha = 1.0f;
        static constexpr float cBeta = 0.0f;

        // Host-side copies of the two input matrices.
        float mHostAdata[cMatrixSize];
        float mHostBdata[cMatrixSize];

        // Device buffers; mDeviceCdata holds mNumIterations output matrices.
        float* mDeviceAdata;
        float* mDeviceBdata;
        float* mDeviceCdata;

        // NOTE(review): written by the controlling thread and polled by the
        // worker thread without synchronization - consider std::atomic<bool>.
        bool mRunKernel;
        int mNumIterations;

        std::unique_ptr<std::thread> mBurnThread;

        int bindHipDevice();
        int threadMain();
        int runComputeKernel();
        size_t getAvailableMemory();

};
64 | 
65 | }; // namespace gpuburn
66 | 
67 | // ---------------------------------------------------------------------------
68 | 
69 | #endif // GPUBURN_BURNKERNEL_H_
70 | 


--------------------------------------------------------------------------------
/gpu-burn/GpuMonitor.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Public domain.  No warranty.
 3 |  * Ville Timonen 2013
 4 |  * edited by Timmy Liu for HIP API 01/2016
 5 |  */
 6 | 
 7 | #ifndef GPUBURN_GPUMONITOR_H_
 8 | #define GPUBURN_GPUMONITOR_H_
 9 | 
10 | // ---------------------------------------------------------------------------
namespace gpuburn {

/**
 * The GpuMonitor provides a generic interface to access common
 * GPU hardware data
 */
class GpuMonitor {
    public:
        virtual ~GpuMonitor() {}


        /**
         * Retrieve the current temperature in degrees Celsius
         * for this device.
         */
        virtual float getTemperature() = 0;

        /**
         * Retrieve the identifier this monitor was constructed with.
         */
        virtual int getId() { return mId; }

    protected:
        /**
         * Only concrete monitors may construct the base.
         *
         * @id is the identifier later returned by getId()
         */
        GpuMonitor(int id) : mId(id) {}

    private:
        int mId;
};

} // namespace gpuburn
42 | 
43 | // ---------------------------------------------------------------------------
44 | 
45 | #endif // GPUBURN_GPUMONITOR_H_
46 | 


--------------------------------------------------------------------------------
/gpu-burn/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | 
 3 | HIP_PLATFORM = $(shell $(HIP_PATH)/bin/hipconfig --platform)
 4 | 
 5 | HIP_INCLUDE = -I${HIP_PATH}/../include
 6 | 
 7 | BUILD_DIR ?= build
 8 | 
 9 | HIPCC = $(HIP_PATH)/bin/hipcc
10 | CPPFLAGS = -O3
11 | LDFLAGS = -lm -lpthread
12 | 
13 | ifeq (${HIP_PLATFORM}, nvcc)
14 |     CPPFLAGS += -arch=compute_20
15 | endif
16 | 
17 | GPUBURN_SRC = $(wildcard *.cpp)
18 | GPUBURN_OBJ = $(addprefix ${BUILD_DIR}/,$(subst .cpp,.o, $(GPUBURN_SRC)))
19 | GPUBURN_BIN = ${BUILD_DIR}/gpuburn-hip
20 | 
21 | .PHONY: all clean run itburn
22 | 
23 | all: ${GPUBURN_BIN}
24 | 
25 | ${GPUBURN_BIN}: ${GPUBURN_OBJ}
26 | 	${HIPCC} ${LDFLAGS} -o ${GPUBURN_BIN} ${GPUBURN_OBJ}
27 | 
28 | ${BUILD_DIR}/%.o: %.cpp Makefile
29 | 	mkdir -p ${BUILD_DIR}
30 | 	${HIPCC} ${HIP_INCLUDE} ${CPPFLAGS} -c -o $@ $<  
31 | 
32 | run: itburn
33 | itburn:
34 | 	HCC_LAZYINIT=ON ${GPUBURN_BIN}
35 | 
36 | clean:
37 | 	rm -rf ${BUILD_DIR}
38 | 


--------------------------------------------------------------------------------
/gpu-burn/common.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Public domain.  No warranty.
 3 |  * Ville Timonen 2013
 4 |  * edited by Timmy Liu for HIP API 01/2016
 5 |  */
 6 | 
 7 | #include "hip/hip_runtime.h"
 8 | #include "common.h"
 9 | 
10 | // ---------------------------------------------------------------------------
11 | namespace gpuburn {
12 | 
13 | int checkError(hipError_t err, std::string desc)
14 | {
15 |     if (err == hipSuccess)
16 |         return 0;
17 | 
18 |     std::string errStr = hipGetErrorString(err);
19 |     std::string errorMessage = "";
20 |     if (desc == "")
21 |         throw "Error: " + errStr + "\n";
22 |     else
23 |         throw "Error in \"" + desc + "\": " + errStr + "\n";
24 | 
25 |     return err;
26 | }
27 | 
28 | }; // namespace common
29 | 
30 | // ---------------------------------------------------------------------------
31 | 


--------------------------------------------------------------------------------
/gpu-burn/common.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Public domain.  No warranty.
 3 |  * Ville Timonen 2013
 4 |  * edited by Timmy Liu for HIP API 01/2016
 5 |  */
 6 | 
 7 | #ifndef GPUBURN_COMMON_H_
 8 | #define GPUBURN_COMMON_H_
 9 | 
10 | // ---------------------------------------------------------------------------
11 | namespace gpuburn {
12 | 
/**
 * Stand-in for std::make_unique, which C++11 does not provide.
 * Perfect-forwards `args` to T's constructor and wraps the new object
 * in a std::unique_ptr.
 *  Refer to: https://herbsutter.com/gotw/_102/
 */
template<typename T, typename... Args>
std::unique_ptr<T> make_unique(Args&&... args)
{
    T* raw = new T(std::forward<Args>(args)...);
    return std::unique_ptr<T>(raw);
}
22 | 
23 | int checkError(hipError_t err, std::string desc = "");
24 | 
25 | }; // namespace common
26 | 
27 | // ---------------------------------------------------------------------------
28 | 
29 | #endif // GPUBURN_COMMON_H_
30 | 


--------------------------------------------------------------------------------
/gpu-burn/gpuburn.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Public domain.  No warranty.
  3 |  * Ville Timonen 2013
  4 |  * edited by Timmy Liu for HIP API 01/2016
  5 |  */
  6 | #include <iostream>
  7 | #include <sstream>
  8 | #include <fstream>
  9 | #include <memory>
 10 | #include <vector>
 11 | #include <unistd.h>
 12 | #include <sys/stat.h>
 13 | #include "hip/hip_runtime.h"
 14 | 
 15 | #include "common.h"
 16 | #include "AmdGpuMonitor.h"
 17 | #include "BurnKernel.h"
 18 | 
 19 | // ---------------------------------------------------------------------------
 20 | using namespace gpuburn;
 21 | 
 22 | std::vector<std::unique_ptr<BurnKernel>> genBurnKernels()
 23 | {
 24 |     int deviceCount = 0;
 25 |     std::vector<std::unique_ptr<BurnKernel>> kernels;
 26 | 
 27 |     try {
 28 |         checkError(hipGetDeviceCount(&deviceCount));
 29 |         std::cout<<"Total no. of GPUs found: "<<deviceCount<<std::endl;
 30 |     } catch (std::string e) {
 31 |         std::cerr << "Error: couldn't find any HIP devices\n";
 32 |     }
 33 | 
 34 |     for (int i =0; i < deviceCount; ++i) {
 35 |         try {
 36 |             std::unique_ptr<BurnKernel> kernel(new BurnKernel(i));
 37 |             kernel->Init();
 38 |             kernels.push_back(std::move(kernel));
 39 |         } catch (std::string e) {
 40 |             std::cerr << e;
 41 |             std::cerr << "Error: failed to initialize hip device " << i << "\n";
 42 |         }
 43 |     }
 44 | 
 45 |     return kernels;
 46 | }
 47 | 
 48 | std::vector<std::unique_ptr<GpuMonitor>> genGpuMonitors()
 49 | {
 50 |     int deviceCount = 0;
 51 |     std::vector<std::unique_ptr<GpuMonitor>> monitors;
 52 | 
 53 |     for (int i = 0; true; i++) {
 54 |             struct stat dirInfo;
 55 |             std::string hwmonDir = "/sys/class/hwmon/hwmon" + std::to_string(i);
 56 | 
 57 |             if (stat(hwmonDir.c_str(), &dirInfo))
 58 |                 break;
 59 | 
 60 |             std::string hwmonName;
 61 |             std::ifstream hwmon(hwmonDir + "/name");
 62 | 
 63 |             if (!hwmon.good())
 64 |                 continue;
 65 | 
 66 |             hwmon >> hwmonName;
 67 |             if (hwmonName == "amdgpu") {
 68 |                 GpuMonitor* monitor = new AmdGpuMonitor(i, "/sys/class/hwmon/hwmon" + std::to_string(i));
 69 |                 std::unique_ptr<GpuMonitor> uniq_monitor(monitor);
 70 |                 monitors.push_back(std::move(uniq_monitor));
 71 |             }
 72 |     }
 73 | 
 74 |     return monitors;
 75 | }
 76 | 
 77 | int doBurn(int burnSec) {
 78 |     std::vector<std::unique_ptr<BurnKernel>> burnKernels = genBurnKernels();
 79 |     std::vector<std::unique_ptr<GpuMonitor>> gpuMonitors = genGpuMonitors();
 80 | 
 81 |     if (burnKernels.size() == 0)
 82 |         return -ENOENT;
 83 | 
 84 |     for (auto& kernel : burnKernels) {
 85 |         kernel->startBurn();
 86 |     }
 87 | 
 88 |     for (; burnSec > 0; --burnSec) {
 89 |         std::ostringstream msg;
 90 |         msg << "Temps: ";
 91 |         for (auto& monitor : gpuMonitors) {
 92 |             msg << "[GPU" << monitor->getId() << ": " << monitor->getTemperature() << " C] ";
 93 |         }
 94 |         msg << burnSec << "s\n";
 95 |         std::cout << msg.str();
 96 |         sleep(1);
 97 |     }
 98 | 
 99 |     for (auto& kernel : burnKernels) {
100 |         kernel->stopBurn();
101 |     }
102 | 
103 |     return 0;
104 | }
105 | 
106 | int main(int argc, char **argv) {
107 |     int opt;
108 |     int burnSec = 10;
109 | 
110 |     while ((opt = getopt (argc, argv, "ht:")) != -1)
111 |         switch (opt)
112 |         {
113 |             case 't':
114 |                 burnSec = atoi(optarg);
115 |                 break;
116 |             case 'h':
117 |             default:
118 |                 std::cerr << "Usage: " << argv[0] << " [-t sec]\n";
119 |                 return -EINVAL;
120 |         }
121 | 
122 |     return doBurn(burnSec);
123 | }
124 | 
125 | 
126 | // ---------------------------------------------------------------------------
127 | 


--------------------------------------------------------------------------------
/mini-nbody/README.md:
--------------------------------------------------------------------------------
 1 | mini-nbody: A simple N-body Code
 2 | ================================
 3 | 
 4 | A simple gravitational N-body simulation in less than 100 lines of C code, with CUDA optimizations.
 5 | 
 6 | Benchmarks
 7 | ----------
 8 | 
 9 | There are 5 different benchmarks provided for CUDA and MIC platforms.
10 | 
11 | 1. nbody-orig : the original, unoptimized simulation (also for CPU)
12 | 2. nbody-soa  : Conversion from array of structures (AOS) data layout to structure of arrays (SOA) data layout
13 | 3. nbody-flush : Flush denormals to zero (no code changes, just a command line option)
14 | 4. nbody-block : Cache blocking
15 | 5. nbody-unroll / nbody-align : platform specific final optimizations (loop unrolling in CUDA, and data alignment on MIC)
16 | 
17 | Files
18 | -----
19 | 
20 | nbody.c : simple, unoptimized OpenMP C code
21 | timer.h : simple cross-OS timing code
22 | 
23 | Each directory below includes scripts for building and running a "shmoo" of five successive optimizations of the code over a range of data sizes from 1024 to 524,288 bodies.
24 | 
25 | cuda/ : folder containing CUDA optimized versions of the original C code (in order of performance on Tesla K20c GPU)
26 |   1. nbody-orig.cu   : a straight port of the code to CUDA (shmoo-cuda-nbody-orig.sh)
27 |   2. nbody-soa.cu    : conversion to structure of arrays (SOA) data layout (shmoo-cuda-nbody-soa.sh)
28 |   3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-cuda-nbody-ftz.sh)
29 |   4. nbody-block.cu  : cache blocking in CUDA shared memory (shmoo-cuda-nbody-block.sh)
30 |   5. nbody-unroll.cu : addition of "#pragma unroll" to inner loop (shmoo-cuda-nbody-unroll.sh)
31 | 
32 | HIP/ : folder containing HIP optimized versions of the original C code (in order of performance on FIJI NANO)
33 |   1. nbody-orig.cpp   : a straight port of the code to HIP (HIP-nbody-orig.sh)
34 |   2. nbody-soa.cpp    : conversion to structure of arrays (SOA) data layout (HIP-nbody-soa.sh)
35 |   3. nbody-block.cpp  : cache blocking in HIP shared memory (HIP-nbody-block.sh)
36 | 
37 | 
38 | 
39 | mic/  : folder containing Intel Xeon Phi (MIC) optimized versions of the original C code (in order of performance on Xeon Phi 7110P)
40 |   1. ../nbody-orig.cu : original code (shmoo-mic-nbody-orig.sh)
41 |   2. nbody-soa.c     : conversion to structure of arrays (SOA) data layout (shmoo-mic-nbody-soa.sh)
42 |   3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-mic-nbody-ftz.sh)
43 |   4. nbody-block.c   : cache blocking via loop splitting (shmoo-mic-nbody-block.sh)
44 |   5. nbody-align.c   : aligned memory allocation and vector access (shmoo-mic-nbody-align.sh)
45 | 
46 | 


--------------------------------------------------------------------------------
/mini-nbody/cuda/nbody-block.cu:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include "timer.h"
  5 | 
  6 | #define BLOCK_SIZE 256
  7 | #define SOFTENING 1e-9f
  8 | 
  9 | typedef struct { float4 *pos, *vel; } BodySystem;
 10 | 
 11 | void randomizeBodies(float *data, int n) {
 12 |   for (int i = 0; i < n; i++) {
 13 |     data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
 14 |   }
 15 | }
 16 | 
 17 | __global__
 18 | void bodyForce(float4 *p, float4 *v, float dt, int n) {
 19 |   int i = blockDim.x * blockIdx.x + threadIdx.x;
 20 |   if (i < n) {
 21 |     float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
 22 | 
 23 |     for (int tile = 0; tile < gridDim.x; tile++) {
 24 |       __shared__ float3 spos[BLOCK_SIZE];
 25 |       float4 tpos = p[tile * blockDim.x + threadIdx.x];
 26 |       spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z);
 27 |       __syncthreads();
 28 | 
 29 |       for (int j = 0; j < BLOCK_SIZE; j++) {
 30 |         float dx = spos[j].x - p[i].x;
 31 |         float dy = spos[j].y - p[i].y;
 32 |         float dz = spos[j].z - p[i].z;
 33 |         float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
 34 |         float invDist = rsqrtf(distSqr);
 35 |         float invDist3 = invDist * invDist * invDist;
 36 | 
 37 |         Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
 38 |       }
 39 |       __syncthreads();
 40 |     }
 41 | 
 42 |     v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
 43 |   }
 44 | }
 45 | 
 46 | int main(const int argc, const char** argv) {
 47 |   
 48 |   int nBodies = 30000;
 49 |   if (argc > 1) nBodies = atoi(argv[1]);
 50 |   
 51 |   const float dt = 0.01f; // time step
 52 |   const int nIters = 10;  // simulation iterations
 53 |   
 54 |   int bytes = 2*nBodies*sizeof(float4);
 55 |   float *buf = (float*)malloc(bytes);
 56 |   BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };
 57 | 
 58 |   randomizeBodies(buf, 8*nBodies); // Init pos / vel data
 59 | 
 60 |   float *d_buf;
 61 |   cudaMalloc(&d_buf, bytes);
 62 |   BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };
 63 | 
 64 |   int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
 65 |   double totalTime = 0.0; 
 66 | 
 67 |   for (int iter = 1; iter <= nIters; iter++) {
 68 |     StartTimer();
 69 | 
 70 |     cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
 71 |     bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
 72 |     cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
 73 | 
 74 |     for (int i = 0 ; i < nBodies; i++) { // integrate position
 75 |       p.pos[i].x += p.vel[i].x*dt;
 76 |       p.pos[i].y += p.vel[i].y*dt;
 77 |       p.pos[i].z += p.vel[i].z*dt;
 78 |     }
 79 | 
 80 |     const double tElapsed = GetTimer() / 1000.0;
 81 |     if (iter > 1) { // First iter is warm up
 82 |       totalTime += tElapsed; 
 83 |     }
 84 | #ifndef SHMOO
 85 |     printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
 86 | #endif
 87 |   }
 88 |   double avgTime = totalTime / (double)(nIters-1); 
 89 | 
 90 | #ifdef SHMOO
 91 |   printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
 92 | #else
 93 |   printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
 94 |          nIters, rate);
 95 |   printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
 96 | #endif
 97 |   free(buf);
 98 |   cudaFree(d_buf);
 99 | }
100 | 


--------------------------------------------------------------------------------
/mini-nbody/cuda/nbody-orig.cu:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include "timer.h"
 5 | 
// Threads per block for the bodyForce launch.
#define BLOCK_SIZE 256
// Added to every squared distance so the inverse square root stays finite
// when two positions coincide (e.g. a body paired with itself).
#define SOFTENING 1e-9f

// One body: position (x, y, z) followed by velocity (vx, vy, vz).
typedef struct { float x, y, z, vx, vy, vz; } Body;
10 | 
// Writes n pseudo-random floats in [-1, 1], produced by rand(), into data.
void randomizeBodies(float *data, int n) {
  for (int remaining = n; remaining > 0; --remaining) {
    const float unit = rand() / (float)RAND_MAX;  // scaled into [0, 1]
    *data = 2.0f * unit - 1.0f;
    ++data;
  }
}
16 | 
// One thread per body: accumulates the softened inverse-square pull of all n
// bodies on body `self`, then advances that body's velocity by dt.
__global__
void bodyForce(Body *p, float dt, int n) {
  const int self = blockDim.x * blockIdx.x + threadIdx.x;
  if (self >= n) return;  // threads past the last body have nothing to do

  float ax = 0.0f, ay = 0.0f, az = 0.0f;

  int other = 0;
  while (other < n) {
    const float rx = p[other].x - p[self].x;
    const float ry = p[other].y - p[self].y;
    const float rz = p[other].z - p[self].z;
    const float r2 = rx*rx + ry*ry + rz*rz + SOFTENING;
    const float inv = rsqrtf(r2);
    const float inv3 = inv * inv * inv;

    ax += rx * inv3;
    ay += ry * inv3;
    az += rz * inv3;
    ++other;
  }

  p[self].vx += dt*ax;
  p[self].vy += dt*ay;
  p[self].vz += dt*az;
}
37 | 
/*
 * Benchmark driver: runs nIters force/integration steps for nBodies bodies
 * (default 30000, or argv[1]) and reports the all-pairs interaction rate.
 *
 * Each timed step copies the body array to the GPU, launches bodyForce to
 * update velocities, copies the array back and integrates positions on the
 * host. The first iteration is a warm-up and is left out of the average.
 * Returns 0 on success, 1 on invalid input or an allocation/CUDA failure.
 */
int main(const int argc, const char** argv) {

  int nBodies = 30000;
  if (argc > 1) nBodies = atoi(argv[1]);
  if (nBodies <= 0) {
    fprintf(stderr, "ERROR: number of bodies must be a positive integer\n");
    return 1;
  }

  const float dt = 0.01f; // time step
  const int nIters = 10;  // simulation iterations

  // size_t keeps the byte count from overflowing int for large body counts.
  const size_t bytes = (size_t)nBodies * sizeof(Body);
  float *buf = (float*)malloc(bytes);
  if (buf == NULL) {
    fprintf(stderr, "ERROR: host allocation of %zu bytes failed\n", bytes);
    return 1;
  }
  Body *p = (Body*)buf;

  randomizeBodies(buf, 6*nBodies); // Init pos / vel data

  float *d_buf = NULL;
  cudaError_t err = cudaMalloc(&d_buf, bytes);
  if (err != cudaSuccess) {
    fprintf(stderr, "ERROR: cudaMalloc failed: %s\n", cudaGetErrorString(err));
    free(buf);
    return 1;
  }
  Body *d_p = (Body*)d_buf;

  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
  double totalTime = 0.0;

  for (int iter = 1; iter <= nIters; iter++) {
    StartTimer();

    err = cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
      bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces
      err = cudaGetLastError(); // reports launch-configuration errors
    }
    if (err == cudaSuccess) {
      err = cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
      fprintf(stderr, "ERROR: CUDA failure in iteration %d: %s\n",
              iter, cudaGetErrorString(err));
      free(buf);
      cudaFree(d_buf);
      return 1;
    }

    for (int i = 0 ; i < nBodies; i++) { // integrate position
      p[i].x += p[i].vx*dt;
      p[i].y += p[i].vy*dt;
      p[i].z += p[i].vz*dt;
    }

    const double tElapsed = GetTimer() / 1000.0;
    if (iter > 1) { // First iter is warm up
      totalTime += tElapsed;
    }
#ifndef SHMOO
    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
#endif
  }
  double avgTime = totalTime / (double)(nIters-1);

#ifdef SHMOO
  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#else
  // The original referenced an undeclared `rate` and a deviation that is never
  // tracked; report the mean step rate derived from avgTime instead.
  printf("Average rate for iterations 2 through %d: %.3f steps per second.\n",
         nIters, 1.0 / avgTime);
  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#endif
  free(buf);
  cudaFree(d_buf);
  return 0;
}
92 | 


--------------------------------------------------------------------------------
/mini-nbody/cuda/nbody-soa.cu:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include "timer.h"
 5 | 
// Threads per block for the bodyForce launch.
#define BLOCK_SIZE 256
// Added to every squared distance so the inverse square root stays finite
// when two positions coincide (e.g. a body paired with itself).
#define SOFTENING 1e-9f

// View of one body buffer: pos and vel each point at an array of float4.
typedef struct { float4 *pos, *vel; } BodySystem;
10 | 
// Fills data[0 .. n-1] with pseudo-random values in [-1, 1] taken from rand().
void randomizeBodies(float *data, int n) {
  int idx = 0;
  while (idx < n) {
    const float unit = rand() / (float)RAND_MAX;  // scaled into [0, 1]
    data[idx] = 2.0f * unit - 1.0f;
    ++idx;
  }
}
16 | 
// One thread per body: sums the softened inverse-square pull that every
// position in p exerts on p[self], then advances v[self] by dt.
__global__
void bodyForce(float4 *p, float4 *v, float dt, int n) {
  const int self = blockDim.x * blockIdx.x + threadIdx.x;
  if (self >= n) return;  // threads past the last body have nothing to do

  float ax = 0.0f, ay = 0.0f, az = 0.0f;

  for (int other = 0; other != n; ++other) {
    const float rx = p[other].x - p[self].x;
    const float ry = p[other].y - p[self].y;
    const float rz = p[other].z - p[self].z;
    const float r2 = rx*rx + ry*ry + rz*rz + SOFTENING;
    const float inv = rsqrtf(r2);
    const float inv3 = inv * inv * inv;

    ax += rx * inv3;
    ay += ry * inv3;
    az += rz * inv3;
  }

  v[self].x += dt*ax;
  v[self].y += dt*ay;
  v[self].z += dt*az;
}
37 | 
/*
 * Benchmark driver: runs nIters force/integration steps for nBodies bodies
 * (default 30000, or argv[1]) and reports the all-pairs interaction rate.
 *
 * Each timed step copies the whole body buffer to the GPU, launches bodyForce
 * to update velocities, copies the buffer back and integrates positions on the
 * host. The first iteration is a warm-up and is left out of the average.
 * Returns 0 on success, 1 on invalid input or an allocation/CUDA failure.
 */
int main(const int argc, const char** argv) {

  int nBodies = 30000;
  if (argc > 1) nBodies = atoi(argv[1]);
  if (nBodies <= 0) {
    fprintf(stderr, "ERROR: number of bodies must be a positive integer\n");
    return 1;
  }

  const float dt = 0.01f; // time step
  const int nIters = 10;  // simulation iterations

  // size_t keeps the byte count from overflowing int for large body counts.
  const size_t bytes = 2 * (size_t)nBodies * sizeof(float4);
  float *buf = (float*)malloc(bytes);
  if (buf == NULL) {
    fprintf(stderr, "ERROR: host allocation of %zu bytes failed\n", bytes);
    return 1;
  }
  // One buffer: all positions first, then all velocities.
  BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };

  randomizeBodies(buf, 8*nBodies); // Init pos / vel data

  float *d_buf = NULL;
  cudaError_t err = cudaMalloc(&d_buf, bytes);
  if (err != cudaSuccess) {
    fprintf(stderr, "ERROR: cudaMalloc failed: %s\n", cudaGetErrorString(err));
    free(buf);
    return 1;
  }
  BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };

  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
  double totalTime = 0.0;

  for (int iter = 1; iter <= nIters; iter++) {
    StartTimer();

    err = cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
      bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
      err = cudaGetLastError(); // reports launch-configuration errors
    }
    if (err == cudaSuccess) {
      err = cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
      fprintf(stderr, "ERROR: CUDA failure in iteration %d: %s\n",
              iter, cudaGetErrorString(err));
      free(buf);
      cudaFree(d_buf);
      return 1;
    }

    for (int i = 0 ; i < nBodies; i++) { // integrate position
      p.pos[i].x += p.vel[i].x*dt;
      p.pos[i].y += p.vel[i].y*dt;
      p.pos[i].z += p.vel[i].z*dt;
    }

    const double tElapsed = GetTimer() / 1000.0;
    if (iter > 1) { // First iter is warm up
      totalTime += tElapsed;
    }
#ifndef SHMOO
    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
#endif
  }
  double avgTime = totalTime / (double)(nIters-1);

#ifdef SHMOO
  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#else
  // The original referenced an undeclared `rate` and a deviation that is never
  // tracked; report the mean step rate derived from avgTime instead.
  printf("Average rate for iterations 2 through %d: %.3f steps per second.\n",
         nIters, 1.0 / avgTime);
  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#endif
  free(buf);
  cudaFree(d_buf);
  return 0;
}
92 | 


--------------------------------------------------------------------------------
/mini-nbody/cuda/nbody-unroll.cu:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include "timer.h"
  5 | 
// Threads per block for the bodyForce launch; also the length of the
// shared-memory position tile inside the kernel.
#define BLOCK_SIZE 256
// Added to every squared distance so the inverse square root stays finite
// when two positions coincide (e.g. a body paired with itself).
#define SOFTENING 1e-9f

// View of one body buffer: pos and vel each point at an array of float4.
typedef struct { float4 *pos, *vel; } BodySystem;
 10 | 
 11 | void randomizeBodies(float *data, int n) {
 12 |   for (int i = 0; i < n; i++) {
 13 |     data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
 14 |   }
 15 | }
 16 | 
 17 | __global__
 18 | void bodyForce(float4 *p, float4 *v, float dt, int n) {
 19 |   int i = blockDim.x * blockIdx.x + threadIdx.x;
 20 |   if (i < n) {
 21 |     float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
 22 | 
 23 |     for (int tile = 0; tile < gridDim.x; tile++) {
 24 |       __shared__ float3 spos[BLOCK_SIZE];
 25 |       float4 tpos = p[tile * blockDim.x + threadIdx.x];
 26 |       spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z);
 27 |       __syncthreads();
 28 | 
 29 |       #pragma unroll
 30 |       for (int j = 0; j < BLOCK_SIZE; j++) {
 31 |         float dx = spos[j].x - p[i].x;
 32 |         float dy = spos[j].y - p[i].y;
 33 |         float dz = spos[j].z - p[i].z;
 34 |         float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
 35 |         float invDist = rsqrtf(distSqr);
 36 |         float invDist3 = invDist * invDist * invDist;
 37 | 
 38 |         Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
 39 |       }
 40 |       __syncthreads();
 41 |     }
 42 | 
 43 |     v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
 44 |   }
 45 | }
 46 | 
 47 | int main(const int argc, const char** argv) {
 48 |   
 49 |   int nBodies = 30000;
 50 |   if (argc > 1) nBodies = atoi(argv[1]);
 51 |   
 52 |   const float dt = 0.01f; // time step
 53 |   const int nIters = 10;  // simulation iterations
 54 |   
 55 |   int bytes = 2*nBodies*sizeof(float4);
 56 |   float *buf = (float*)malloc(bytes);
 57 |   BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };
 58 | 
 59 |   randomizeBodies(buf, 8*nBodies); // Init pos / vel data
 60 | 
 61 |   float *d_buf;
 62 |   cudaMalloc(&d_buf, bytes);
 63 |   BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };
 64 | 
 65 |   int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
 66 |   double totalTime = 0.0; 
 67 | 
 68 |   for (int iter = 1; iter <= nIters; iter++) {
 69 |     StartTimer();
 70 | 
 71 |     cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
 72 |     bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
 73 |     cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
 74 | 
 75 |     for (int i = 0 ; i < nBodies; i++) { // integrate position
 76 |       p.pos[i].x += p.vel[i].x*dt;
 77 |       p.pos[i].y += p.vel[i].y*dt;
 78 |       p.pos[i].z += p.vel[i].z*dt;
 79 |     }
 80 | 
 81 |     const double tElapsed = GetTimer() / 1000.0;
 82 |     if (iter > 1) { // First iter is warm up
 83 |       totalTime += tElapsed; 
 84 |     }
 85 | #ifndef SHMOO
 86 |     printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
 87 | #endif
 88 |   }
 89 |   double avgTime = totalTime / (double)(nIters-1); 
 90 | 
 91 | #ifdef SHMOO
 92 |   printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
 93 | #else
 94 |   printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
 95 |          nIters, rate);
 96 |   printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
 97 | #endif
 98 |   free(buf);
 99 |   cudaFree(d_buf);
100 | }
101 | 


--------------------------------------------------------------------------------
/mini-nbody/cuda/shmoo-cuda-nbody-block.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody-block.cu
 2 | EXE=nbody-block
 3 | 
 4 | nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO
 5 | 
 6 | echo $EXE
 7 | 
 8 | K=1024
 9 | for i in {1..10}
10 | do
11 |     ./$EXE $K
12 |     K=$(($K*2))
13 | done
14 | 
15 | 


--------------------------------------------------------------------------------
/mini-nbody/cuda/shmoo-cuda-nbody-ftz.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody-soa.cu
 2 | EXE=nbody-ftz
 3 | 
 4 | nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO
 5 | 
 6 | echo $EXE
 7 | 
 8 | K=1024
 9 | for i in {1..10}
10 | do
11 |     ./$EXE $K
12 |     K=$(($K*2))
13 | done
14 | 
15 | 


--------------------------------------------------------------------------------
/mini-nbody/cuda/shmoo-cuda-nbody-orig.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody-orig.cu
 2 | EXE=nbody-orig
 3 | 
 4 | nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC
 5 | 
 6 | echo $EXE
 7 | 
 8 | K=1024
 9 | for i in {1..10}
10 | do
11 |     ./$EXE $K
12 |     K=$(($K*2))
13 | done
14 | 
15 | 


--------------------------------------------------------------------------------
/mini-nbody/cuda/shmoo-cuda-nbody-soa.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody-soa.cu
 2 | EXE=nbody-soa
 3 | 
 4 | nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC
 5 | 
 6 | echo $EXE
 7 | 
 8 | K=1024
 9 | for i in {1..10}
10 | do
11 |     ./$EXE $K
12 |     K=$(($K*2))
13 | done
14 | 
15 | 


--------------------------------------------------------------------------------
/mini-nbody/cuda/shmoo-cuda-nbody-unroll.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody-unroll.cu
 2 | EXE=nbody-unroll
 3 | 
 4 | nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO
 5 | 
 6 | echo $EXE
 7 | 
 8 | K=1024
 9 | for i in {1..10}
10 | do
11 |     ./$EXE $K
12 |     K=$(($K*2))
13 | done
14 | 
15 | 


--------------------------------------------------------------------------------
/mini-nbody/hip/HIP-nbody-block.sh:
--------------------------------------------------------------------------------
 1 | #Hipify the blocked cuda source code to hip compatible code
 2 | #hipify nbody-block.cu > nbody-block.cpp
 3 | #Manually add the first argument onto the kernel argument list
 4 | #void bodyForce(Body *p, float dt, int n) //before modification
 5 | #void bodyForce(hipLaunchParm lp, Body *p, float dt, int n) //after modification
 6 | 
 7 | #compile the hipified source code into executable 
 8 | if [ -f nbody-block ]
 9 | then
10 |     rm nbody-block
11 | fi
12 | 
13 | if [ -z  "$HIP_PATH" ]
14 | then
15 | 
16 | if [ -d /opt/rocm/hip ]
17 | then
18 |     HIP_PATH=/opt/rocm/hip
19 | else
20 |     HIP_PATH=/opt/rocm
21 | fi
22 | 
23 | fi
24 | 
25 | echo hipcc -I../ -DSHMOO nbody-block.cpp -o nbody-block
26 | $HIP_PATH/bin/hipcc -I../ -DSHMOO nbody-block.cpp -o nbody-block
27 | 
28 | #To print our more details, remove DSHMOO flag
29 | #hipcc -I../  nbody-block.cpp -o nbody-block
30 | 
31 | #execute the program
32 | EXE=nbody-block
33 | K=1024
34 | for i in {1..8}
35 | do
36 |     echo ./$EXE $K
37 |     ./$EXE $K
38 |     K=$(($K*2))
39 | done
40 | 
41 | 


--------------------------------------------------------------------------------
/mini-nbody/hip/HIP-nbody-orig.sh:
--------------------------------------------------------------------------------
 1 | #Hipify the original cuda source code to hip compatible code
 2 | #hipify nbody-orig.cu > nbody-orig.cpp
 3 | 
 4 | #compile the hipified source code into executable 
 5 | if [ -f nbody-orig ]
 6 | then
 7 |     rm nbody-orig
 8 | fi
 9 | 
10 | if [ -z  "$HIP_PATH" ]
11 | then
12 | 
13 | if [ -d /opt/rocm/hip ]
14 | then
15 |     HIP_PATH=/opt/rocm/hip
16 | else
17 |     HIP_PATH=/opt/rocm
18 | fi
19 | 
20 | fi
21 | 
22 | echo hipcc -I../ -DSHMOO nbody-orig.cpp -o nbody-orig
23 | $HIP_PATH/bin/hipcc -I../ -DSHMOO nbody-orig.cpp -o nbody-orig
24 | 
25 | #To print our more details, remove  flag
26 | #hipcc -I../  nbody-orig.cpp -o nbody-orig
27 | 
28 | #execute the program
29 | 
30 | EXE=nbody-orig
31 | K=1024
32 | for i in {1..10}
33 | do
34 |     echo ./$EXE $K
35 |     ./$EXE $K
36 |     K=$(($K*2))
37 | done
38 | 
39 | 


--------------------------------------------------------------------------------
/mini-nbody/hip/HIP-nbody-soa.sh:
--------------------------------------------------------------------------------
 1 | #Hipify the soa cuda source code to hip compatible code
 2 | #hipify nbody-soa.cu > nbody-soa.cpp
 3 | #Manually add the first argument onto the kernel argument list
 4 | #void bodyForce(Body *p, float dt, int n) //before modification
 5 | #void bodyForce(hipLaunchParm lp, Body *p, float dt, int n) //after modification
 6 | 
 7 | #compile the hipified source code into executable 
 8 | if [ -f nbody-soa ]
 9 | then
10 |     rm nbody-soa
11 | fi
12 | 
13 | if [ -z  "$HIP_PATH" ]
14 | then
15 | 
16 | if [ -d /opt/rocm/hip ]
17 | then
18 |     HIP_PATH=/opt/rocm/hip
19 | else
20 |     HIP_PATH=/opt/rocm
21 | fi
22 | 
23 | fi
24 | 
25 | echo hipcc -I../ -DSHMOO nbody-soa.cpp -o nbody-soa
26 | $HIP_PATH/bin/hipcc -I../ -DSHMOO nbody-soa.cpp -o nbody-soa
27 | 
28 | #To print our more details, remove DSHMOO flag
29 | #hipcc -I../  nbody-soa.cpp -o nbody-soa
30 | 
31 | #execute the program
32 | EXE=nbody-soa
33 | K=1024
34 | for i in {1..8}
35 | do
36 |     echo ./$EXE $K
37 |     ./$EXE $K
38 |     K=$(($K*2))
39 | done
40 | 
41 | 


--------------------------------------------------------------------------------
/mini-nbody/hip/nbody-block.cpp:
--------------------------------------------------------------------------------
  1 | #include "hip/hip_runtime.h"
  2 | #include <math.h>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include "timer.h"
  6 | 
// Threads per block for the bodyForce launch; also the length of the
// shared-memory position tile inside the kernel.
#define BLOCK_SIZE 256
// Added to every squared distance so 1/sqrtf stays finite when two positions
// coincide (e.g. a body paired with itself).
#define SOFTENING 1e-9f

// View of one body buffer: pos and vel each point at an array of float4.
typedef struct { float4 *pos, *vel; } BodySystem;
 11 | 
 12 | void randomizeBodies(float *data, int n) {
 13 |   for (int i = 0; i < n; i++) {
 14 |     data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
 15 |   }
 16 | }
 17 | 
 18 | __global__
 19 | void bodyForce(float4 *p, float4 *v, float dt, int n) {
 20 |   int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 21 |   if (i < n) {
 22 |     float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
 23 | 
 24 |     for (int tile = 0; tile < hipGridDim_x; tile++) {
 25 |       __shared__ float3 spos[BLOCK_SIZE];
 26 |       float4 tpos = p[tile * hipBlockDim_x + hipThreadIdx_x];
 27 |       spos[hipThreadIdx_x] = make_float3(tpos.x, tpos.y, tpos.z);
 28 |       __syncthreads();
 29 | 
 30 |       for (int j = 0; j < BLOCK_SIZE; j++) {
 31 |         float dx = spos[j].x - p[i].x;
 32 |         float dy = spos[j].y - p[i].y;
 33 |         float dz = spos[j].z - p[i].z;
 34 |         float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
 35 |         float invDist = 1.0f / sqrtf(distSqr);
 36 |         float invDist3 = invDist * invDist * invDist;
 37 | 
 38 |         Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
 39 |       }
 40 |       __syncthreads();
 41 |     }
 42 | 
 43 |     v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
 44 |   }
 45 | }
 46 | 
 47 | int main(const int argc, const char** argv) {
 48 | 
 49 |   int nBodies = 30000;
 50 |   if (argc > 1) nBodies = atoi(argv[1]);
 51 | 
 52 |   const float dt = 0.01f; // time step
 53 |   const int nIters = 10;  // simulation iterations
 54 | 
 55 |   int bytes = 2*nBodies*sizeof(float4);
 56 |   float *buf = (float*)malloc(bytes);
 57 |   BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };
 58 | 
 59 |   randomizeBodies(buf, 8*nBodies); // Init pos / vel data
 60 | 
 61 |   float *d_buf;
 62 |   hipMalloc(&d_buf, bytes);
 63 |   BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };
 64 | 
 65 |   int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
 66 |   double totalTime = 0.0;
 67 | 
 68 |   for (int iter = 1; iter <= nIters; iter++) {
 69 |     StartTimer();
 70 | 
 71 |     hipMemcpy(d_buf, buf, bytes, hipMemcpyHostToDevice);
 72 |     hipLaunchKernelGGL(bodyForce, dim3(nBlocks), dim3(BLOCK_SIZE), 0, 0, d_p.pos, d_p.vel, dt, nBodies);
 73 |     hipMemcpy(buf, d_buf, bytes, hipMemcpyDeviceToHost);
 74 | 
 75 |     for (int i = 0 ; i < nBodies; i++) { // integrate position
 76 |       p.pos[i].x += p.vel[i].x*dt;
 77 |       p.pos[i].y += p.vel[i].y*dt;
 78 |       p.pos[i].z += p.vel[i].z*dt;
 79 |     }
 80 | 
 81 |     const double tElapsed = GetTimer() / 1000.0;
 82 |     if (iter > 1) { // First iter is warm up
 83 |       totalTime += tElapsed;
 84 |     }
 85 | #ifndef SHMOO
 86 |     printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
 87 | #endif
 88 |   }
 89 |   double avgTime = totalTime / (double)(nIters-1);
 90 | 
 91 | #ifdef SHMOO
 92 |   printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
 93 | #else
 94 |   printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
 95 |          nIters, rate);
 96 |   printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
 97 | #endif
 98 |   free(buf);
 99 |   hipFree(d_buf);
100 | }
101 | 


--------------------------------------------------------------------------------
/mini-nbody/hip/nbody-orig.cpp:
--------------------------------------------------------------------------------
 1 | #include "hip/hip_runtime.h"
 2 | #include <math.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include "timer.h"
 6 | 
// Threads per block for the bodyForce launch.
#define BLOCK_SIZE 256
// Added to every squared distance so 1/sqrtf stays finite when two positions
// coincide (e.g. a body paired with itself).
#define SOFTENING 1e-9f

// One body: position (x, y, z) followed by velocity (vx, vy, vz).
typedef struct { float x, y, z, vx, vy, vz; } Body;
11 | 
// Writes n pseudo-random floats in [-1, 1], produced by rand(), into data.
void randomizeBodies(float *data, int n) {
  for (int remaining = n; remaining > 0; --remaining) {
    const float unit = rand() / (float)RAND_MAX;  // scaled into [0, 1]
    *data = 2.0f * unit - 1.0f;
    ++data;
  }
}
17 | 
18 | //inline float rsqrtf(float x){
19 |     //return 1.0f / sqrtf(x);
20 | //}//host implementation of cuda function for rsqrtf
21 | 
22 | __global__
23 | void bodyForce(Body *p, float dt, int n) {
24 |   int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
25 |   if (i < n) {
26 |     float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
27 | 
28 |     for (int j = 0; j < n; j++) {
29 |       float dx = p[j].x - p[i].x;
30 |       float dy = p[j].y - p[i].y;
31 |       float dz = p[j].z - p[i].z;
32 |       float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
33 |       float invDist = 1.0f / sqrtf(distSqr);
34 |       //float invDist = rsqrtf(distSqr);
35 |       float invDist3 = invDist * invDist * invDist;
36 | 
37 |       Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
38 |     }
39 | 
40 |     p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
41 |   }
42 | }
43 | 
44 | int main(const int argc, const char** argv) {
45 | 
46 |   int nBodies = 30000;
47 |   if (argc > 1) nBodies = atoi(argv[1]);
48 | 
49 |   const float dt = 0.01f; // time step
50 |   const int nIters = 10;  // simulation iterations
51 | 
52 |   int bytes = nBodies*sizeof(Body);
53 |   float *buf = (float*)malloc(bytes);
54 |   Body *p = (Body*)buf;
55 | 
56 |   randomizeBodies(buf, 6*nBodies); // Init pos / vel data
57 | 
58 |   float *d_buf;
59 |   hipMalloc(&d_buf, bytes);
60 |   Body *d_p = (Body*)d_buf;
61 | 
62 |   int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
63 |   double totalTime = 0.0;
64 | 
65 |   for (int iter = 1; iter <= nIters; iter++) {
66 |     StartTimer();
67 | 
68 |     hipMemcpy(d_buf, buf, bytes, hipMemcpyHostToDevice);
69 |     hipLaunchKernelGGL(bodyForce, dim3(nBlocks), dim3(BLOCK_SIZE), 0, 0, d_p, dt, nBodies); // compute interbody forces
70 |     hipMemcpy(buf, d_buf, bytes, hipMemcpyDeviceToHost);
71 | 
72 |     for (int i = 0 ; i < nBodies; i++) { // integrate position
73 |       p[i].x += p[i].vx*dt;
74 |       p[i].y += p[i].vy*dt;
75 |       p[i].z += p[i].vz*dt;
76 |     }
77 | 
78 |     const double tElapsed = GetTimer() / 1000.0;
79 |     if (iter > 1) { // First iter is warm up
80 |       totalTime += tElapsed;
81 |     }
82 | #ifndef SHMOO
83 |     printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
84 | #endif
85 |   }
86 |   double avgTime = totalTime / (double)(nIters-1);
87 | 
88 | #ifdef SHMOO
89 |   printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
90 | #else
91 |   //printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
92 |          //nIters, rate);
93 |   printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
94 | #endif
95 |   free(buf);
96 |   hipFree(d_buf);
97 | }
98 | 


--------------------------------------------------------------------------------
/mini-nbody/hip/nbody-soa.cpp:
--------------------------------------------------------------------------------
 1 | #include "hip/hip_runtime.h"
 2 | #include <math.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include "timer.h"
 6 | 
// Threads per block for the bodyForce launch.
#define BLOCK_SIZE 256
// Added to every squared distance so 1/sqrtf stays finite when two positions
// coincide (e.g. a body paired with itself).
#define SOFTENING 1e-9f

// View of one body buffer: pos and vel each point at an array of float4.
typedef struct { float4 *pos, *vel; } BodySystem;
11 | 
// Fills data[0 .. n-1] with pseudo-random values in [-1, 1] taken from rand().
void randomizeBodies(float *data, int n) {
  int idx = 0;
  while (idx < n) {
    const float unit = rand() / (float)RAND_MAX;  // scaled into [0, 1]
    data[idx] = 2.0f * unit - 1.0f;
    ++idx;
  }
}
17 | 
18 | __global__
19 | void bodyForce(float4 *p, float4 *v, float dt, int n) {
20 |   int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
21 |   if (i < n) {
22 |     float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
23 | 
24 |     for (int j = 0; j < n; j++) {
25 |       float dx = p[j].x - p[i].x;
26 |       float dy = p[j].y - p[i].y;
27 |       float dz = p[j].z - p[i].z;
28 |       float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
29 |       /*float invDist = rsqrtf(distSqr);*/
30 |       float invDist = 1.0f / sqrtf(distSqr);
31 |       float invDist3 = invDist * invDist * invDist;
32 | 
33 |       Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
34 |     }
35 | 
36 |     v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
37 |   }
38 | }
39 | 
40 | int main(const int argc, const char** argv) {
41 | 
42 |   int nBodies = 30000;
43 |   if (argc > 1) nBodies = atoi(argv[1]);
44 | 
45 |   const float dt = 0.01f; // time step
46 |   const int nIters = 10;  // simulation iterations
47 | 
48 |   int bytes = 2*nBodies*sizeof(float4);
49 |   float *buf = (float*)malloc(bytes);
50 |   BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };
51 | 
52 |   randomizeBodies(buf, 8*nBodies); // Init pos / vel data
53 | 
54 |   float *d_buf;
55 |   hipMalloc(&d_buf, bytes);
56 |   BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };
57 | 
58 |   int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
59 |   double totalTime = 0.0;
60 | 
61 |   for (int iter = 1; iter <= nIters; iter++) {
62 |     StartTimer();
63 | 
64 |     hipMemcpy(d_buf, buf, bytes, hipMemcpyHostToDevice);
65 |     hipLaunchKernelGGL(bodyForce, dim3(nBlocks), dim3(BLOCK_SIZE), 0, 0, d_p.pos, d_p.vel, dt, nBodies);
66 |     hipMemcpy(buf, d_buf, bytes, hipMemcpyDeviceToHost);
67 | 
68 |     for (int i = 0 ; i < nBodies; i++) { // integrate position
69 |       p.pos[i].x += p.vel[i].x*dt;
70 |       p.pos[i].y += p.vel[i].y*dt;
71 |       p.pos[i].z += p.vel[i].z*dt;
72 |     }
73 | 
74 |     const double tElapsed = GetTimer() / 1000.0;
75 |     if (iter > 1) { // First iter is warm up
76 |       totalTime += tElapsed;
77 |     }
78 | #ifndef SHMOO
79 |     printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
80 | #endif
81 |   }
82 |   double avgTime = totalTime / (double)(nIters-1);
83 | 
84 | #ifdef SHMOO
85 |   printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
86 | #else
87 |   printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
88 |          nIters, rate);
89 |   printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
90 | #endif
91 |   free(buf);
92 |   hipFree(d_buf);
93 | }
94 | 


--------------------------------------------------------------------------------
/mini-nbody/mic/nbody-align.c:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include "timer.h"
  5 | 
#define CACHELINE 64 // size of cache line [bytes]
// Added to every squared distance so 1/sqrtf stays finite when two positions
// coincide (e.g. a body paired with itself).
#define SOFTENING 1e-9f

// Structure-of-arrays body storage: one float array per position component
// (x, y, z) and per velocity component (vx, vy, vz).
typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem;
 10 | 
 11 | void randomizeBodies(float *data, int n) {
 12 |   for (int i = 0; i < n; i++) {
 13 |     data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
 14 |   }
 15 | }
 16 | 
 17 | 
 18 | void bodyForce(BodySystem p, float dt, int n, int tileSize) {
 19 |   for (int tile = 0; tile < n; tile += tileSize) {
 20 |     int to = tile + tileSize; 
 21 |     if (to > n) to = n;
 22 | 
 23 |     #pragma omp parallel for schedule(dynamic)
 24 |     for (int i = 0; i < n; i++) {
 25 |       float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
 26 | 
 27 |       #pragma vector aligned
 28 |       #pragma simd
 29 |       for (int j = tile; j < to; j++) {
 30 |         float dy = p.y[j] - p.y[i];
 31 |         float dz = p.z[j] - p.z[i];
 32 |         float dx = p.x[j] - p.x[i];
 33 |         float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
 34 |         float invDist = 1.0f / sqrtf(distSqr);
 35 |         float invDist3 = invDist * invDist * invDist;
 36 | 
 37 |         Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;      
 38 |       }
 39 |     
 40 |       p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz;
 41 |     }
 42 |   }
 43 | }
 44 | 
 45 | int main(const int argc, const char** argv) {
 46 |   
 47 |   int nBodies = 30000;
 48 |   if (argc > 1) nBodies = atoi(argv[1]);
 49 | 
 50 |   int tileSize = 24400;
 51 |   if (tileSize > nBodies) tileSize = nBodies;
 52 | 
 53 |   const float dt = 0.01f; // time step
 54 |   const int nIters = 10;  // simulation iterations
 55 | 
 56 |   if ( tileSize % (CACHELINE/sizeof(float)) ) {
 57 |     printf("ERROR: blockSize not multiple of %d vector elements\n", CACHELINE/(int)sizeof(float));
 58 |     exit(1);
 59 |   }
 60 | 
 61 |   int bytes = 6*nBodies*sizeof(float);
 62 |   float *buf = (float*)_mm_malloc(bytes, CACHELINE);
 63 |   BodySystem p;
 64 |   p.x  = buf+0*nBodies; p.y  = buf+1*nBodies; p.z  = buf+2*nBodies;
 65 |   p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies;
 66 | 
 67 |   randomizeBodies(buf, 6*nBodies); // Init pos / vel data
 68 | 
 69 |   double totalTime = 0.0;
 70 | 
 71 |   for (int iter = 1; iter <= nIters; iter++) {
 72 |     StartTimer();
 73 | 
 74 |     bodyForce(p, dt, nBodies, tileSize); // compute interbody forces
 75 | 
 76 |     for (int i = 0 ; i < nBodies; i++) { // integrate position
 77 |       p.x[i] += p.vx[i]*dt;
 78 |       p.y[i] += p.vy[i]*dt;
 79 |       p.z[i] += p.vz[i]*dt;
 80 |     }
 81 | 
 82 |     const double tElapsed = GetTimer() / 1000.0;
 83 |     if (iter > 1) { // First iter is warm up
 84 |       totalTime += tElapsed; 
 85 |     }
 86 | #ifndef SHMOO
 87 |     printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
 88 | #endif
 89 |   }
 90 |   double avgTime = totalTime / (double)(nIters-1); 
 91 | 
 92 | #ifdef SHMOO
 93 |   printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
 94 | #else
 95 |   printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
 96 |          nIters, rate);
 97 |   printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
 98 | #endif
 99 |   _mm_free(buf);
100 | }
101 | 


--------------------------------------------------------------------------------
/mini-nbody/mic/nbody-block.c:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include "timer.h"
 5 | 
 6 | #define CACHELINE 64 // size of cache line [bytes]
 7 | #define SOFTENING 1e-9f
 8 | 
 9 | typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem;
10 | 
// Overwrite data[0..n-1] with pseudo-random values produced by rand(),
// mapped from rand()'s [0, RAND_MAX] range onto [-1, 1].
void randomizeBodies(float *data, int n) {
  int k = 0;
  while (k < n) {
    data[k] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
    ++k;
  }
}
16 | 
17 | 
18 | void bodyForce(BodySystem p, float dt, int n, int tileSize) {
19 |   for (int tile = 0; tile < n; tile += tileSize) {
20 |     int to = tile + tileSize; 
21 |     if (to > n) to = n;
22 | 
23 |     #pragma omp parallel for schedule(dynamic)
24 |     for (int i = 0; i < n; i++) {
25 |       float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
26 | 
27 |       for (int j = tile; j < to; j++) {
28 |         float dy = p.y[j] - p.y[i];
29 |         float dz = p.z[j] - p.z[i];
30 |         float dx = p.x[j] - p.x[i];
31 |         float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
32 |         float invDist = 1.0f / sqrtf(distSqr);
33 |         float invDist3 = invDist * invDist * invDist;
34 | 
35 |         Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;      
36 |       }
37 |     
38 |       p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz;
39 |     }
40 |   }
41 | }
42 | 
43 | int main(const int argc, const char** argv) {
44 |   
45 |   int nBodies = 30000;
46 |   if (argc > 1) nBodies = atoi(argv[1]);
47 | 
48 |   int tileSize = 24400;
49 |   if (tileSize > nBodies) tileSize = nBodies;
50 | 
51 |   const float dt = 0.01f; // time step
52 |   const int nIters = 10;  // simulation iterations
53 | 
54 |   int bytes = 6*nBodies*sizeof(float);
55 |   float *buf = (float*)malloc(bytes);
56 |   BodySystem p;
57 |   p.x  = buf+0*nBodies; p.y  = buf+1*nBodies; p.z  = buf+2*nBodies;
58 |   p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies;
59 | 
60 |   randomizeBodies(buf, 6*nBodies); // Init pos / vel data
61 | 
62 |   double totalTime = 0.0;
63 | 
64 |   for (int iter = 1; iter <= nIters; iter++) {
65 |     StartTimer();
66 | 
67 |     bodyForce(p, dt, nBodies, tileSize); // compute interbody forces
68 | 
69 |     for (int i = 0 ; i < nBodies; i++) { // integrate position
70 |       p.x[i] += p.vx[i]*dt;
71 |       p.y[i] += p.vy[i]*dt;
72 |       p.z[i] += p.vz[i]*dt;
73 |     }
74 | 
75 |     const double tElapsed = GetTimer() / 1000.0;
76 |     if (iter > 1) { // First iter is warm up
77 |       totalTime += tElapsed; 
78 |     }
79 | #ifndef SHMOO
80 |     printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
81 | #endif
82 |   }
83 |   double avgTime = totalTime / (double)(nIters-1); 
84 | 
85 | #ifdef SHMOO
86 |   printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
87 | #else
88 |   printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
89 |          nIters, rate);
90 |   printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
91 | #endif
92 |   free(buf);
93 | }
94 | 


--------------------------------------------------------------------------------
/mini-nbody/mic/nbody-soa.c:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include "timer.h"
 5 | 
 6 | 
 7 | #define SOFTENING 1e-9f
 8 | 
 9 | typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem;
10 | 
// Fill the first n entries of data with pseudo-random floats in [-1, 1],
// one rand() draw per entry.
void randomizeBodies(float *data, int n) {
  int slot = 0;
  while (slot < n) {
    data[slot] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
    slot += 1;
  }
}
16 | 
17 | 
18 | void bodyForce(BodySystem p, float dt, int n) {
19 |   #pragma omp parallel for schedule(dynamic)
20 |   for (int i = 0; i < n; i++) { 
21 |     float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
22 | 
23 |     for (int j = 0; j < n; j++) {
24 |       float dy = p.y[j] - p.y[i];
25 |       float dz = p.z[j] - p.z[i];
26 |       float dx = p.x[j] - p.x[i];
27 |       float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
28 |       float invDist = 1.0f / sqrtf(distSqr);
29 |       float invDist3 = invDist * invDist * invDist;
30 | 
31 |       Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
32 |     }
33 | 
34 |     p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz;
35 |   }
36 | }
37 | 
38 | int main(const int argc, const char** argv) {
39 |   
40 |   int nBodies = 30000;
41 |   if (argc > 1) nBodies = atoi(argv[1]);
42 | 
43 |   const float dt = 0.01f; // time step
44 |   const int nIters = 10;  // simulation iterations
45 | 
46 |   int bytes = 6*nBodies*sizeof(float);
47 |   float *buf = (float*)malloc(bytes);
48 |   BodySystem p;
49 |   p.x  = buf+0*nBodies; p.y  = buf+1*nBodies; p.z  = buf+2*nBodies;
50 |   p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies;
51 | 
52 |   randomizeBodies(buf, 6*nBodies); // Init pos / vel data
53 | 
54 |   double totalTime = 0.0;
55 | 
56 |   for (int iter = 1; iter <= nIters; iter++) {
57 |     StartTimer();
58 | 
59 |     bodyForce(p, dt, nBodies); // compute interbody forces
60 | 
61 |     for (int i = 0 ; i < nBodies; i++) { // integrate position
62 |       p.x[i] += p.vx[i]*dt;
63 |       p.y[i] += p.vy[i]*dt;
64 |       p.z[i] += p.vz[i]*dt;
65 |     }
66 | 
67 |     const double tElapsed = GetTimer() / 1000.0;
68 |     if (iter > 1) { // First iter is warm up
69 |       totalTime += tElapsed; 
70 |     }
71 | #ifndef SHMOO
72 |     printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
73 | #endif
74 |   }
75 |   double avgTime = totalTime / (double)(nIters-1); 
76 | 
77 | #ifdef SHMOO
78 |   printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
79 | #else
80 |   printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
81 |          nIters, rate);
82 |   printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
83 | #endif
84 |   free(buf);
85 | }
86 | 


--------------------------------------------------------------------------------
/mini-nbody/mic/shmoo-mic-nbody-align.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody-align.c
 2 | EXE=nbody-align-mic
 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
 4 | MIC=mic0
 5 | if [ $# -gt 0 ]
 6 |   then
 7 |     MIC=$1
 8 | fi
 9 | 
10 | icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC
11 | 
12 | scp $EXE $MIC:~/
13 | scp $MICROOT/libiomp5.so $MIC:~/
14 | 
15 | echo $EXE
16 | 
17 | K=1024
18 | for i in {1..10}
19 | do
20 |     ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
21 |     K=$(($K*2))
22 | done
23 | 
24 | 


--------------------------------------------------------------------------------
/mini-nbody/mic/shmoo-mic-nbody-block.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody-block.c
 2 | EXE=nbody-block-mic
 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
 4 | MIC=mic0
 5 | if [ $# -gt 0 ]
 6 |   then
 7 |     MIC=$1
 8 | fi
 9 | 
10 | icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC
11 | 
12 | scp $EXE $MIC:~/
13 | scp $MICROOT/libiomp5.so $MIC:~/
14 | 
15 | echo $EXE
16 | 
17 | K=1024
18 | for i in {1..10}
19 | do
20 |     ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
21 |     K=$(($K*2))
22 | done
23 | 
24 | 


--------------------------------------------------------------------------------
/mini-nbody/mic/shmoo-mic-nbody-ftz.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody-soa.c
 2 | EXE=nbody-ftz-mic
 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
 4 | MIC=mic0
 5 | if [ $# -gt 0 ]
 6 |   then
 7 |     MIC=$1
 8 | fi
 9 | 
10 | icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC
11 | 
12 | scp $EXE $MIC:~/
13 | scp $MICROOT/libiomp5.so $MIC:~/
14 | 
15 | echo $EXE
16 | 
17 | K=1024
18 | for i in {1..10}
19 | do
20 |     ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
21 |     K=$(($K*2))
22 | done
23 | 
24 | 


--------------------------------------------------------------------------------
/mini-nbody/mic/shmoo-mic-nbody-orig.sh:
--------------------------------------------------------------------------------
 1 | SRC=../nbody-orig.c
 2 | EXE=nbody-orig-mic
 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
 4 | MIC=mic0
 5 | if [ $# -gt 0 ]
 6 |   then
 7 |     MIC=$1
 8 | fi
 9 | 
10 | icc -std=c99 -openmp -mmic -DSHMOO -o $EXE $SRC
11 | 
12 | scp $EXE $MIC:~/
13 | scp $MICROOT/libiomp5.so $MIC:~/
14 | 
15 | echo $EXE
16 | 
17 | K=1024
18 | for i in {1..10}
19 | do
20 |     ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
21 |     K=$(($K*2))
22 | done
23 | 
24 | 


--------------------------------------------------------------------------------
/mini-nbody/mic/shmoo-mic-nbody-soa.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody-soa.c
 2 | EXE=nbody-soa-mic
 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
 4 | MIC=mic0
 5 | if [ $# -gt 0 ]
 6 |   then
 7 |     MIC=$1
 8 | fi
 9 | 
10 | icc -std=c99 -openmp -mmic -DSHMOO -I../ -o $EXE $SRC
11 | 
12 | scp $EXE $MIC:~/
13 | scp $MICROOT/libiomp5.so $MIC:~/
14 | 
15 | echo $EXE
16 | 
17 | K=1024
18 | for i in {1..10}
19 | do
20 |     ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
21 |     K=$(($K*2))
22 | done
23 | 
24 | 


--------------------------------------------------------------------------------
/mini-nbody/nbody.c:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include "timer.h"
 5 | 
 6 | #define SOFTENING 1e-9f
 7 | 
 8 | typedef struct { float x, y, z, vx, vy, vz; } Body;
 9 | 
// Store n pseudo-random floats in data, each taken from rand() and rescaled
// from [0, RAND_MAX] to [-1, 1].
void randomizeBodies(float *data, int n) {
  int pos = 0;
  while (pos < n) {
    data[pos] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
    pos++;
  }
}
15 | 
16 | void bodyForce(Body *p, float dt, int n) {
17 |   #pragma omp parallel for schedule(dynamic)
18 |   for (int i = 0; i < n; i++) { 
19 |     float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
20 | 
21 |     for (int j = 0; j < n; j++) {
22 |       float dx = p[j].x - p[i].x;
23 |       float dy = p[j].y - p[i].y;
24 |       float dz = p[j].z - p[i].z;
25 |       float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
26 |       float invDist = 1.0f / sqrtf(distSqr);
27 |       float invDist3 = invDist * invDist * invDist;
28 | 
29 |       Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
30 |     }
31 | 
32 |     p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
33 |   }
34 | }
35 | 
36 | int main(const int argc, const char** argv) {
37 |   
38 |   int nBodies = 30000;
39 |   if (argc > 1) nBodies = atoi(argv[1]);
40 | 
41 |   const float dt = 0.01f; // time step
42 |   const int nIters = 10;  // simulation iterations
43 | 
44 |   int bytes = nBodies*sizeof(Body);
45 |   float *buf = (float*)malloc(bytes);
46 |   Body *p = (Body*)buf;
47 | 
48 |   randomizeBodies(buf, 6*nBodies); // Init pos / vel data
49 | 
50 |   double totalTime = 0.0;
51 | 
52 |   for (int iter = 1; iter <= nIters; iter++) {
53 |     StartTimer();
54 | 
55 |     bodyForce(p, dt, nBodies); // compute interbody forces
56 | 
57 |     for (int i = 0 ; i < nBodies; i++) { // integrate position
58 |       p[i].x += p[i].vx*dt;
59 |       p[i].y += p[i].vy*dt;
60 |       p[i].z += p[i].vz*dt;
61 |     }
62 | 
63 |     const double tElapsed = GetTimer() / 1000.0;
64 |     if (iter > 1) { // First iter is warm up
65 |       totalTime += tElapsed; 
66 |     }
67 | #ifndef SHMOO
68 |     printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
69 | #endif
70 |   }
71 |   double avgTime = totalTime / (double)(nIters-1); 
72 | 
73 | #ifdef SHMOO
74 |   printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
75 | #else
76 |   printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
77 |          nIters, rate);
78 |   printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
79 | #endif
80 |   free(buf);
81 | }
82 | 


--------------------------------------------------------------------------------
/mini-nbody/shmoo-cpu-nbody.sh:
--------------------------------------------------------------------------------
 1 | SRC=nbody.c
 2 | EXE=nbody
 3 | gcc -std=c99 -O3 -fopenmp -DSHMOO -o $EXE $SRC -lm
 4 | 
 5 | echo $EXE
 6 | 
 7 | K=1024
 8 | for i in {1..10}
 9 | do
10 |     ./$EXE $K
11 |     K=$(($K*2))
12 | done
13 | 
14 | 


--------------------------------------------------------------------------------
/mini-nbody/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef TIMER_H
 2 | #define TIMER_H
 3 | 
 4 | #include <stdlib.h>
 5 | 
 6 | #ifdef WIN32
 7 |   #define WIN32_LEAN_AND_MEAN
 8 |   #include <windows.h>
 9 | #else
10 |   #ifndef __USE_BSD
11 |     #define __USE_BSD
12 |   #endif
13 |   #include <sys/time.h>
14 | #endif
15 | 
// Timer state shared by StartTimer() and GetTimer().
#ifdef WIN32
double PCFreq = 0.0;       // performance-counter ticks per millisecond
__int64 timerStart = 0;    // counter value captured by StartTimer()
#else
struct timeval timerStart; // wall-clock time captured by StartTimer()
#endif

// Record the current time as the reference point for GetTimer().
void StartTimer()
{
#ifdef WIN32
  LARGE_INTEGER li;
  if(!QueryPerformanceFrequency(&li))
    printf("QueryPerformanceFrequency failed!\n");

  // Dividing by 1000 makes GetTimer() report milliseconds.
  PCFreq = (double)li.QuadPart/1000.0;

  QueryPerformanceCounter(&li);
  timerStart = li.QuadPart;
#else
  gettimeofday(&timerStart, NULL);
#endif
}

// time elapsed in ms
// Returns the time elapsed since the most recent StartTimer() call.
double GetTimer()
{
#ifdef WIN32
  LARGE_INTEGER li;
  QueryPerformanceCounter(&li);
  return (double)(li.QuadPart-timerStart)/PCFreq;
#else
  struct timeval timerStop;
  gettimeofday(&timerStop, NULL);
  // Subtract the fields by hand rather than with timersub(): that macro is a
  // BSD extension which <sys/time.h> may not expose in strict -std=c99 builds.
  // The microsecond difference may be negative; the sum is still correct.
  const double elapsedSec  = (double)(timerStop.tv_sec  - timerStart.tv_sec);
  const double elapsedUsec = (double)(timerStop.tv_usec - timerStart.tv_usec);
  return elapsedSec*1000.0 + elapsedUsec/1000.0;
#endif
}
53 | 
54 | #endif // TIMER_H
55 | 


--------------------------------------------------------------------------------
/openmp-helloworld/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | project(openmp_helloworld)
 2 | 
 3 | cmake_minimum_required(VERSION 3.16)
 4 | 
 5 | # Search for rocm in common locations
 6 | if(WIN32)
 7 |   list(APPEND CMAKE_PREFIX_PATH "C:/hip")
 8 |   list(APPEND CMAKE_PREFIX_PATH "C:/Program Files/AMD HIP SDK/hip")
 9 | else()
10 |   list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
11 | endif()
12 | 
13 | # Find HIP.
14 | # The user may override AMDGPU_TARGETS defined in the HIP config file
15 | # to select the AMDGPU archs to compile for.
16 | # ex. set(AMDGPU_TARGETS "gfx803;gfx900;gfx906")
17 | find_package(hip REQUIRED)
18 | 
19 | # Find OpenMP.
20 | find_package(OpenMP REQUIRED)
21 | 
22 | # Set compiler and linker.
23 | if(NOT WIN32)
24 |   set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
25 |   set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
26 | endif()
27 | 
28 | set(CMAKE_BUILD_TYPE Release)
29 | 
30 | if(WIN32)
31 |   # Compile for OpenMP code (Windows requires this).
32 |   set(OpenMP_CXX_FLAGS "-Xclang -fopenmp")
33 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
34 |   # Tell CMake where to find the OpenMP libraries (libomp.lib).
35 |   link_directories("C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\VC\\Tools\\Llvm\\x64\\lib")
36 | endif()
37 | 
38 | # Source files.
39 | set(CPP_SOURCES ${CMAKE_SOURCE_DIR}/openmp_helloworld.cpp)
40 | 
41 | # Preparing the executable.
42 | add_executable(test_openmp_helloworld ${CPP_SOURCES})
43 | 
44 | # Link Libraries - HIP Device and OpenMP.
45 | target_compile_options(test_openmp_helloworld PRIVATE ${OpenMP_CXX_FLAGS})
46 | target_link_libraries(test_openmp_helloworld PRIVATE hip::device ${OpenMP_CXX_FLAGS})
47 | 
48 | if(WIN32)
49 |   target_link_libraries(test_openmp_helloworld PRIVATE OpenMP::OpenMP_CXX)
50 | endif()
51 | 


--------------------------------------------------------------------------------
/openmp-helloworld/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | CXX=$(HIPCC)
 5 | CXXFLAGS =-fopenmp
 6 | 
 7 | SOURCES = openmp_helloworld.cpp
 8 | 
 9 | EXECUTABLE=./openmp_helloworld.exe
10 | 
11 | .PHONY: test
12 | 
13 | 
14 | all: $(EXECUTABLE) test
15 | 
16 | 
17 | $(EXECUTABLE):
18 | 	$(CXX) $(CXXFLAGS) $(SOURCES) -o $@
19 | 
20 | 
21 | test: $(EXECUTABLE)
22 | 	$(EXECUTABLE)
23 | 
24 | 
25 | clean:
26 | 	rm -f $(EXECUTABLE) *.o
27 | 
28 | 


--------------------------------------------------------------------------------
/openmp-helloworld/README.md:
--------------------------------------------------------------------------------
 1 | # Simple OpenMP hello world example written directly to the HIP interface.
 2 | 
 3 | ## Requirements
 4 | * Installed ROCm 3.9 or newer. See  [ROCm Installation Guide](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html).
 5 | 
 6 | ## Windows Requirements
 7 | * Set HIP_DIR to the HIP installation location.
 8 | * libamdhip64.dll and amd_comgr.dll must be in PATH or in System32.
 9 | * Install MS Visual Studio 2019 for C++ development with Optional C++ Clang tools for Windows.
10 | * Ensure libomp.dll from MSVC C++ Clang tools is in PATH (by default, location is C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\Llvm\x64\bin).
11 | * Modify the CMakeLists.txt of this project to the corresponding libomp.lib location.
12 | 
13 | ## How to run this code:
14 | 
15 | ### Using Make on Linux:
16 | * To build and run: `make`.
17 | * To clean the environment: `make clean`.
18 | 
19 | 
20 | ### Using CMake on Linux:
21 | * To build: `mkdir -p build; cd build; cmake ..; make`
22 | * To run the test: `./test_openmp_helloworld`
23 | * To clean the build environment: `make clean`
24 | 
25 | ### Using CMake on Windows:
26 | * CMake Command: `cmake -G Ninja -DCMAKE_C_COMPILER=<HIP_DIR>/bin/clang.exe -DCMAKE_CXX_COMPILER=<HIP_DIR>/bin/clang++.exe`
27 | * To build: `ninja`
28 | * To run the test: `./test_openmp_helloworld`
29 | 
30 | **Note:** You may override `AMDGPU_TARGETS` in the HIP config file by modifying the CMakeLists.txt.
31 | 
32 | ## Expected Results:
33 | ```
34 | info: running on device Device 66a3
35 | Hello World... from OMP thread = 0
36 | Hello World... from OMP thread = 15
37 | Hello World... from OMP thread = 3
38 | Hello World... from OMP thread = 13
39 | Hello World... from OMP thread = 11
40 | Hello World... from OMP thread = 8
41 | Hello World... from OMP thread = 4
42 | Hello World... from OMP thread = 1
43 | Hello World... from OMP thread = 10
44 | Hello World... from OMP thread = 9
45 | Hello World... from OMP thread = 7
46 | Hello World... from OMP thread = 12
47 | Hello World... from OMP thread = 6
48 | Hello World... from OMP thread = 14
49 | Hello World... from OMP thread = 5
50 | Hello World... from OMP thread = 2
51 | Hello World... from HIP thread = 0
52 | Hello World... from HIP thread = 2
53 | Hello World... from HIP thread = 5
54 | Hello World... from HIP thread = 14
55 | Hello World... from HIP thread = 6
56 | Hello World... from HIP thread = 12
57 | Hello World... from HIP thread = 7
58 | Hello World... from HIP thread = 9
59 | Hello World... from HIP thread = 1
60 | Hello World... from HIP thread = 11
61 | Hello World... from HIP thread = 10
62 | Hello World... from HIP thread = 4
63 | Hello World... from HIP thread = 8
64 | Hello World... from HIP thread = 13
65 | Hello World... from HIP thread = 15
66 | Hello World... from HIP thread = 3
67 | Device Results:
68 |   A_d[0] = 0
69 |   A_d[1] = 1
70 |   A_d[2] = 2
71 |   A_d[3] = 3
72 |   A_d[4] = 4
73 |   A_d[5] = 5
74 |   A_d[6] = 6
75 |   A_d[7] = 7
76 |   A_d[8] = 8
77 |   A_d[9] = 9
78 |   A_d[10] = 10
79 |   A_d[11] = 11
80 |   A_d[12] = 12
81 |   A_d[13] = 13
82 |   A_d[14] = 14
83 |   A_d[15] = 15
84 | PASSED!
85 | ```
86 | 
87 | **Note:** HIP thread's printf may not display on builds with printf support disabled.
88 | 


--------------------------------------------------------------------------------
/openmp-helloworld/openmp_helloworld.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | 
23 | // OpenMP program to print Hello World
24 | // using C language is supported by HIP
25 | 
26 | // HIP header
27 | #include <hip/hip_runtime.h>
28 | 
29 | #include <stdio.h>
30 | #include <stdlib.h>
31 | 
32 | //OpenMP header
33 | #include <omp.h>
34 | 
35 | #define NUM_THREADS 16
// Evaluate a HIP runtime call once. If it returns anything other than
// hipSuccess, print the error string, numeric code and source location to
// stderr and exit with EXIT_FAILURE.
// The body is wrapped in do { } while (0) so that `CHECK(x);` is a single
// statement and stays correct inside an unbraced if/else.
#define CHECK(cmd) \
do {\
    hipError_t error  = (cmd);\
    if (error != hipSuccess) { \
        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
        exit(EXIT_FAILURE);\
    }\
} while (0)
44 | 
45 | __global__ void
46 | hip_helloworld(unsigned omp_id, int* A_d)
47 | {
48 |     // Note: the printf command will only work if printf is enabled in your build.
49 |     printf("Hello World... from HIP thread = %u\n", omp_id);
50 | 
51 |     A_d[omp_id] = omp_id;
52 | }
53 | 
54 | int main(int argc, char* argv[])
55 | {
56 |     int* A_h, * A_d;
57 |     size_t Nbytes = NUM_THREADS * sizeof(int);
58 | 
59 |     hipDeviceProp_t props;
60 |     CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
61 |     printf("info: running on device %s\n", props.name);
62 | 
63 |     A_h = (int*)malloc(Nbytes);
64 |     CHECK(hipMalloc(&A_d, Nbytes));
65 |     for (int i = 0; i < NUM_THREADS; i++) {
66 |         A_h[i] = 0;
67 |     }
68 |     CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
69 | 
70 |     // Beginning of parallel region
71 |     #pragma omp parallel num_threads(NUM_THREADS)
72 |     {
73 |         fprintf(stderr, "Hello World... from OMP thread = %d\n",
74 |                omp_get_thread_num());
75 | 
76 |         hipLaunchKernelGGL(hip_helloworld, dim3(1), dim3(1), 0, 0, omp_get_thread_num(), A_d);
77 |     }
78 |     // Ending of parallel region
79 | 
80 |     hipStreamSynchronize(0);
81 |     CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
82 |     printf("Device Results:\n");
83 |     for (int i = 0; i < NUM_THREADS; i++) {
84 |         printf("  A_d[%d] = %d\n", i, A_h[i]);
85 |     }
86 | 
87 |     printf ("PASSED!\n");
88 | 
89 |     free(A_h);
90 |     CHECK(hipFree(A_d));
91 |     return 0;
92 | }
93 | 


--------------------------------------------------------------------------------
/reduction/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | CXXFLAGS += -std=c++11 -O3
 5 | 
 6 | reduction: reduction.cpp
 7 | ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
 8 | 	${HIPCC} ${CXXFLAGS} -o $@ $^ 
 9 | else
10 | 	$(error "Cannot find $(HIPCC), please install HIP toolkit")
11 | endif
12 | 
13 | .PHONY: clean
14 | 
15 | clean:
16 | 	rm -f reduction *.o
17 | 


--------------------------------------------------------------------------------
/reduction/README.md:
--------------------------------------------------------------------------------
1 | # reduction
2 | reduction example with atomic_add using HIP.
3 | To build:
4 |     make
5 | To execute:
6 |     ./run.sh
7 | 


--------------------------------------------------------------------------------
/reduction/reduction.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
  3 | 
  4 | Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | of this software and associated documentation files (the "Software"), to deal
  6 | in the Software without restriction, including without limitation the rights
  7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | copies of the Software, and to permit persons to whom the Software is
  9 | furnished to do so, subject to the following conditions:
 10 | 
 11 | The above copyright notice and this permission notice shall be included in
 12 | all copies or substantial portions of the Software.
 13 | 
 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | THE SOFTWARE.
 21 | */
 22 | 
 23 | #include <cstdio>
 24 | #include <iostream>
 25 | #include <fstream>
 26 | #include <vector>
 27 | #include <chrono>
 28 | #include <cfloat>
 29 | #include <iomanip>
 30 | #include <cmath>
 31 | #include "hip/hip_runtime.h"
 32 | 
 33 | 
 34 | void check_hip_error(void)
 35 | {
 36 | hipError_t err = hipGetLastError();
 37 | if (err != hipSuccess)
 38 | {
 39 |     std::cerr
 40 |         << "Error: "
 41 |         << hipGetErrorString(err)
 42 |         << std::endl;
 43 |         exit(err);
 44 | }
 45 | }
 46 | 
 47 | __global__ void atomic_reduction_kernel(int *in, int* out, int ARRAYSIZE) {
 48 |     int sum=int(0);
 49 |     int idx = hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;
 50 |     for(int i= idx;i<ARRAYSIZE;i+=hipBlockDim_x*hipGridDim_x) {
 51 |         sum+=in[i];
 52 |     }
 53 |     atomicAdd(out,sum);
 54 | }
 55 | 
 56 | __global__ void atomic_reduction_kernel2(int *in, int* out, int ARRAYSIZE) {
 57 |     int sum=int(0);
 58 |     int idx = hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;
 59 |     for(int i= idx*16;i<ARRAYSIZE;i+=hipBlockDim_x*hipGridDim_x*16) {
 60 |         sum+=in[i] + in[i+1] + in[i+2] + in[i+3] +in[i+4] +in[i+5] +in[i+6] +in[i+7] +in[i+8] +in[i+9] +in[i+10]
 61 |             +in[i+11] +in[i+12] +in[i+13] +in[i+14] +in[i+15] ;
 62 |     }
 63 |     atomicAdd(out,sum);
 64 | }
 65 | 
 66 | __global__ void atomic_reduction_kernel3(int *in, int* out, int ARRAYSIZE) {
 67 |     int sum=int(0);
 68 |     int idx = hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;
 69 |     for(int i= idx*4;i<ARRAYSIZE;i+=hipBlockDim_x*hipGridDim_x*4) {
 70 |         sum+=in[i] + in[i+1] + in[i+2] + in[i+3];
 71 |     }
 72 |     atomicAdd(out,sum);
 73 | }
 74 | 
 75 | int main(int argc, char** argv)
 76 | {
 77 |     unsigned int ARRAYSIZE = 52428800;
 78 |     if(argc<2) {
 79 |         printf("Usage: ./reduction num_of_elems\n");
 80 |         printf("using default value: %d\n",ARRAYSIZE);
 81 |     }else
 82 |         ARRAYSIZE=atoi(argv[1]);
 83 |     int N = 10;
 84 |     printf("ARRAYSIZE: %d\n", ARRAYSIZE);
 85 | 
 86 |     std::cout << "Array size: " << ARRAYSIZE*sizeof(int)/1024.0/1024.0 << " MB"<<std::endl;
 87 |     int* array=(int*)malloc(ARRAYSIZE*sizeof(int));
 88 |     int checksum =0;
 89 |     for(int i=0;i<ARRAYSIZE;i++) {
 90 |         array[i]=rand()%2;
 91 |         checksum+=array[i];
 92 |     }
 93 |     int *in, *out;
 94 | 
 95 |     // Declare timers
 96 |     std::chrono::high_resolution_clock::time_point t1, t2;
 97 | 
 98 | 
 99 |     long long size=sizeof(int)*ARRAYSIZE;
100 | 
101 |     hipMalloc(&in,size);
102 |     hipMalloc(&out,sizeof(int));
103 |     check_hip_error();
104 | 
105 |     hipMemcpy(in,array,ARRAYSIZE*sizeof(int),hipMemcpyHostToDevice);
106 |     hipDeviceSynchronize();
107 |     check_hip_error();
108 |     // Get device properties
109 |     hipDeviceProp_t props;
110 |     hipGetDeviceProperties(&props, 0);
111 | 
112 | 
113 |     int threads=256;
114 |     int blocks=std::min((ARRAYSIZE+threads-1)/threads,2048u);
115 | 
116 |     t1 = std::chrono::high_resolution_clock::now();
117 |     for(int i=0;i<N;i++) {
118 |         hipMemsetAsync(out,0,sizeof(int));
119 |         hipLaunchKernelGGL(atomic_reduction_kernel, dim3(blocks), dim3(threads), 0, 0, in,out,ARRAYSIZE);
120 |         //hipLaunchKernelGGL(atomic_reduction_kernel2, dim3(blocks), dim3(threads), 0, 0, in,out,ARRAYSIZE);
121 |         //hipLaunchKernelGGL(atomic_reduction_kernel3, dim3(blocks), dim3(threads), 0, 0, in,out,ARRAYSIZE);
122 | 
123 |         check_hip_error();
124 |         hipDeviceSynchronize();
125 |         check_hip_error();
126 |     }
127 |     t2 = std::chrono::high_resolution_clock::now();
128 |     double times =  std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count();
129 |     float GB=(float)ARRAYSIZE*sizeof(int)*N;
130 |     std::cout
131 |         << "The average performance of reduction is "<< 1.0E-09 * GB/times<<" GBytes/sec"<<std::endl;
132 | 
133 |     int sum;
134 |     hipMemcpy(&sum,out,sizeof(int),hipMemcpyDeviceToHost);
135 |     check_hip_error();
136 | 
137 |     if(sum==checksum)
138 |         std::cout<<"VERIFICATION: result is CORRECT"<<std::endl<<std::endl;
139 |     else
140 |         std::cout<<"VERIFICATION: result is INCORRECT!!"<<std::endl<<std::endl;
141 | 
142 |     hipFree(in);
143 |     hipFree(out);
144 |     check_hip_error();
145 | 
146 |     free(array);
147 | 
148 | }
149 | 


--------------------------------------------------------------------------------
/reduction/run.sh:
--------------------------------------------------------------------------------
 1 | #execute the program
 2 | EXE=reduction
 3 | K=1024*1024*4
 4 | for i in {1..8}
 5 | do
 6 |     echo ./$EXE $K
 7 |     ./$EXE $K
 8 |     K=$(($K*2))
 9 | done
10 | 
11 | 


--------------------------------------------------------------------------------
/rtm8/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | 
 4 | CXXFLAGS += -std=c++11 -O3
 5 | 
 6 | rtm8: rtm8.cpp
 7 | ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
 8 | 	${HIPCC} ${CXXFLAGS} -o $@ $^ 
 9 | else
10 | 	$(error "Cannot find $(HIPCC), please install HIP toolkit")
11 | endif
12 | 
13 | .PHONY: clean
14 | 
15 | clean:
16 | 	rm -f rtm8 *.o
17 | 


--------------------------------------------------------------------------------
/rtm8/README.md:
--------------------------------------------------------------------------------
 1 | rtm8 is an example ported from a Fortran algorithm contributed by Morton, Scott from HESS company.
 2 | The original Fortran version:
 3 |     ./build_fortran.sh
 4 |     ./rtm8_fortran
 5 | The HIP version:
 6 |     ./build_hip.sh
 7 |     ./rtm8_hip
 8 | The CUDA version:
 9 |     ./build_cuda.sh
10 |     ./rtm8_cuda
11 | 


--------------------------------------------------------------------------------
/rtm8/build_cuda.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build the CUDA version of rtm8 with nvcc, removing any previous binary first.
if [ -f "rtm8_cuda" ]
then
    rm rtm8_cuda
fi
# Echo exactly the command that is executed: the CUDA source file is rtm8.cu.
echo "nvcc -O3 rtm8.cu -o rtm8_cuda"
nvcc -O3 rtm8.cu -o rtm8_cuda
8 | 


--------------------------------------------------------------------------------
/rtm8/build_fortran.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build the Fortran version of rtm8 together with its C timer routine.

# Start from a clean slate if an earlier binary is present.
[ -f "rtm8_fortran" ] && rm rtm8_fortran

# Compile the Fortran program and the timer separately.  -DUNDERSCORE makes
# mysecond.c define mysecond_ instead of mysecond.
gfortran -c rtm8.f
gcc -c -DUNDERSCORE mysecond.c

# Link both objects into the final executable.
gfortran -o rtm8_fortran rtm8.o mysecond.o
9 | 


--------------------------------------------------------------------------------
/rtm8/build_hip.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -z  "$HIP_PATH" ]
 4 | then
 5 | 
 6 | if [ -d /opt/rocm/hip ]
 7 | then
 8 |     HIP_PATH=/opt/rocm/hip
 9 | else
10 |     HIP_PATH=/opt/rocm
11 | fi
12 | 
13 | fi
14 | 
15 | if [ -f "rtm8_hip" ]
16 | then
17 |     rm rtm8_hip
18 | fi
19 | 
20 | echo "hipcc -std=c++11 -O3 -o rtm8_hip rtm8.cpp"
21 | $HIP_PATH/bin/hipcc -std=c++11 -O3 -o rtm8_hip rtm8.cpp
22 | 
23 | 


--------------------------------------------------------------------------------
/rtm8/mysecond.c:
--------------------------------------------------------------------------------
 1 | /* A gettimeofday routine to give access to the wall
 2 |    clock timer on most UNIX-like systems.
 3 | 
 4 |    You will need to compile with "-DUNDERSCORE"
 5 |    to get this to link with FORTRAN on many systems.
 6 | */
 7 | 
 8 | #include <sys/time.h>
 9 | /* int gettimeofday(struct timeval *tp, struct timezone *tzp); */
10 | 
/*
 * Return the current wall-clock time in seconds, with the microsecond part
 * of gettimeofday() folded in as a fraction.
 *
 * When compiled with -DUNDERSCORE the routine is named mysecond_ instead of
 * mysecond, which is the spelling needed to link with FORTRAN on many
 * systems.
 */
#ifdef UNDERSCORE
double mysecond_()
#else
double mysecond()
#endif
{
        struct timeval tp;

        /* Only the time value is used.  The timezone argument of
           gettimeofday() is obsolete, so a null pointer is passed for it
           (written as 0 so the file also compiles when included from C++). */
        gettimeofday(&tp, 0);
        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}
30 | 
31 | 


--------------------------------------------------------------------------------
/rtm8/rtm8.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
  3 | 
  4 | Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | of this software and associated documentation files (the "Software"), to deal
  6 | in the Software without restriction, including without limitation the rights
  7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | copies of the Software, and to permit persons to whom the Software is
  9 | furnished to do so, subject to the following conditions:
 10 | 
 11 | The above copyright notice and this permission notice shall be included in
 12 | all copies or substantial portions of the Software.
 13 | 
 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | THE SOFTWARE.
 21 | */
 22 | 
 23 | #include "hip/hip_runtime.h"
 24 | #include <iostream>
 25 | #include <math.h>
 26 | #include <stdio.h>
 27 | #include <stdlib.h>
 28 | #include <vector>
 29 | #include "mysecond.c"
 30 | 
 31 | #define nt 30
 32 | #define nx 680
 33 | #define ny 134
 34 | #define nz 450
 35 | 
 36 | inline __host__ __device__ int indexTo1D(int x, int y, int z){
 37 |     return x + y*ny + z*ny*nz;
 38 | }
 39 | 
 40 | __global__ void
 41 | rtm8(float* vsq, float* current_s, float* current_r, float* next_s, float* next_r, float* image, float* a, size_t N)
 42 | {
 43 |     unsigned x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
 44 |     unsigned y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
 45 |     unsigned z = hipBlockIdx_z*hipBlockDim_z + hipThreadIdx_z;
 46 |     float div;
 47 |     if ((4 <= x && x < (nx - 4) ) && (4 <= y && y < (ny - 4)) && (4 <= z && z < (nz - 4))){
 48 |         div =
 49 |             a[0] * current_s[indexTo1D(x,y,z)] +
 50 |             a[1] * (current_s[indexTo1D(x+1,y,z)] + current_s[indexTo1D(x-1,y,z)] +
 51 |                     current_s[indexTo1D(x,y+1,z)] + current_s[indexTo1D(x,y-1,z)] +
 52 |                     current_s[indexTo1D(x,y,z+1)] + current_s[indexTo1D(x,y,z-1)]) +
 53 |             a[2] * (current_s[indexTo1D(x+2,y,z)] + current_s[indexTo1D(x-2,y,z)] +
 54 |                     current_s[indexTo1D(x,y+2,z)] + current_s[indexTo1D(x,y-2,z)] +
 55 |                     current_s[indexTo1D(x,y,z+2)] + current_s[indexTo1D(x,y,z-2)]) +
 56 |             a[3] * (current_s[indexTo1D(x+3,y,z)] + current_s[indexTo1D(x-3,y,z)] +
 57 |                     current_s[indexTo1D(x,y+3,z)] + current_s[indexTo1D(x,y-3,z)] +
 58 |                     current_s[indexTo1D(x,y,z+3)] + current_s[indexTo1D(x,y,z-3)]) +
 59 |             a[4] * (current_s[indexTo1D(x+4,y,z)] + current_s[indexTo1D(x-4,y,z)] +
 60 |                     current_s[indexTo1D(x,y+4,z)] + current_s[indexTo1D(x,y-4,z)] +
 61 |                     current_s[indexTo1D(x,y,z+4)] + current_s[indexTo1D(x,y,z-4)]);
 62 | 
 63 |         next_s[indexTo1D(x,y,z)] = 2*current_s[indexTo1D(x,y,z)] - next_s[indexTo1D(x,y,z)]
 64 |             + vsq[indexTo1D(x,y,z)]*div;
 65 |         div =
 66 |             a[0] * current_r[indexTo1D(x,y,z)] +
 67 |             a[1] * (current_r[indexTo1D(x+1,y,z)] + current_r[indexTo1D(x-1,y,z)] +
 68 |                     current_r[indexTo1D(x,y+1,z)] + current_r[indexTo1D(x,y-1,z)] +
 69 |                     current_r[indexTo1D(x,y,z+1)] + current_r[indexTo1D(x,y,z-1)]) +
 70 |             a[2] * (current_r[indexTo1D(x+2,y,z)] + current_r[indexTo1D(x-2,y,z)] +
 71 |                     current_r[indexTo1D(x,y+2,z)] + current_r[indexTo1D(x,y-2,z)] +
 72 |                     current_r[indexTo1D(x,y,z+2)] + current_r[indexTo1D(x,y,z-2)]) +
 73 |             a[3] * (current_r[indexTo1D(x+3,y,z)] + current_r[indexTo1D(x-3,y,z)] +
 74 |                     current_r[indexTo1D(x,y+3,z)] + current_r[indexTo1D(x,y-3,z)] +
 75 |                     current_r[indexTo1D(x,y,z+3)] + current_r[indexTo1D(x,y,z-3)]) +
 76 |             a[4] * (current_r[indexTo1D(x+4,y,z)] + current_r[indexTo1D(x-4,y,z)] +
 77 |                     current_r[indexTo1D(x,y+4,z)] + current_r[indexTo1D(x,y-4,z)] +
 78 |                     current_r[indexTo1D(x,y,z+4)] + current_r[indexTo1D(x,y,z-4)]);
 79 | 
 80 |         next_r[indexTo1D(x,y,z)] = 2 * current_r[indexTo1D(x,y,z)]
 81 |             - next_r[indexTo1D(x,y,z)] + vsq[indexTo1D(x,y,z)] * div;
 82 | 
 83 |         image[indexTo1D(x,y,z)] = next_s[indexTo1D(x,y,z)] * next_r[indexTo1D(x,y,z)];
 84 |     }
 85 | }
 86 | 
 87 | // Code to check HIP errors
 88 | void check_hip_error(void)
 89 | {
 90 |     hipError_t err = hipGetLastError();
 91 |     if (err != hipSuccess)
 92 |     {
 93 |         std::cerr
 94 |             << "Error: "
 95 |             << hipGetErrorString(err)
 96 |             << std::endl;
 97 |             exit(err);
 98 |     }
 99 | }
100 | 
101 | 
102 | int main(){
103 |     const int ArraySize = nx + nx*ny + nx*ny*nz;
104 | 
105 |     float* next_s = (float*)malloc(ArraySize * sizeof(float));
106 |     float* current_s = (float*)malloc(ArraySize * sizeof(float));
107 |     float* next_r = (float*)malloc(ArraySize * sizeof(float));
108 |     float* current_r = (float*)malloc(ArraySize * sizeof(float));
109 |     float* vsq = (float*)malloc(ArraySize * sizeof(float));
110 |     float* image = (float*)malloc(ArraySize * sizeof(float));
111 | 
112 |     float a[5];
113 | 
114 |     double pts, t0, t1, dt, flops, pt_rate, flop_rate, speedup, memory;
115 | 
116 |     memory = nx*ny*nz*4*6;
117 |     pts = nt;
118 | 	pts = pts*(nx-8)*(ny-8)*(nz-8);
119 | 	flops = 67*pts;
120 |     printf("memory (MB) = %f\n", memory/1e6);
121 |     printf("pts (billions) = %f\n", pts/1e9);
122 |     printf("Tflops = %f\n", flops/1e12);
123 | 
124 | // Initialization of matrix
125 | 	a[0] = -1./560.;
126 | 	a[1] = 8./315;
127 | 	a[2] = -0.2;
128 | 	a[3] = 1.6;
129 | 	a[4] = -1435./504.;
130 | 
131 |     for (int z = 0; z < nz; z++) {
132 |         for (int y = 0; y < ny; y++) {
133 |             for (int x = 0; x < nx; x++) {
134 |                 vsq[indexTo1D(x,y,z)] = 1.0;
135 |                 next_s[indexTo1D(x,y,z)] = 0;
136 |                 current_s[indexTo1D(x,y,z)] = 0;
137 |                 next_r[indexTo1D(x,y,z)] = 0;
138 |                 current_r[indexTo1D(x,y,z)] = 0;
139 |                 image[indexTo1D(x,y,z)] = 0;
140 |             }
141 |         }
142 |     }
143 | 
144 |     t0 = mysecond();
145 |     //allocate and copy matrix to device
146 |     float* vsq_d;
147 |     float* next_s_d;
148 |     float* current_s_d;
149 |     float* next_r_d;
150 |     float* current_r_d;
151 |     float* image_d;
152 |     float* a_d;
153 | 
154 | 	hipMalloc(&vsq_d, ArraySize * sizeof(float));
155 | 	hipMalloc(&next_s_d, ArraySize * sizeof(float));
156 | 	hipMalloc(&current_s_d, ArraySize * sizeof(float));
157 | 	hipMalloc(&next_r_d, ArraySize * sizeof(float));
158 | 	hipMalloc(&current_r_d, ArraySize * sizeof(float));
159 | 	hipMalloc(&image_d, ArraySize * sizeof(float));
160 | 	hipMalloc(&a_d, 5 * sizeof(float));
161 |     check_hip_error();
162 |     hipMemcpy(vsq_d, vsq, ArraySize * sizeof(float), hipMemcpyHostToDevice);
163 |     hipMemcpy(next_s_d, next_s, ArraySize * sizeof(float), hipMemcpyHostToDevice);
164 |     hipMemcpy(current_s_d, current_s, ArraySize * sizeof(float), hipMemcpyHostToDevice);
165 |     hipMemcpy(next_r_d, next_r, ArraySize * sizeof(float), hipMemcpyHostToDevice);
166 |     hipMemcpy(current_r_d, current_r, ArraySize * sizeof(float), hipMemcpyHostToDevice);
167 |     hipMemcpy(image_d, image, ArraySize * sizeof(float), hipMemcpyHostToDevice);
168 |     hipMemcpy(a_d, a, 5 * sizeof(float), hipMemcpyHostToDevice);
169 |     check_hip_error();
170 |     // Make sure the copies are finished
171 |     hipDeviceSynchronize();
172 |     check_hip_error();
173 | 
174 |     int gridSize = 256*256;
175 |     int groupSize = 256;
176 | 
177 | 
178 |     for (int t = 0; t < nt; t++) {
179 |         //Launch the HIP kernel
180 |         hipLaunchKernelGGL(rtm8, dim3(gridSize), dim3(groupSize), 0, 0, (float*)vsq_d, (float*)current_s_d, (
181 |                     float*)next_s_d, (float*)current_r_d,(float*)next_r_d, (float*)image_d, (float*)a_d, ArraySize);
182 |     }
183 |     //copy back image value
184 |     hipMemcpy(image, image_d,ArraySize * sizeof(float), hipMemcpyDeviceToHost);
185 |     hipDeviceSynchronize();
186 |     t1 = mysecond();
187 | 
188 |     dt = t1 - t0;
189 |     pt_rate = pts/dt;
190 |     flop_rate = flops/dt;
191 |     speedup = 2*pow(10, 9)/3/pt_rate;
192 |     printf("dt = %f\n", dt);
193 |     printf("pt_rate (millions/sec) = %f\n", pt_rate/1e6);
194 |     printf("flop_rate (Gflops) = %f\n", flop_rate/1e9);
195 |     printf("speedup = %f\n", speedup);
196 | 
197 |     //release arrays
198 |     free(vsq);
199 |     free(next_s);
200 |     free(current_s);
201 |     free(next_r);
202 |     free(current_r);
203 |     free(image);
204 |     return 0;
205 | 
206 | }
207 | 
208 | 


--------------------------------------------------------------------------------
/rtm8/rtm8.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
  3 | 
  4 | Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | of this software and associated documentation files (the "Software"), to deal
  6 | in the Software without restriction, including without limitation the rights
  7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | copies of the Software, and to permit persons to whom the Software is
  9 | furnished to do so, subject to the following conditions:
 10 | 
 11 | The above copyright notice and this permission notice shall be included in
 12 | all copies or substantial portions of the Software.
 13 | 
 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | THE SOFTWARE.
 21 | */
 22 | 
 23 | #include <cuda.h>
 24 | #include <cuda_runtime.h>
 25 | #include <iostream>
 26 | #include <math.h>
 27 | #include <stdio.h>
 28 | #include <stdlib.h>
 29 | #include <vector>
 30 | #include "mysecond.c"
 31 | 
 32 | #define nt 30
 33 | #define nx 680
 34 | #define ny 134
 35 | #define nz 450
 36 | 
 37 | inline __host__ __device__ int indexTo1D(int x, int y, int z){
 38 |     return x + y*ny + z*ny*nz;
 39 | }
 40 | 
 41 | __global__ void
 42 | rtm8(float* vsq, float* current_s, float* current_r, float* next_s, float* next_r, float* image, float* a, size_t N)
 43 | {
 44 |     unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
 45 |     unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
 46 |     unsigned z = blockIdx.z * blockDim.z + threadIdx.z;
 47 |     float div;
 48 |     if ((4 <= x && x < (nx - 4) ) && (4 <= y && y < (ny - 4)) && (4 <= z && z < (nz - 4))){
 49 |         div =
 50 |             a[0] * current_s[indexTo1D(x,y,z)] +
 51 |             a[1] * (current_s[indexTo1D(x+1,y,z)] + current_s[indexTo1D(x-1,y,z)] +
 52 |                     current_s[indexTo1D(x,y+1,z)] + current_s[indexTo1D(x,y-1,z)] +
 53 |                     current_s[indexTo1D(x,y,z+1)] + current_s[indexTo1D(x,y,z-1)]) +
 54 |             a[2] * (current_s[indexTo1D(x+2,y,z)] + current_s[indexTo1D(x-2,y,z)] +
 55 |                     current_s[indexTo1D(x,y+2,z)] + current_s[indexTo1D(x,y-2,z)] +
 56 |                     current_s[indexTo1D(x,y,z+2)] + current_s[indexTo1D(x,y,z-2)]) +
 57 |             a[3] * (current_s[indexTo1D(x+3,y,z)] + current_s[indexTo1D(x-3,y,z)] +
 58 |                     current_s[indexTo1D(x,y+3,z)] + current_s[indexTo1D(x,y-3,z)] +
 59 |                     current_s[indexTo1D(x,y,z+3)] + current_s[indexTo1D(x,y,z-3)]) +
 60 |             a[4] * (current_s[indexTo1D(x+4,y,z)] + current_s[indexTo1D(x-4,y,z)] +
 61 |                     current_s[indexTo1D(x,y+4,z)] + current_s[indexTo1D(x,y-4,z)] +
 62 |                     current_s[indexTo1D(x,y,z+4)] + current_s[indexTo1D(x,y,z-4)]);
 63 | 
 64 |         next_s[indexTo1D(x,y,z)] = 2*current_s[indexTo1D(x,y,z)] - next_s[indexTo1D(x,y,z)]
 65 |             + vsq[indexTo1D(x,y,z)]*div;
 66 |         div =
 67 |             a[0] * current_r[indexTo1D(x,y,z)] +
 68 |             a[1] * (current_r[indexTo1D(x+1,y,z)] + current_r[indexTo1D(x-1,y,z)] +
 69 |                     current_r[indexTo1D(x,y+1,z)] + current_r[indexTo1D(x,y-1,z)] +
 70 |                     current_r[indexTo1D(x,y,z+1)] + current_r[indexTo1D(x,y,z-1)]) +
 71 |             a[2] * (current_r[indexTo1D(x+2,y,z)] + current_r[indexTo1D(x-2,y,z)] +
 72 |                     current_r[indexTo1D(x,y+2,z)] + current_r[indexTo1D(x,y-2,z)] +
 73 |                     current_r[indexTo1D(x,y,z+2)] + current_r[indexTo1D(x,y,z-2)]) +
 74 |             a[3] * (current_r[indexTo1D(x+3,y,z)] + current_r[indexTo1D(x-3,y,z)] +
 75 |                     current_r[indexTo1D(x,y+3,z)] + current_r[indexTo1D(x,y-3,z)] +
 76 |                     current_r[indexTo1D(x,y,z+3)] + current_r[indexTo1D(x,y,z-3)]) +
 77 |             a[4] * (current_r[indexTo1D(x+4,y,z)] + current_r[indexTo1D(x-4,y,z)] +
 78 |                     current_r[indexTo1D(x,y+4,z)] + current_r[indexTo1D(x,y-4,z)] +
 79 |                     current_r[indexTo1D(x,y,z+4)] + current_r[indexTo1D(x,y,z-4)]);
 80 | 
 81 |         next_r[indexTo1D(x,y,z)] = 2 * current_r[indexTo1D(x,y,z)]
 82 |             - next_r[indexTo1D(x,y,z)] + vsq[indexTo1D(x,y,z)] * div;
 83 | 
 84 |         image[indexTo1D(x,y,z)] = next_s[indexTo1D(x,y,z)] * next_r[indexTo1D(x,y,z)];
 85 |     }
 86 | }
 87 | 
 88 | // Code to check CUDA errors
 89 | void check_cuda_error(void)
 90 | {
 91 |     cudaError_t err = cudaGetLastError();
 92 |     if (err != cudaSuccess)
 93 |     {
 94 |         std::cerr
 95 |             << "Error: "
 96 |             << cudaGetErrorString(err)
 97 |             << std::endl;
 98 |             exit(err);
 99 |     }
100 | }
101 | 
102 | int main(){
103 |     const int ArraySize = nx + nx*ny + nx*ny*nz;
104 | 
105 |     float* next_s = (float*)malloc(ArraySize * sizeof(float));
106 |     float* current_s = (float*)malloc(ArraySize * sizeof(float));
107 |     float* next_r = (float*)malloc(ArraySize * sizeof(float));
108 |     float* current_r = (float*)malloc(ArraySize * sizeof(float));
109 |     float* vsq = (float*)malloc(ArraySize * sizeof(float));
110 |     float* image = (float*)malloc(ArraySize * sizeof(float));
111 | 
112 |     float a[5];
113 | 
114 |     double pts, t0, t1, dt, flops, pt_rate, flop_rate, speedup, memory;
115 | 
116 |     memory = nx*ny*nz*4*6;
117 |     pts = nt;
118 | 	pts = pts*(nx-8)*(ny-8)*(nz-8);
119 | 	flops = 67*pts;
120 |     printf("memory (MB) = %f\n", memory/1e6);
121 |     printf("pts (billions) = %f\n", pts/1e9);
122 |     printf("Tflops = %f\n", flops/1e12);
123 | 
124 | // Initialization of matrix
125 | 	a[0] = -1./560.;
126 | 	a[1] = 8./315;
127 | 	a[2] = -0.2;
128 | 	a[3] = 1.6;
129 | 	a[4] = -1435./504.;
130 | 
131 |     for (int z = 0; z < nz; z++) {
132 |         for (int y = 0; y < ny; y++) {
133 |             for (int x = 0; x < nx; x++) {
134 |                 vsq[indexTo1D(x,y,z)] = 1.0;
135 |                 next_s[indexTo1D(x,y,z)] = 0;
136 |                 current_s[indexTo1D(x,y,z)] = 0;
137 |                 next_r[indexTo1D(x,y,z)] = 0;
138 |                 current_r[indexTo1D(x,y,z)] = 0;
139 |                 image[indexTo1D(x,y,z)] = 0;
140 |             }
141 |         }
142 |     }
143 | 
144 |     t0 = mysecond();
145 |     //allocate and copy matrix to device
146 |     float* vsq_d;
147 |     float* next_s_d;
148 |     float* current_s_d;
149 |     float* next_r_d;
150 |     float* current_r_d;
151 |     float* image_d;
152 |     float* a_d;
153 | 
154 | 	cudaMalloc(&vsq_d, ArraySize * sizeof(float));
155 | 	cudaMalloc(&next_s_d, ArraySize * sizeof(float));
156 | 	cudaMalloc(&current_s_d, ArraySize * sizeof(float));
157 | 	cudaMalloc(&next_r_d, ArraySize * sizeof(float));
158 | 	cudaMalloc(&current_r_d, ArraySize * sizeof(float));
159 | 	cudaMalloc(&image_d, ArraySize * sizeof(float));
160 | 	cudaMalloc(&a_d, 5 * sizeof(float));
161 |     check_cuda_error();
162 |     cudaMemcpy(vsq_d, vsq, ArraySize * sizeof(float), cudaMemcpyHostToDevice);
163 |     cudaMemcpy(next_s_d, next_s, ArraySize * sizeof(float), cudaMemcpyHostToDevice);
164 |     cudaMemcpy(current_s_d, current_s, ArraySize * sizeof(float), cudaMemcpyHostToDevice);
165 |     cudaMemcpy(next_r_d, next_r, ArraySize * sizeof(float), cudaMemcpyHostToDevice);
166 |     cudaMemcpy(current_r_d, current_r, ArraySize * sizeof(float), cudaMemcpyHostToDevice);
167 |     cudaMemcpy(image_d, image, ArraySize * sizeof(float), cudaMemcpyHostToDevice);
168 |     cudaMemcpy(a_d, a, 5 * sizeof(float), cudaMemcpyHostToDevice);
169 |     check_cuda_error();
170 |     // Make sure the copies are finished
171 |     cudaDeviceSynchronize();
172 |     check_cuda_error();
173 | 
174 |     int gridSize = 256*256;
175 |     int groupSize = 256;
176 | 
177 | 
178 |     for (int t = 0; t < nt; t++) {
179 |         //Launch the HIP kernel
180 |         rtm8<<<dim3(gridSize), dim3(groupSize)>>>((float*)vsq_d, (float*)current_s_d, (float*)next_s_d, (float*)current_r_d,(float*)next_r_d, (float*)image_d, (float*)a_d, ArraySize);
181 |     }
182 |     //copy back image value
183 |     cudaMemcpy(image, image_d,ArraySize * sizeof(float), cudaMemcpyDeviceToHost);
184 |     cudaDeviceSynchronize();
185 |     t1 = mysecond();
186 | 
187 |     dt = t1 - t0;
188 |     pt_rate = pts/dt;
189 |     flop_rate = flops/dt;
190 |     speedup = 2*pow(10, 9)/3/pt_rate;
191 |     printf("dt = %f\n", dt);
192 |     printf("pt_rate (millions/sec) = %f\n", pt_rate/1e6);
193 |     printf("flop_rate (Gflops) = %f\n", flop_rate/1e9);
194 |     printf("speedup = %f\n", speedup);
195 | 
196 |     //release arrays
197 |     free(vsq);
198 |     free(next_s);
199 |     free(current_s);
200 |     free(next_r);
201 |     free(current_r);
202 |     free(image);
203 |     return 0;
204 | }
205 | 
206 | 


--------------------------------------------------------------------------------
/rtm8/rtm8.f:
--------------------------------------------------------------------------------
c	rtm8: stencil benchmark.  For nt time steps it applies a 25-point
c	star stencil (the centre point plus 4 neighbours in each direction
c	along x, y and z) to current_s and current_r, updates next_s and
c	next_r in place, stores their product in image, and finally prints
c	rates derived from the elapsed wall-clock time.
	program rtm8
	implicit none
c	NOTE: n is declared but not referenced anywhere in this program.
	integer	n, nt, nx, ny, nz
c	parameter( nt=100, nx=400, ny=400, nz=300 )
c	parameter( nt=100, nx=400, ny=100, nz=300 )
	parameter( nt=30, nx=680, ny=134, nz=450 )
	real	next_s(nx,ny,nz), current_s(nx,ny,nz)
	real	next_r(nx,ny,nz), current_r(nx,ny,nz)
	real	vsq(nx,ny,nz), image(nx,ny,nz)
c	a(1) weights the centre point, a(2)..a(5) the neighbours at
c	distance 1..4.
	real	a(5)
c	External wall-clock timer returning seconds as real*8.
	external	mysecond
	real*8		mysecond
c
	integer	t, x, y, z
	real*8	pts, t0, t1, dt, flops, pt_rate, flop_rate, speedup, memory
	real	div
c
c	memory: bytes in the six nx*ny*nz arrays of 4-byte reals.
c	pts: number of point updates (interior points times nt).
c	flops: 67 floating-point operations are counted per update.
	memory = nx*ny*nz*4*6
	pts = nt
	pts = pts*(nx-8)*(ny-8)*(nz-8)
	flops = 67.*pts
	print *, 'memory (MB) = ', memory/1e6
	print *, 'pts (billions) = ', pts/1e9
	print *, 'Tflops = ', flops/1e12
c
c	Stencil coefficients.
	a(1) = -1./560.
	a(2) = 8./315
	a(3) = -0.2
	a(4) = 1.6
	a(5) = -1435./504.
c
c	Initialise every array element; the loop is an OpenMP
c	work-sharing loop when OpenMP is enabled at compile time.
!$omp parallel
!$omp do
	do z = 1, nz
	do y = 1, ny
	do x = 1, nx
		vsq(x,y,z) = 1
		next_s(x,y,z) = 0
		current_s(x,y,z) = 0
		next_r(x,y,z) = 0
		current_r(x,y,z) = 0
		image(x,y,z) = 0
	enddo
	enddo
	enddo
!$omp enddo
!$omp end parallel
c
c	Timed section: only the time-step loop is measured.  The spatial
c	loops skip a 4-cell border because the stencil reaches 4 cells
c	in every direction.
	t0 = mysecond()
	do t = 1, nt
	do z = 5, nz-4
	do y = 5, ny-4
	do x = 5, nx-4
c		Stencil of current_s at (x,y,z).
     		div =
     &		a(1)*	current_s(x,y,z) +
     &		a(2)*(	current_s(x+1,y,z) + current_s(x-1,y,z) +
     &			current_s(x,y+1,z) + current_s(x,y-1,z) +
     &			current_s(x,y,z+1) + current_s(x,y,z-1) ) +
     &		a(3)*(	current_s(x+2,y,z) + current_s(x-2,y,z) +
     &			current_s(x,y+2,z) + current_s(x,y-2,z) +
     &			current_s(x,y,z+2) + current_s(x,y,z-2) ) +
     &		a(4)*(	current_s(x+3,y,z) + current_s(x-3,y,z) +
     &			current_s(x,y+3,z) + current_s(x,y-3,z) +
     &			current_s(x,y,z+3) + current_s(x,y,z-3) ) +
     &		a(5)*(	current_s(x+4,y,z) + current_s(x-4,y,z) +
     &			current_s(x,y+4,z) + current_s(x,y-4,z) +
     &			current_s(x,y,z+4) + current_s(x,y,z-4) )
c		In-place update of next_s from its previous value.
     		next_s(x,y,z) = 2.*current_s(x,y,z)
     &				- next_s(x,y,z) + vsq(x,y,z)* div
c		Same stencil and update for the r fields.
     		div =
     &		a(1)*	current_r(x,y,z) +
     &		a(2)*(	current_r(x+1,y,z) + current_r(x-1,y,z) +
     &			current_r(x,y+1,z) + current_r(x,y-1,z) +
     &			current_r(x,y,z+1) + current_r(x,y,z-1) ) +
     &		a(3)*(	current_r(x+2,y,z) + current_r(x-2,y,z) +
     &			current_r(x,y+2,z) + current_r(x,y-2,z) +
     &			current_r(x,y,z+2) + current_r(x,y,z-2) ) +
     &		a(4)*(	current_r(x+3,y,z) + current_r(x-3,y,z) +
     &			current_r(x,y+3,z) + current_r(x,y-3,z) +
     &			current_r(x,y,z+3) + current_r(x,y,z-3) ) +
     &		a(5)*(	current_r(x+4,y,z) + current_r(x-4,y,z) +
     &			current_r(x,y+4,z) + current_r(x,y-4,z) +
     &			current_r(x,y,z+4) + current_r(x,y,z-4) )
     		next_r(x,y,z) = 2.*current_r(x,y,z)
     &				- next_r(x,y,z) + vsq(x,y,z)* div
c		Image value is the product of the two updated fields.
		image(x,y,z) = next_s(x,y,z) * next_r(x,y,z)
	enddo
	enddo
	enddo
	enddo
	t1 = mysecond()
c
c	Derived rates.  speedup divides a fixed reference rate of
c	2e9/3 points per second by the measured point rate
c	(NOTE(review): the origin of that reference value is not
c	stated in this file).
	dt = t1 - t0
	pt_rate = pts/dt
	flop_rate = flops/dt
	speedup = 2.*10**9/3./pt_rate
	print *, 'dt  = ', dt
	print *, 'pt_rate (millions/sec) = ', pt_rate/1e6
	print *, 'flop_rate (Gflops) = ', flop_rate/1e9
	print *, 'speedup = ', speedup
c
	stop
	end
104 | 


--------------------------------------------------------------------------------
/strided-access/CL/cl_d3d10.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************************
  2 |  * Copyright (c) 2008-2010 The Khronos Group Inc.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and/or associated documentation files (the
  6 |  * "Materials"), to deal in the Materials without restriction, including
  7 |  * without limitation the rights to use, copy, modify, merge, publish,
  8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
  9 |  * permit persons to whom the Materials are furnished to do so, subject to
 10 |  * the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included
 13 |  * in all copies or substantial portions of the Materials.
 14 |  *
 15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 22 |  **********************************************************************************/
 23 | 
 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
 25 | 
/*
 * Declarations for the cl_khr_d3d10_sharing OpenCL extension: its error
 * codes, enumerant values, and the function-pointer types of its entry
 * points.  Requires the Direct3D 10 header for the ID3D10* and UINT types.
 */
#ifndef __OPENCL_CL_D3D10_H
#define __OPENCL_CL_D3D10_H

#include <d3d10.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>

#ifdef __cplusplus
extern "C" {
#endif

/******************************************************************************
 * cl_khr_d3d10_sharing                                                       */
#define cl_khr_d3d10_sharing 1

/* Enumerant types used by clGetDeviceIDsFromD3D10KHR_fn below. */
typedef cl_uint cl_d3d10_device_source_khr;
typedef cl_uint cl_d3d10_device_set_khr;

/******************************************************************************/

// Error Codes
#define CL_INVALID_D3D10_DEVICE_KHR                  -1002
#define CL_INVALID_D3D10_RESOURCE_KHR                -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005

// cl_d3d10_device_source_khr
#define CL_D3D10_DEVICE_KHR                          0x4010
#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011

// cl_d3d10_device_set_khr
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013

// cl_context_info
#define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C

// cl_mem_info
#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015

// cl_image_info
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016

// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018

/******************************************************************************/

/* Only function-pointer types are declared below, not the functions
 * themselves. */

// Pointer type for the clGetDeviceIDsFromD3D10KHR entry point.
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
    cl_platform_id             platform,
    cl_d3d10_device_source_khr d3d_device_source,
    void *                     d3d_object,
    cl_d3d10_device_set_khr    d3d_device_set,
    cl_uint                    num_entries,
    cl_device_id *             devices,
    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for the clCreateFromD3D10BufferKHR entry point.
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
    cl_context     context,
    cl_mem_flags   flags,
    ID3D10Buffer * resource,
    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for the clCreateFromD3D10Texture2DKHR entry point.
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D10Texture2D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for the clCreateFromD3D10Texture3DKHR entry point.
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D10Texture3D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for the clEnqueueAcquireD3D10ObjectsKHR entry point.
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for the clEnqueueReleaseD3D10ObjectsKHR entry point.
// NOTE(review): mem_objects is non-const here, unlike the acquire variant.
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    cl_mem *         mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;

#ifdef __cplusplus
}
#endif

#endif  // __OPENCL_CL_D3D10_H
126 | 
127 | 


--------------------------------------------------------------------------------
/strided-access/CL/cl_gl.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************************
  2 |  * Copyright (c) 2008-2010 The Khronos Group Inc.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and/or associated documentation files (the
  6 |  * "Materials"), to deal in the Materials without restriction, including
  7 |  * without limitation the rights to use, copy, modify, merge, publish,
  8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
  9 |  * permit persons to whom the Materials are furnished to do so, subject to
 10 |  * the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included
 13 |  * in all copies or substantial portions of the Materials.
 14 |  *
 15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 22 |  **********************************************************************************/
 23 | 
 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
 25 | 
 26 | /*
 27 |  * cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have
 28 |  * OpenGL dependencies. The application is responsible for #including
 29 |  * OpenGL or OpenGL ES headers before #including cl_gl.h.
 30 |  */
 31 | 
 32 | #ifndef __OPENCL_CL_GL_H
 33 | #define __OPENCL_CL_GL_H
 34 | 
 35 | #ifdef __APPLE__
 36 | #include <OpenCL/cl.h>
 37 | #include <OpenGL/CGLDevice.h>
 38 | #else
 39 | #include <CL/cl.h>
 40 | #endif	
 41 | 
 42 | #ifdef __cplusplus
 43 | extern "C" {
 44 | #endif
 45 | 
 46 | typedef cl_uint     cl_gl_object_type;
 47 | typedef cl_uint     cl_gl_texture_info;
 48 | typedef cl_uint     cl_gl_platform_info;
 49 | typedef struct __GLsync *cl_GLsync;
 50 | 
 51 | /* cl_gl_object_type */
 52 | #define CL_GL_OBJECT_BUFFER             0x2000
 53 | #define CL_GL_OBJECT_TEXTURE2D          0x2001
 54 | #define CL_GL_OBJECT_TEXTURE3D          0x2002
 55 | #define CL_GL_OBJECT_RENDERBUFFER       0x2003
 56 | 
 57 | /* cl_gl_texture_info */
 58 | #define CL_GL_TEXTURE_TARGET            0x2004
 59 | #define CL_GL_MIPMAP_LEVEL              0x2005
 60 | 
 61 | extern CL_API_ENTRY cl_mem CL_API_CALL
 62 | clCreateFromGLBuffer(cl_context     /* context */,
 63 |                      cl_mem_flags   /* flags */,
 64 |                      cl_GLuint      /* bufobj */,
 65 |                      int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 66 | 
 67 | extern CL_API_ENTRY cl_mem CL_API_CALL
 68 | clCreateFromGLTexture2D(cl_context      /* context */,
 69 |                         cl_mem_flags    /* flags */,
 70 |                         cl_GLenum       /* target */,
 71 |                         cl_GLint        /* miplevel */,
 72 |                         cl_GLuint       /* texture */,
 73 |                         cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 74 | 
 75 | extern CL_API_ENTRY cl_mem CL_API_CALL
 76 | clCreateFromGLTexture3D(cl_context      /* context */,
 77 |                         cl_mem_flags    /* flags */,
 78 |                         cl_GLenum       /* target */,
 79 |                         cl_GLint        /* miplevel */,
 80 |                         cl_GLuint       /* texture */,
 81 |                         cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 82 | 
 83 | extern CL_API_ENTRY cl_mem CL_API_CALL
 84 | clCreateFromGLRenderbuffer(cl_context   /* context */,
 85 |                            cl_mem_flags /* flags */,
 86 |                            cl_GLuint    /* renderbuffer */,
 87 |                            cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 88 | 
 89 | extern CL_API_ENTRY cl_int CL_API_CALL
 90 | clGetGLObjectInfo(cl_mem                /* memobj */,
 91 |                   cl_gl_object_type *   /* gl_object_type */,
 92 |                   cl_GLuint *              /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
 93 |                   
 94 | extern CL_API_ENTRY cl_int CL_API_CALL
 95 | clGetGLTextureInfo(cl_mem               /* memobj */,
 96 |                    cl_gl_texture_info   /* param_name */,
 97 |                    size_t               /* param_value_size */,
 98 |                    void *               /* param_value */,
 99 |                    size_t *             /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
100 | 
101 | extern CL_API_ENTRY cl_int CL_API_CALL
102 | clEnqueueAcquireGLObjects(cl_command_queue      /* command_queue */,
103 |                           cl_uint               /* num_objects */,
104 |                           const cl_mem *        /* mem_objects */,
105 |                           cl_uint               /* num_events_in_wait_list */,
106 |                           const cl_event *      /* event_wait_list */,
107 |                           cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
108 | 
109 | extern CL_API_ENTRY cl_int CL_API_CALL
110 | clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
111 |                           cl_uint               /* num_objects */,
112 |                           const cl_mem *        /* mem_objects */,
113 |                           cl_uint               /* num_events_in_wait_list */,
114 |                           const cl_event *      /* event_wait_list */,
115 |                           cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
116 | 
117 | /* cl_khr_gl_sharing extension  */
118 | 
119 | #define cl_khr_gl_sharing 1
120 | 
121 | typedef cl_uint     cl_gl_context_info;
122 | 
123 | /* Additional Error Codes  */
124 | #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
125 | 
126 | /* cl_gl_context_info  */
127 | #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
128 | #define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
129 | 
130 | /* Additional cl_context_properties  */
131 | #define CL_GL_CONTEXT_KHR                       0x2008
132 | #define CL_EGL_DISPLAY_KHR                      0x2009
133 | #define CL_GLX_DISPLAY_KHR                      0x200A
134 | #define CL_WGL_HDC_KHR                          0x200B
135 | #define CL_CGL_SHAREGROUP_KHR                   0x200C
136 | 
137 | extern CL_API_ENTRY cl_int CL_API_CALL
138 | clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
139 |                       cl_gl_context_info            /* param_name */,
140 |                       size_t                        /* param_value_size */,
141 |                       void *                        /* param_value */,
142 |                       size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
143 | 
144 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
145 |     const cl_context_properties * properties,
146 |     cl_gl_context_info            param_name,
147 |     size_t                        param_value_size,
148 |     void *                        param_value,
149 |     size_t *                      param_value_size_ret);
150 | 
151 | #ifdef __cplusplus
152 | }
153 | #endif
154 | 
155 | #endif  /* __OPENCL_CL_GL_H  */
156 | 


--------------------------------------------------------------------------------
/strided-access/CL/cl_gl_ext.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************************
 2 |  * Copyright (c) 2008-2010 The Khronos Group Inc.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and/or associated documentation files (the
 6 |  * "Materials"), to deal in the Materials without restriction, including
 7 |  * without limitation the rights to use, copy, modify, merge, publish,
 8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
 9 |  * permit persons to whom the Materials are furnished to do so, subject to
10 |  * the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included
13 |  * in all copies or substantial portions of the Materials.
14 |  *
15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
22 |  **********************************************************************************/
23 | 
24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
25 | 
26 | /* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
27 | /* OpenGL dependencies.                                                         */
28 | 
29 | #ifndef __OPENCL_CL_GL_EXT_H
30 | #define __OPENCL_CL_GL_EXT_H
31 | 
32 | #ifdef __cplusplus
33 | extern "C" {
34 | #endif
35 | 
36 | #ifdef __APPLE__
37 |     #include <OpenCL/cl_gl.h>
38 | #else
39 |     #include <CL/cl_gl.h>
40 | #endif
41 | 
42 | /*
43 |  * For each extension, follow this template
44 |  * /* cl_VEN_extname extension  */
45 | /* #define cl_VEN_extname 1
46 |  * ... define new types, if any
47 |  * ... define new tokens, if any
48 |  * ... define new APIs, if any
49 |  *
50 |  *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
51 |  *  This allows us to avoid having to decide whether to include GL headers or GLES here.
52 |  */
53 | 
54 | /* 
55 |  *  cl_khr_gl_event  extension
56 |  *  See section 9.9 in the OpenCL 1.1 spec for more information
57 |  */
58 | #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
59 | 
60 | extern CL_API_ENTRY cl_event CL_API_CALL
61 | clCreateEventFromGLsyncKHR(cl_context           /* context */,
62 |                            cl_GLsync            /* cl_GLsync */,
63 |                            cl_int *             /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
64 | 
65 | #ifdef __cplusplus
66 | }
67 | #endif
68 | 
69 | #endif	/* __OPENCL_CL_GL_EXT_H  */
70 | 


--------------------------------------------------------------------------------
/strided-access/CL/opencl.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 |  * Copyright (c) 2008-2010 The Khronos Group Inc.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and/or associated documentation files (the
 6 |  * "Materials"), to deal in the Materials without restriction, including
 7 |  * without limitation the rights to use, copy, modify, merge, publish,
 8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
 9 |  * permit persons to whom the Materials are furnished to do so, subject to
10 |  * the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included
13 |  * in all copies or substantial portions of the Materials.
14 |  *
15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
22 |  ******************************************************************************/
23 | 
24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
25 | 
26 | #ifndef __OPENCL_H
27 | #define __OPENCL_H
28 | 
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 | 
33 | #ifdef __APPLE__
34 | 
35 | #include <OpenCL/cl.h>
36 | #include <OpenCL/cl_gl.h>
37 | #include <OpenCL/cl_gl_ext.h>
38 | #include <OpenCL/cl_ext.h>
39 | 
40 | #else
41 | 
42 | #include <CL/cl.h>
43 | #include <CL/cl_gl.h>
44 | #include <CL/cl_gl_ext.h>
45 | #include <CL/cl_ext.h>
46 | 
47 | #endif
48 | 
49 | #ifdef __cplusplus
50 | }
51 | #endif
52 | 
53 | #endif  /* __OPENCL_H   */
54 | 
55 | 


--------------------------------------------------------------------------------
/strided-access/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016, Karl Rupp
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/strided-access/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | # HIP_PATH defaults to /opt/rocm when that path exists (?= keeps a value that is already set); hipcc is taken from its bin/.
 4 | CXXFLAGS += -std=c++11 -O3
 5 | # Build rule: the ifeq is resolved while make reads this file and selects either the compile command or the error as the recipe.
 6 | strided-access: benchmark-hip.cpp
 7 | ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
 8 | 	${HIPCC} ${CXXFLAGS} -o $@ $^ 
 9 | else
10 | 	$(error "Cannot find $(HIPCC), please install HIP toolkit")
11 | endif
12 | 
13 | .PHONY: clean
14 | # Remove the built binary and any object files in this directory.
15 | clean:
16 | 	rm -f strided-access *.o
17 | 


--------------------------------------------------------------------------------
/strided-access/README.txt:
--------------------------------------------------------------------------------
 1 | ###
 2 | ### README for measuring effective memory bandwidth for strided array access
 3 | ### by Karl Rupp
 4 | ### 
 5 | ### Supplements blog post:
 6 | ### https://www.karlrupp.net/2016/02/strided-memory-access-on-cpus-gpus-and-mic
 7 | ###
 8 | 
 9 | # License
10 | 
11 | The code is provided under a permissive MIT/X11-style license.
12 | See file LICENSE.txt for details.
13 | 
14 | The results and plotting scripts in folder results/ are provided under the
15 | Creative Commons Attribution 4.0 International (CC BY 4.0)
16 | license, see results/LICENSE.txt
17 | 
18 | 
19 | # Build
20 | 
21 | To build the executable, use (or adjust) one of the following commands to your environment:
22 | 
23 | HIP:
24 |  $> /opt/rocm/hip/bin/hipcc -std=c++11 -O3 -o hip benchmark-hip.cpp 
25 | 
26 | CUDA:
27 |  $> nvcc benchmark-cuda.cu -arch=sm_20 -I$VIENNACLPATH
28 | 
29 | OpenCL:
30 |  $> g++ benchmark-opencl.cpp -I. -lOpenCL -L/usr/local/cuda/lib64/
31 |  (If OpenCL is available system-wide, you may be able to omit the -L flag)
32 | 
33 | OpenMP:
34 |  $> g++ benchmark-openmp.cpp benchmark-openmp2.cpp -I. -O3 -fopenmp
35 | for CPUs or
36 |  $> icc benchmark-openmp.cpp benchmark-openmp2.cpp -O3 -fopenmp -mmic
37 | for Xeon Phi
38 | 
39 | 
40 | # Run
41 | 
42 | To run the respective benchmark, issue (use ./hip for the HIP command above, or ./strided-access when built via the Makefile)
43 |  $> ./a.out
44 | 
45 | 
46 | # Plot
47 | 
48 | Have a look at the results/ folder, where the data and gnuplot commands are located.
49 | Replot via
50 |  $> gnuplot plot.gnuplot
51 | (produces strided-access.eps)
52 | 
53 | Convert to .pdf via
54 |  $> epstopdf strided-access.eps
55 | and to .png using ImageMagick, e.g.
56 |  $> convert -density 300 strided-access.eps -resize 1150x strided-access.png
57 | 
58 | 
59 | 
60 | 


--------------------------------------------------------------------------------
/strided-access/benchmark-cuda.cu:
--------------------------------------------------------------------------------
 1 | //
 2 | // CUDA benchmark for measuring effective memory bandwidth for strided array access
 3 | //
 4 | // Author: Karl Rupp,  me@karlrupp.net
 5 | // License: MIT/X11 license, see file LICENSE.txt
 6 | //
 7 | 
 8 | #include <iostream>
 9 | #include <string>
10 | #include <vector>
11 | #include <sstream>
12 | #include <stdexcept>
13 | 
14 | #include <cuda_runtime.h>
15 | 
16 | #include "benchmark-utils.hpp"
17 | 
18 | // Throws std::runtime_error when cudaGetLastError() reports anything other than cudaSuccess.
19 | inline void cuda_last_error_check()
20 | {
21 |   const cudaError_t status = cudaGetLastError();
22 |   if (status == cudaSuccess)
23 |     return;
24 | 
25 |   // Message layout: numeric error code, then the runtime's description of it.
26 |   std::stringstream message;
27 |   message << "CUDA Runtime API error " << status << ": " << cudaGetErrorString( status ) << std::endl;
28 |   throw std::runtime_error(message.str());
29 | }
30 | 
31 | // Benchmark kernel: z[i*stride] = x[i*stride] + y[i*stride] for every i < size.
32 | // Each thread starts at its global index and advances by the total number of launched threads.
33 | template<typename NumericT>
34 | __global__ void elementwise_add(const NumericT * x,
35 |                                 const NumericT * y,
36 |                                       NumericT * z,
37 |                                 unsigned int stride,
38 |                                 unsigned int size)
39 | {
40 |   const unsigned int first = blockDim.x * blockIdx.x + threadIdx.x;
41 |   const unsigned int step  = gridDim.x * blockDim.x;
42 |   for (unsigned int k = first; k < size; k += step)
43 |     z[k*stride] = x[k*stride] + y[k*stride];
44 | }
45 | 
46 | // Benchmark driver: times elementwise_add for strides 1..32 and prints one result row per stride.
47 | int main(int argc, char **argv)
48 | {
49 |   typedef float       NumericT;
50 | 
51 |   cudaDeviceProp prop;
52 |   cudaError_t err = cudaGetDeviceProperties(&prop, 0); if (err != cudaSuccess) throw std::runtime_error("Failed to get CUDA device name");
53 |   std::cout << "# Using device: " << prop.name << std::endl;
54 | 
55 |   // Set up work vectors: N elements are touched per run; each buffer holds 32*N so the largest stride stays in bounds
56 |   std::size_t N =  1000000;
57 | 
58 |   // Device buffers are left uninitialized: results are never read back, only the run time is reported.
59 |   NumericT *x, *y, *z;
60 | 
61 |   err = cudaMalloc(&x, sizeof(NumericT) * 32 * N); if (err != cudaSuccess) throw std::runtime_error("Failed to allocate CUDA memory for x");
62 |   err = cudaMalloc(&y, sizeof(NumericT) * 32 * N); if (err != cudaSuccess) throw std::runtime_error("Failed to allocate CUDA memory for y");
63 |   err = cudaMalloc(&z, sizeof(NumericT) * 32 * N); if (err != cudaSuccess) throw std::runtime_error("Failed to allocate CUDA memory for z");
64 | 
65 | 
66 |   // Warmup calculation:
67 |   elementwise_add<<<256, 256>>>(x, y, z,
68 |                                 static_cast<unsigned int>(1),
69 |                                 static_cast<unsigned int>(N));
70 |   cuda_last_error_check();
71 | 
72 |   // Benchmark runs
73 |   Timer timer;
74 |   std::cout << "# stride     time       GB/sec" << std::endl;
75 |   for (std::size_t stride = 1; stride <= 32 ; ++stride)
76 |   {
77 |     cudaDeviceSynchronize();  // drain pending work so it is not charged to this stride
78 |     timer.start();
79 | 
80 |     // repeat calculation several times, then average
81 |     for (std::size_t num_runs = 0; num_runs < 20; ++num_runs)
82 |     {
83 |       elementwise_add<<<256, 256>>>(x, y, z,
84 |                                     static_cast<unsigned int>(stride),
85 |                                     static_cast<unsigned int>(N));
86 |       cuda_last_error_check();
87 |     }
88 |     cudaDeviceSynchronize();  // wait for all 20 launches to finish before reading the timer
89 |     double exec_time = timer.get();
90 |     // 20 runs, each touching 3 arrays (two read, one written) of N elements; assumes timer.get() returns seconds -- TODO confirm
91 |     std::cout << "   " << stride << "        " << exec_time << "        " << 20 * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl;
92 |   }
93 |   cudaFree(x); cudaFree(y); cudaFree(z);  // release the device buffers before exiting
94 |   return EXIT_SUCCESS;
95 | }
96 | 
97 | 


--------------------------------------------------------------------------------
/strided-access/benchmark-hip.cpp:
--------------------------------------------------------------------------------
 1 | //
 2 | // HIP benchmark for measuring effective memory bandwidth for strided array access
 3 | //
 4 | // Author: Karl Rupp,  me@karlrupp.net
 5 | // License: MIT/X11 license, see file LICENSE.txt
 6 | //
 7 | 
 8 | #include <iostream>
 9 | #include <string>
10 | #include <vector>
11 | #include <sstream>
12 | #include <stdexcept>
13 | 
14 | #include "hip/hip_runtime.h"
15 | 
16 | #include "benchmark-utils.hpp"
17 | 
18 | // Throws std::runtime_error when hipGetLastError() reports anything other than hipSuccess.
19 | inline void cuda_last_error_check()
20 | {
21 |   const hipError_t status = hipGetLastError();
22 |   if (status == hipSuccess)
23 |     return;
24 | 
25 |   // Message layout: numeric error code, then the runtime's description of it.
26 |   std::stringstream message;
27 |   message << "CUDA Runtime API error " << status << ": " << hipGetErrorString( status ) << std::endl;
28 |   throw std::runtime_error(message.str());
29 | }
30 | 
31 | // Benchmark kernel: z[i*stride] = x[i*stride] + y[i*stride] for every i < size.
32 | // Each thread starts at its global index and advances by the total number of launched threads.
33 | template<typename NumericT>
34 | __global__ void elementwise_add(const NumericT * x,
35 |                                 const NumericT * y,
36 |                                       NumericT * z,
37 |                                 unsigned int stride,
38 |                                 unsigned int size)
39 | {
40 |   const unsigned int first = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
41 |   const unsigned int step  = hipGridDim_x * hipBlockDim_x;
42 |   for (unsigned int k = first; k < size; k += step)
43 |     z[k*stride] = x[k*stride] + y[k*stride];
44 | }
45 | 
46 | // Benchmark driver: times elementwise_add for strides 1..32 and prints one result row per stride.
47 | int main(int argc, char **argv)
48 | {
49 |   typedef float       NumericT;
50 | 
51 |   hipDeviceProp_t prop;
52 |   hipError_t err = hipGetDeviceProperties(&prop, 0); if (err != hipSuccess) throw std::runtime_error("Failed to get CUDA device name");
53 |   std::cout << "# Using device: " << prop.name << std::endl;
54 | 
55 |   // Set up work vectors: N elements are touched per run; each buffer holds 32*N so the largest stride stays in bounds
56 |   std::size_t N =  1000000;
57 | 
58 |   // Device buffers are left uninitialized: results are never read back, only the run time is reported.
59 |   NumericT *x, *y, *z;
60 | 
61 |   err = hipMalloc(&x, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for x");
62 |   err = hipMalloc(&y, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for y");
63 |   err = hipMalloc(&z, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for z");
64 | 
65 | 
66 |   // Warmup calculation:
67 |   hipLaunchKernelGGL(elementwise_add<NumericT>, dim3(256), dim3(256), 0, 0, x, y, z,
68 |                                 static_cast<unsigned int>(1),
69 |                                 static_cast<unsigned int>(N));
70 |   cuda_last_error_check();
71 | 
72 |   // Benchmark runs: strides 1..32 (a stride of 0 would make every thread read and write element 0)
73 |   Timer timer;
74 |   std::cout << "# stride     time       GB/sec" << std::endl;
75 |   for (std::size_t stride = 1; stride <= 32 ; ++stride)
76 |   {
77 |     hipDeviceSynchronize();  // drain pending work so it is not charged to this stride
78 |     timer.start();
79 | 
80 |     // repeat calculation several times, then average
81 |     for (std::size_t num_runs = 0; num_runs < 20; ++num_runs)
82 |     {
83 |       hipLaunchKernelGGL(elementwise_add<NumericT>, dim3(256), dim3(256), 0, 0, x, y, z,
84 |                                     static_cast<unsigned int>(stride),
85 |                                     static_cast<unsigned int>(N));
86 |       cuda_last_error_check();
87 |     }
88 |     hipDeviceSynchronize();  // wait for all 20 launches to finish before reading the timer
89 |     double exec_time = timer.get();
90 |     // 20 runs, each touching 3 arrays (two read, one written) of N elements; assumes timer.get() returns seconds -- TODO confirm
91 |     std::cout << "   " << stride << "        " << exec_time << "        " << 20 * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl;
92 |   }
93 |   hipFree(x); hipFree(y); hipFree(z);  // release the device buffers before exiting
94 |   return EXIT_SUCCESS;
95 | }
96 | 
97 | 


--------------------------------------------------------------------------------
/strided-access/benchmark-hip.cu:
--------------------------------------------------------------------------------
 1 | //
 2 | // HIP benchmark for measuring effective memory bandwidth for strided array access
 3 | //
 4 | // Author: Karl Rupp,  me@karlrupp.net
 5 | // License: MIT/X11 license, see file LICENSE.txt
 6 | //
 7 | 
 8 | #include <iostream>
 9 | #include <string>
10 | #include <vector>
11 | #include <sstream>
12 | #include <stdexcept>
13 | 
14 | #include "hip/hip_runtime.h"
15 | 
16 | #include "benchmark-utils.hpp"
17 | 
18 | // Throws std::runtime_error when hipGetLastError() reports anything other than hipSuccess.
19 | inline void cuda_last_error_check()
20 | {
21 |   const hipError_t status = hipGetLastError();
22 |   if (status == hipSuccess)
23 |     return;
24 | 
25 |   // Message layout: numeric error code, then the runtime's description of it.
26 |   std::stringstream message;
27 |   message << "CUDA Runtime API error " << status << ": " << hipGetErrorString( status ) << std::endl;
28 |   throw std::runtime_error(message.str());
29 | }
30 | 
31 | // Benchmark kernel: z[i*stride] = x[i*stride] + y[i*stride] for every i < size.
32 | // The leading hipLaunchParm argument is part of the signature but is never read in the body.
33 | template<typename NumericT>
34 | __global__ void elementwise_add(hipLaunchParm lp,
35 |                                 const NumericT * x,
36 |                                 const NumericT * y,
37 |                                       NumericT * z,
38 |                                 unsigned int stride,
39 |                                 unsigned int size)
40 | {
41 |   // Each thread starts at its global index and advances by the total number of launched threads.
42 |   const unsigned int step = hipGridDim_x * hipBlockDim_x;
43 |   for (unsigned int k = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; k < size; k += step)
44 |     z[k*stride] = x[k*stride] + y[k*stride];
45 | }
46 | 
47 | // Benchmark driver: times elementwise_add for strides 1..32 and prints one result row per stride.
48 | int main(int argc, char **argv)
49 | {
50 |   typedef float       NumericT;
51 | 
52 |   hipDeviceProp_t prop;
53 |   hipError_t err = hipGetDeviceProperties(&prop, 0); if (err != hipSuccess) throw std::runtime_error("Failed to get CUDA device name");
54 |   std::cout << "# Using device: " << prop.name << std::endl;
55 | 
56 |   // Set up work vectors: N elements are touched per run; each buffer holds 32*N so the largest stride stays in bounds
57 |   std::size_t N =  1000000;
58 | 
59 |   // Device buffers are left uninitialized: results are never read back, only the run time is reported.
60 |   NumericT *x, *y, *z;
61 | 
62 |   err = hipMalloc(&x, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for x");
63 |   err = hipMalloc(&y, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for y");
64 |   err = hipMalloc(&z, sizeof(NumericT) * 32 * N); if (err != hipSuccess) throw std::runtime_error("Failed to allocate CUDA memory for z");
65 | 
66 | 
67 |   // Warmup calculation:
68 |   hipLaunchKernel(elementwise_add, dim3(256), dim3(256), 0, 0, x, y, z,
69 |                                 static_cast<unsigned int>(1),
70 |                                 static_cast<unsigned int>(N));
71 |   cuda_last_error_check();
72 | 
73 |   // Benchmark runs
74 |   Timer timer;
75 |   std::cout << "# stride     time       GB/sec" << std::endl;
76 |   for (std::size_t stride = 1; stride <= 32 ; ++stride)
77 |   {
78 |     hipDeviceSynchronize();  // drain pending work so it is not charged to this stride
79 |     timer.start();
80 | 
81 |     // repeat calculation several times, then average
82 |     for (std::size_t num_runs = 0; num_runs < 20; ++num_runs)
83 |     {
84 |       hipLaunchKernel(elementwise_add, dim3(256), dim3(256), 0, 0, x, y, z,
85 |                                     static_cast<unsigned int>(stride),
86 |                                     static_cast<unsigned int>(N));
87 |       cuda_last_error_check();
88 |     }
89 |     hipDeviceSynchronize();  // wait for all 20 launches to finish before reading the timer
90 |     double exec_time = timer.get();
91 |     // 20 runs, each touching 3 arrays (two read, one written) of N elements; assumes timer.get() returns seconds -- TODO confirm
92 |     std::cout << "   " << stride << "        " << exec_time << "        " << 20 * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl;
93 |   }
94 |   hipFree(x); hipFree(y); hipFree(z);  // release the device buffers before exiting
95 |   return EXIT_SUCCESS;
96 | }
97 | 
98 | 


--------------------------------------------------------------------------------
/strided-access/benchmark-opencl.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | // OpenCL benchmark for measuring effective memory bandwidth for strided array access
  3 | //
  4 | // Author: Karl Rupp,  me@karlrupp.net
  5 | // License: MIT/X11 license, see file LICENSE.txt
  6 | //
  7 | 
  8 | #include <iostream>
  9 | #include <string>
 10 | #include <vector>
 11 | #include <sstream>
 12 | 
 13 | #include "benchmark-utils.hpp"
 14 | 
 15 | #ifdef __APPLE__
 16 | #include <OpenCL/cl.h>
 17 | #else
 18 | #include <CL/cl.h>
 19 | #endif
 20 | 
 21 | // OpenCL error checking
 22 | #define ERROR_CHECKER_CASE(ERRORCODE)  case ERRORCODE: throw std::runtime_error("#ERRORCODE");
 23 | static void checkError(cl_int err)
 24 | {
 25 |   if (err != CL_SUCCESS)
 26 |   {
 27 |     switch (err)
 28 |     {
 29 |       ERROR_CHECKER_CASE(CL_DEVICE_NOT_FOUND);
 30 |       ERROR_CHECKER_CASE(CL_DEVICE_NOT_AVAILABLE);
 31 |       ERROR_CHECKER_CASE(CL_COMPILER_NOT_AVAILABLE);
 32 |       ERROR_CHECKER_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
 33 |       ERROR_CHECKER_CASE(CL_OUT_OF_RESOURCES);
 34 |       ERROR_CHECKER_CASE(CL_OUT_OF_HOST_MEMORY);
 35 |       ERROR_CHECKER_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
 36 |       ERROR_CHECKER_CASE(CL_MEM_COPY_OVERLAP);
 37 |       ERROR_CHECKER_CASE(CL_IMAGE_FORMAT_MISMATCH);
 38 |       ERROR_CHECKER_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
 39 |       ERROR_CHECKER_CASE(CL_BUILD_PROGRAM_FAILURE);
 40 |       ERROR_CHECKER_CASE(CL_MAP_FAILURE);
 41 | 
 42 |       ERROR_CHECKER_CASE(CL_INVALID_VALUE);
 43 |       ERROR_CHECKER_CASE(CL_INVALID_DEVICE_TYPE);
 44 |       ERROR_CHECKER_CASE(CL_INVALID_PLATFORM);
 45 |       ERROR_CHECKER_CASE(CL_INVALID_DEVICE);
 46 |       ERROR_CHECKER_CASE(CL_INVALID_CONTEXT);
 47 |       ERROR_CHECKER_CASE(CL_INVALID_QUEUE_PROPERTIES);
 48 |       ERROR_CHECKER_CASE(CL_INVALID_COMMAND_QUEUE);
 49 |       ERROR_CHECKER_CASE(CL_INVALID_HOST_PTR);
 50 |       ERROR_CHECKER_CASE(CL_INVALID_MEM_OBJECT);
 51 |       ERROR_CHECKER_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
 52 |       ERROR_CHECKER_CASE(CL_INVALID_IMAGE_SIZE);
 53 |       ERROR_CHECKER_CASE(CL_INVALID_SAMPLER);
 54 |       ERROR_CHECKER_CASE(CL_INVALID_BINARY);
 55 |       ERROR_CHECKER_CASE(CL_INVALID_BUILD_OPTIONS);
 56 |       ERROR_CHECKER_CASE(CL_INVALID_PROGRAM);
 57 |       ERROR_CHECKER_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
 58 |       ERROR_CHECKER_CASE(CL_INVALID_KERNEL_NAME);
 59 |       ERROR_CHECKER_CASE(CL_INVALID_KERNEL_DEFINITION);
 60 |       ERROR_CHECKER_CASE(CL_INVALID_KERNEL);
 61 |       ERROR_CHECKER_CASE(CL_INVALID_ARG_INDEX);
 62 |       ERROR_CHECKER_CASE(CL_INVALID_ARG_VALUE);
 63 |       ERROR_CHECKER_CASE(CL_INVALID_ARG_SIZE);
 64 |       ERROR_CHECKER_CASE(CL_INVALID_KERNEL_ARGS);
 65 |       ERROR_CHECKER_CASE(CL_INVALID_WORK_DIMENSION);
 66 |       ERROR_CHECKER_CASE(CL_INVALID_WORK_GROUP_SIZE);
 67 |       ERROR_CHECKER_CASE(CL_INVALID_WORK_ITEM_SIZE);
 68 |       ERROR_CHECKER_CASE(CL_INVALID_GLOBAL_OFFSET);
 69 |       ERROR_CHECKER_CASE(CL_INVALID_EVENT_WAIT_LIST);
 70 |       ERROR_CHECKER_CASE(CL_INVALID_EVENT);
 71 |       ERROR_CHECKER_CASE(CL_INVALID_OPERATION);
 72 |       ERROR_CHECKER_CASE(CL_INVALID_GL_OBJECT);
 73 |       ERROR_CHECKER_CASE(CL_INVALID_BUFFER_SIZE);
 74 |       ERROR_CHECKER_CASE(CL_INVALID_MIP_LEVEL);
 75 |       ERROR_CHECKER_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
 76 |         
 77 |       default: throw std::runtime_error("Unknown error. Maybe OpenCL SDK not properly installed?");
 78 |     }
 79 |   }
 80 | }
 81 | // Shorthand for checkError(err); note that the expansion already ends in ';'.
 82 | #define ERR_CHECK(err) checkError(err);
 83 | 
 84 | 
 85 | // OpenCL C source of the benchmark kernel, kept as a string literal.
 86 | // Kernel: z[i*stride] = x[i*stride] + y[i*stride] for i < size, starting at get_global_id(0) and stepping by get_global_size(0).
 87 | static const char * benchmark_program =
 88 | "__kernel void elementwise_add(\n"
 89 | "          __global const float * x,\n"
 90 | "          __global const float * y, \n"
 91 | "          __global float * z,\n"
 92 | "          unsigned int stride,\n"
 93 | "          unsigned int size) \n"
 94 | "{ \n"
 95 | "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
 96 | "    z[i*stride] = x[i*stride] + y[i*stride];\n"
 97 | "};\n";
 98 | 
 99 | /**
100 |  * Benchmark driver: selects an OpenCL platform and device (prompting on stdin
101 |  * when more than one is available), builds the elementwise_add kernel and
102 |  * prints the execution time and effective bandwidth for strides 1 to 32.
103 |  *
104 |  * Every OpenCL status code is handed to ERR_CHECK(). If the kernel fails to
105 |  * build, the build status, log and sources are printed and EXIT_FAILURE is
106 |  * returned.
107 |  *
108 |  * @param argc unused
109 |  * @param argv unused
110 |  * @return EXIT_SUCCESS after all strides were measured, EXIT_FAILURE if no
111 |  *         platform was found or the kernel could not be built.
112 |  */
113 | int main(int argc, char **argv)
114 | {
115 |   typedef float       NumericT;
116 | 
117 |   (void)argc; (void)argv;   // no command line options
118 | 
119 |   const cl_uint     max_entries = 42;   // capacity of the platform and device id arrays
120 |   const std::size_t num_repeats = 20;   // kernel launches timed per stride
121 | 
122 |   /////////////////////////// Part 1: Initialize OpenCL ///////////////////////////////////
123 | 
124 |   //
125 |   // Query platform:
126 |   //
127 |   cl_uint num_platforms = 0;
128 |   cl_platform_id platform_ids[max_entries];   //no more than 42 platforms supported...
129 |   cl_int err = clGetPlatformIDs(max_entries, platform_ids, &num_platforms); ERR_CHECK(err);
130 | 
131 |   std::cout << "# Platforms found: " << num_platforms << std::endl;
132 |   if (num_platforms == 0)
133 |   {
134 |     std::cerr << "No OpenCL platform found." << std::endl;
135 |     return EXIT_FAILURE;
136 |   }
137 | 
138 |   // num_platforms counts every available platform, but at most max_entries ids were stored
139 |   num_platforms = std::min(num_platforms, max_entries);
140 |   for (cl_uint i=0; i<num_platforms; ++i)
141 |   {
142 |     char buffer[1024];
143 |     err = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_VENDOR, sizeof(buffer), buffer, NULL); ERR_CHECK(err);
144 | 
145 |     std::stringstream ss;
146 |     ss << "# (" << i << ") " << buffer << ": ";
147 | 
148 |     err = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_VERSION, sizeof(buffer), buffer, NULL); ERR_CHECK(err);
149 | 
150 |     ss << buffer;
151 | 
152 |     std::cout << ss.str() << std::endl;
153 |   }
154 | 
155 |   std::size_t platform_index = 0;
156 |   if (num_platforms > 1)
157 |   {
158 |     std::cout << "# Enter platform index to use: ";
159 |     std::cin >> platform_index;
160 |     platform_index = std::min<std::size_t>(platform_index, num_platforms - 1);
161 |     std::cout << "#" << std::endl;
162 |   }
163 | 
164 |   //
165 |   // Query devices:
166 |   //
167 |   cl_device_id device_ids[max_entries];
168 |   cl_uint num_devices = 0;
169 |   err = clGetDeviceIDs(platform_ids[platform_index], CL_DEVICE_TYPE_ALL, max_entries, device_ids, &num_devices); ERR_CHECK(err);
170 |   std::cout << "# Devices found: " << num_devices << std::endl;
171 | 
172 |   // same clamping as for the platforms: never index past the ids that were stored
173 |   num_devices = std::min(num_devices, max_entries);
174 |   for (cl_uint i=0; i<num_devices; ++i)
175 |   {
176 |     char buffer[1024];
177 |     err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL); ERR_CHECK(err);
178 | 
179 |     std::cout << "# (" << i << ") " << buffer << std::endl;
180 |   }
181 | 
182 |   std::size_t device_index = 0;
183 |   if (num_devices > 1)
184 |   {
185 |     std::cout << "# Enter index of device to use: ";
186 |     std::cin >> device_index;
187 |     device_index = std::min<std::size_t>(device_index, num_devices - 1);
188 |     std::cout << "#" << std::endl;
189 |   }
190 | 
191 |   // now set up a context containing the selected device:
192 |   cl_context my_context = clCreateContext(0, 1, &(device_ids[device_index]), NULL, NULL, &err); ERR_CHECK(err);
193 | 
194 |   // create a command queue for the device:
195 |   cl_command_queue queue = clCreateCommandQueue(my_context, device_ids[device_index], 0, &err); ERR_CHECK(err);
196 | 
197 |   // compile the benchmark kernel from its source string:
198 |   cl_program my_program = clCreateProgramWithSource(my_context, 1, &benchmark_program, NULL, &err); ERR_CHECK(err);
199 |   err = clBuildProgram(my_program, 0, NULL, NULL, NULL, NULL);
200 |   if (err != CL_SUCCESS)
201 |   {
202 |     char buffer[8192];
203 |     cl_build_status status;
204 |     std::cout << "Build Scalar: Err = " << err;
205 |     err = clGetProgramBuildInfo(my_program, device_ids[device_index], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); ERR_CHECK(err);
206 |     err = clGetProgramBuildInfo(my_program, device_ids[device_index], CL_PROGRAM_BUILD_LOG,    sizeof(buffer), buffer, NULL); ERR_CHECK(err);
207 |     std::cout << " Status = " << status << std::endl;
208 |     std::cout << "Log: " << buffer << std::endl;
209 |     std::cout << "Sources: " << benchmark_program << std::endl;
210 | 
211 |     // without a successfully built program no kernel can be created: clean up and stop
212 |     clReleaseProgram(my_program);
213 |     clReleaseCommandQueue(queue);
214 |     clReleaseContext(my_context);
215 |     return EXIT_FAILURE;
216 |   }
217 |   cl_kernel my_kernel = clCreateKernel(my_program, "elementwise_add", &err); ERR_CHECK(err);
218 | 
219 |   /////////////////////////// Part 2: Run benchmark ///////////////////////////////////
220 | 
221 |   // Set up work vectors
222 |   cl_uint N = 1000000;
223 |   std::vector<NumericT> host_x(32*N);
224 |   cl_mem x = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 32*N*sizeof(NumericT), &(host_x[0]), &err); ERR_CHECK(err);
225 |   cl_mem y = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 32*N*sizeof(NumericT), &(host_x[0]), &err); ERR_CHECK(err);
226 |   cl_mem z = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 32*N*sizeof(NumericT), &(host_x[0]), &err); ERR_CHECK(err);
227 | 
228 |   // Warmup calculation:
229 |   size_t localsize = 256;
230 |   size_t globalsize = 256 * localsize;
231 |   cl_uint stride = 1;
232 |   err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), &x); ERR_CHECK(err);
233 |   err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), &y); ERR_CHECK(err);
234 |   err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), &z); ERR_CHECK(err);
235 |   err = clSetKernelArg(my_kernel, 3, sizeof(cl_uint), &stride); ERR_CHECK(err);
236 |   err = clSetKernelArg(my_kernel, 4, sizeof(cl_uint), &N); ERR_CHECK(err);
237 |   err = clEnqueueNDRangeKernel(queue, my_kernel, 1, NULL, &globalsize, &localsize, 0, NULL, NULL); ERR_CHECK(err);
238 | 
239 |   // Benchmark runs
240 |   Timer timer;
241 |   char device_name[1024];
242 |   err = clGetDeviceInfo(device_ids[device_index], CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); ERR_CHECK(err);
243 |   std::cout << "# Using device: " << device_name << std::endl;
244 |   std::cout << "# stride     time       GB/sec" << std::endl;
245 |   for (; stride <= 32; ++stride)
246 |   {
247 |     err = clFinish(queue); ERR_CHECK(err);
248 |     err = clSetKernelArg(my_kernel, 3, sizeof(cl_uint), &stride); ERR_CHECK(err);
249 | 
250 |     // time num_repeats back-to-back launches; the bandwidth below accounts for all of them
251 |     timer.start();
252 |     for (std::size_t run = 0; run < num_repeats; ++run)
253 |     {
254 |       err = clEnqueueNDRangeKernel(queue, my_kernel, 1, NULL, &globalsize, &localsize, 0, NULL, NULL); ERR_CHECK(err);
255 |     }
256 |     err = clFinish(queue); ERR_CHECK(err);
257 |     double exec_time = timer.get();
258 | 
259 |     std::cout << "     " << stride << "        " << exec_time << "        " <<  num_repeats * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl;
260 |   }
261 | 
262 |   // release all OpenCL objects created above
263 |   err = clReleaseMemObject(x); ERR_CHECK(err);
264 |   err = clReleaseMemObject(y); ERR_CHECK(err);
265 |   err = clReleaseMemObject(z); ERR_CHECK(err);
266 |   err = clReleaseKernel(my_kernel); ERR_CHECK(err);
267 |   err = clReleaseProgram(my_program); ERR_CHECK(err);
268 |   err = clReleaseCommandQueue(queue); ERR_CHECK(err);
269 |   err = clReleaseContext(my_context); ERR_CHECK(err);
270 | 
271 |   return EXIT_SUCCESS;
272 | }
273 | 
274 | 


--------------------------------------------------------------------------------
/strided-access/benchmark-openmp.cpp:
--------------------------------------------------------------------------------
 1 | //
 2 | // OpenMP benchmark for measuring effective memory bandwidth for strided array access
 3 | //
 4 | // Author: Karl Rupp,  me@karlrupp.net
 5 | // License: MIT/X11 license, see file LICENSE.txt
 6 | //
 7 | 
 8 | #include <iostream>
 9 | #include <string>
10 | #include <vector>
11 | #include <sstream>
12 | #include <stdexcept>
13 | #include <cstdlib>
14 | 
15 | #include "benchmark-utils.hpp"
16 | 
17 | typedef float       NumericT;
18 | 
19 | int kernel_func(NumericT *x, NumericT const *y, NumericT const *z, int stride, int N);
20 | 
21 | /**
22 |  * Benchmark driver: allocates three 64-byte aligned arrays of 32*N floats,
23 |  * initializes them in an OpenMP parallel loop and prints the execution time
24 |  * and effective bandwidth of kernel_func() for strides 1 to 32.
25 |  *
26 |  * @param argc unused
27 |  * @param argv unused
28 |  * @return EXIT_SUCCESS
29 |  * @throws std::runtime_error if one of the arrays cannot be allocated
30 |  */
31 | int main(int argc, char **argv)
32 | {
33 |   (void)argc; (void)argv;   // no command line options
34 | 
35 |   // slightly larger on CPU than on GPU so that arrays don't fit in cache
36 |   std::size_t N = 5000000;
37 |   const std::size_t max_stride  = 32;   // largest stride measured; the arrays hold max_stride*N entries
38 |   const std::size_t num_repeats = 20;   // kernel calls timed per stride
39 | 
40 |   // Note: Run only on a single NUMA domain. std::vector<T> has terrible first-touch semantics
41 |   NumericT *x; if (posix_memalign((void**)&x, 64, max_stride*N*sizeof(NumericT))) throw std::runtime_error("Failed to allocate x");
42 |   NumericT *y; if (posix_memalign((void**)&y, 64, max_stride*N*sizeof(NumericT))) throw std::runtime_error("Failed to allocate y");
43 |   NumericT *z; if (posix_memalign((void**)&z, 64, max_stride*N*sizeof(NumericT))) throw std::runtime_error("Failed to allocate z");
44 | 
45 |   #pragma omp parallel for
46 |   for (std::size_t i=0; i<max_stride*N; ++i)
47 |   {
48 |     x[i] = 1.0;
49 |     y[i] = 2.0;
50 |     z[i] = 3.0;
51 |   }
52 | 
53 |   // warmup:
54 |   kernel_func(x, y, z, 1, static_cast<int>(N));
55 | 
56 |   // Benchmark runs
57 |   Timer timer;
58 |   std::cout << "# stride     time       GB/sec" << std::endl;
59 |   for (std::size_t stride = 1; stride <= max_stride; ++stride)
60 |   {
61 |     timer.start();
62 | 
63 |     // time num_repeats back-to-back calls; the bandwidth below accounts for all of them
64 |     for (std::size_t run = 0; run < num_repeats; ++run)
65 |     {
66 |       kernel_func(x, y, z, static_cast<int>(stride), static_cast<int>(N));
67 |     }
68 |     double exec_time = timer.get();
69 | 
70 |     std::cout << "   " << stride << "        " << exec_time << "        " << num_repeats * 3.0 * sizeof(NumericT) * N / exec_time * 1e-9 << std::endl;
71 |   }
72 | 
73 |   // memory obtained from posix_memalign() has to be returned with free()
74 |   free(x);
75 |   free(y);
76 |   free(z);
77 | 
78 |   return EXIT_SUCCESS;
79 | }
80 | 
81 | 


--------------------------------------------------------------------------------
/strided-access/benchmark-openmp2.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | typedef float       NumericT;
 3 | 
 4 | /**
 5 |  * Computes x[i*stride] = y[i*stride] + z[i*stride] for i = 0, ..., N-1 in an
 6 |  * OpenMP parallel loop.
 7 |  *
 8 |  * @param x       output array, needs at least (N-1)*stride+1 entries
 9 |  * @param y       first input array, same minimum size as x
10 |  * @param z       second input array, same minimum size as x
11 |  * @param stride  distance in elements between two consecutive accesses
12 |  * @param N       number of elements to process
13 |  * @return always 0
14 |  */
15 | int kernel_func(NumericT *x, NumericT const *y, NumericT const *z, int stride, int N)
16 | {
17 |   if (stride == 1)
18 |   {
19 |     // dedicated unit-stride loop without the index multiplication
20 |     #pragma omp parallel for
21 |     for (int i=0; i<N; ++i)
22 |       x[i] = y[i] + z[i];
23 |   }
24 |   else
25 |   {
26 |     #pragma omp parallel for
27 |     for (int i=0; i<N; ++i)
28 |       x[i*stride] = y[i*stride] + z[i*stride];
29 |   }
30 | 
31 |   // the function is declared to return int; flowing off its end would be undefined behavior
32 |   return 0;
33 | }
34 | 


--------------------------------------------------------------------------------
/strided-access/benchmark-utils.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef BENCHMARK_UTILS_HPP_
 2 | #define BENCHMARK_UTILS_HPP_
 3 | 
 4 | /*
 5 | * Copyright (c) 2016, Karl Rupp
 6 | *
 7 | * License: MIT/X11, see file LICENSE.txt
 8 | */
 9 | 
10 | #include <stdexcept>
11 | 
12 | #ifdef _WIN32
13 | 
14 | #define WIN32_LEAN_AND_MEAN   // the macro <windows.h> actually checks to leave out rarely used headers
15 | #include <windows.h>
16 | #undef min
17 | #undef max
18 | // Stopwatch based on the Windows performance counter; get() returns the seconds since start().
19 | class Timer
20 | {
21 | public:
22 |   // Queries the counter frequency once; the reference point is zero until start() is called.
23 |   Timer() { QueryPerformanceFrequency(&freq); start_time.QuadPart = 0; }
24 |   // Stores the current counter value as the reference point.
25 |   void start() { QueryPerformanceCounter(&start_time); }
26 |   // Returns the seconds elapsed since the reference point.
27 |   double get() const
28 |   {
29 |     LARGE_INTEGER  end_time;
30 |     QueryPerformanceCounter(&end_time);
31 |     return (static_cast<double>(end_time.QuadPart) - static_cast<double>(start_time.QuadPart)) / static_cast<double>(freq.QuadPart);
32 |   }
33 | 
34 | private:
35 |   LARGE_INTEGER freq;         // counter ticks per second
36 |   LARGE_INTEGER start_time;   // counter value stored by start()
37 | };
38 | 
39 | #else
40 | 
41 | #include <sys/time.h>
42 | // Wall-clock stopwatch built on gettimeofday(); get() returns the seconds since start().
43 | class Timer
44 | {
45 | public:
46 |   // The reference point is zero until start() is called.
47 |   Timer() : ts(0)
48 |   {}
49 | 
50 |   // Stores the current time as the reference point.
51 |   void start() { ts = now_usec(); }
52 | 
53 |   // Returns the seconds elapsed since the reference point.
54 |   double get() const { return (now_usec() - ts) / 1000000.0; }
55 | 
56 | private:
57 |   // Current time in microseconds. The conversion to double happens before the
58 |   // multiplication, so the product cannot overflow a 32-bit tv_sec.
59 |   static double now_usec()
60 |   {
61 |     struct timeval tval;
62 |     gettimeofday(&tval, NULL);
63 |     return static_cast<double>(tval.tv_sec) * 1000000.0 + static_cast<double>(tval.tv_usec);
64 |   }
65 | 
66 |   double ts;   // reference point in microseconds
67 | };
68 | 
69 | 
70 | 
71 | #endif
72 | 
73 | #endif
74 | 


--------------------------------------------------------------------------------
/strided-access/results/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | All data in this archive is available under the following creative commons license:
 2 | 
 3 |     Attribution 4.0 International (CC BY 4.0)
 4 |     http://creativecommons.org/licenses/by/4.0/
 5 | 
 6 | You are free to:
 7 | 
 8 |     Share — copy and redistribute the material in any medium or format
 9 |     Adapt — remix, transform, and build upon the material
10 |     for any purpose, even commercially.
11 | 
12 |     The licensor cannot revoke these freedoms as long as you follow the license terms.
13 | 
14 | Under the following terms:
15 | 
16 |     Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
17 | 
18 |     No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
19 | 
20 | Notices:
21 | 
22 |     You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
23 |     No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
24 | 
25 | 


--------------------------------------------------------------------------------
/strided-access/results/k20m.txt:
--------------------------------------------------------------------------------
 1 | # Using device: Tesla K20m
 2 | # stride     time       GB/sec
 3 |    1        0.00186        129.032
 4 |    2        0.005073        47.3093
 5 |    3        0.006904        34.7625
 6 |    4        0.009935        24.157
 7 |    5        0.012335        19.4568
 8 |    6        0.013922        17.2389
 9 |    7        0.017785        13.4945
10 |    8        0.018577        12.9192
11 |    9        0.019592        12.2499
12 |    10        0.01994        12.0361
13 |    11        0.020641        11.6273
14 |    12        0.02157        11.1266
15 |    13        0.022427        10.7014
16 |    14        0.02317        10.3582
17 |    15        0.024805        9.67547
18 |    16        0.025235        9.5106
19 |    17        0.026862        8.93455
20 |    18        0.028229        8.5019
21 |    19        0.029781        8.05883
22 |    20        0.032167        7.46106
23 |    21        0.032969        7.27957
24 |    22        0.034717        6.91304
25 |    23        0.036391        6.59504
26 |    24        0.037984        6.31845
27 |    25        0.040361        5.94633
28 |    26        0.041379        5.80004
29 |    27        0.043162        5.56045
30 |    28        0.044556        5.38648
31 |    29        0.046384        5.1742
32 |    30        0.048085        4.99116
33 |    31        0.049665        4.83238
34 |    32        0.050811        4.72339
35 | 
36 | 


--------------------------------------------------------------------------------
/strided-access/results/plot.gnuplot:
--------------------------------------------------------------------------------
 1 | set terminal postscript enhanced color eps
 2 | # User-defined line styles; the plot command below references styles 1 to 4 only.
 3 | set style data lines
 4 | set style line 1  linetype -1 linewidth 3 lc rgb "#AA0000"
 5 | set style line 2  linetype -1 linewidth 3 lc rgb "#0000AA"
 6 | set style line 3  linetype -1 linewidth 3 lc rgb "#000000"
 7 | set style line 4  linetype -1 linewidth 3 lc rgb "#00AA00"
 8 | set style line 5  linetype  2 linewidth 3 lc rgb "#00AA00"
 9 | set style line 6  linetype -1 linewidth 3 lc rgb "#00AA00"
10 | set style line 7  linetype  2 linewidth 3 lc rgb "#000000"
11 | set style line 8  linetype -1 linewidth 3 lc rgb "#000000"
12 | set style increment user
13 | # Canvas size and border width.
14 | set size 0.75,0.75
15 | #set size ratio 0.66
16 | set border lw 2
17 | # Legend, grid and axes: logarithmic y axis, x axis restricted to strides 1 to 16.
18 | set key top right Right
19 | set grid
20 | set logscale y
21 | set xrange [1:16]
22 | 
23 | #######
24 | # Output file, labels and curves; each data file is plotted as column 1 (stride) against column 3 (GB/sec).
25 | set output "strided-access.eps"
26 | set title "Memory Bandwidth for Strided Array Access\n{/*0.7 x[i*stride] = y[i*stride] + z[i*stride]}"
27 | set ylabel "Memory Bandwidth (GB/sec)"
28 | set xlabel "Stride (4 Bytes per Element)"
29 | plot 'w9100.txt'           using 1:3 with linesp ls 1 pt  5 ps 1.5 title "AMD FirePro W9100", \
30 |      'xeon-e5-2670v3.txt'  using 1:3 with linesp ls 2 pt  7 ps 1.5 title "1x INTEL Xeon E5-2670v3", \
31 |      'xeon-phi-7120.txt'   using 1:3 with linesp ls 3 pt  9 ps 2 title "INTEL Xeon Phi 7120", \
32 |      'k20m.txt'            using 1:3 with linesp ls 4 pt 11 ps 2   title "NVIDIA Tesla K20m"
33 |      
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/strided-access/results/strided-access.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/strided-access/results/strided-access.pdf


--------------------------------------------------------------------------------
/strided-access/results/strided-access.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/HIP-Examples/cdf9d101acd9a3fc89ee750f73c1f1958cbd5cc3/strided-access/results/strided-access.png


--------------------------------------------------------------------------------
/strided-access/results/w9100.txt:
--------------------------------------------------------------------------------
 1 | # Platforms found: 1
 2 | # (0) Advanced Micro Devices, Inc.: OpenCL 1.2 AMD-APP (1573.4)
 3 | # Devices found: 2
 4 | # (0) Hawaii
 5 | # (1) AMD Phenom(tm) II X4 955 Processor
 6 | # Enter index of device to use: 0
 7 | #
 8 | # Using device: Hawaii
 9 | # stride     time       GB/sec
10 |      1        0.00109        220.183
11 |      2        0.002025        118.519
12 |      3        0.003026        79.3126
13 |      4        0.003911        61.3654
14 |      5        0.00515        46.6019
15 |      6        0.006224        38.5604
16 |      7        0.007256        33.0761
17 |      8        0.007685        31.2297
18 |      9        0.009201        26.0841
19 |      10        0.010038        23.9091
20 |      11        0.010782        22.2593
21 |      12        0.011546        20.7864
22 |      13        0.012666        18.9484
23 |      14        0.013456        17.8359
24 |      15        0.014238        16.8563
25 |      16        0.014342        16.7341
26 |      17        0.015833        15.1582
27 |      18        0.016072        14.9328
28 |      19        0.016318        14.7077
29 |      20        0.016113        14.8948
30 |      21        0.016611        14.4483
31 |      22        0.016743        14.3343
32 |      23        0.016828        14.2619
33 |      24        0.016698        14.373
34 |      25        0.01691        14.1928
35 |      26        0.017249        13.9139
36 |      27        0.017067        14.0622
37 |      28        0.016928        14.1777
38 |      29        0.017321        13.856
39 |      30        0.016969        14.1434
40 |      31        0.016771        14.3104
41 |      32        0.015675        15.311
42 | 
43 | 


--------------------------------------------------------------------------------
/strided-access/results/xeon-e5-2670v3.txt:
--------------------------------------------------------------------------------
 1 | # stride     time       GB/sec
 2 |    1        0.042428        28.2832
 3 |    2        0.120377        9.96868
 4 |    3        0.168107        7.13831
 5 |    4        0.235153        5.10306
 6 |    5        0.311109        3.85717
 7 |    6        0.355833        3.37237
 8 |    7        0.431296        2.78231
 9 |    8        0.502569        2.38773
10 |    9        0.549516        2.18374
11 |    10        0.624792        1.92064
12 |    11        0.673498        1.78174
13 |    12        0.746097        1.60837
14 |    13        0.820617        1.46231
15 |    14        0.861678        1.39263
16 |    15        0.936427        1.28147
17 |    16        1.00677        1.19193
18 |    17        1.03629        1.15798
19 |    18        1.09355        1.09734
20 |    19        1.12504        1.06663
21 |    20        1.17261        1.02336
22 |    21        1.22663        0.978291
23 |    22        1.249        0.960766
24 |    23        1.3017        0.921874
25 |    24        1.35292        0.88697
26 |    25        1.37572        0.872272
27 |    26        1.41969        0.845256
28 |    27        1.45273        0.826029
29 |    28        1.47944        0.811119
30 |    29        1.52076        0.789082
31 |    30        1.50888        0.795293
32 |    31        1.53889        0.779783
33 |    32        1.67142        0.71795
34 | 
35 | 


--------------------------------------------------------------------------------
/strided-access/results/xeon-phi-7120.txt:
--------------------------------------------------------------------------------
 1 | # stride     time       GB/sec
 2 |    1        0.013312        90.1442
 3 |    2        0.033645        35.6665
 4 |    3        0.049236        24.3724
 5 |    4        0.055954        21.4462
 6 |    5        0.063018        19.0422
 7 |    6        0.073359        16.3579
 8 |    7        0.083043        14.4503
 9 |    8        0.092531        12.9686
10 |    9        0.103568        11.5866
11 |    10        0.116298        10.3183
12 |    11        0.128292        9.35366
13 |    12        0.139058        8.62949
14 |    13        0.150226        7.98796
15 |    14        0.161199        7.44421
16 |    15        0.170664        7.03136
17 |    16        0.180439        6.65045
18 |    17        0.18679        6.42433
19 |    18        0.193852        6.19029
20 |    19        0.200235        5.99296
21 |    20        0.207103        5.79422
22 |    21        0.211277        5.67975
23 |    22        0.217728        5.51146
24 |    23        0.223153        5.37748
25 |    24        0.231584        5.18171
26 |    25        0.23512        5.10378
27 |    26        0.241932        4.96007
28 |    27        0.24567        4.8846
29 |    28        0.251481        4.77173
30 |    29        0.256766        4.67352
31 |    30        0.261103        4.59589
32 |    31        0.268795        4.46437
33 |    32        0.270861        4.43032
34 | 
35 | 


--------------------------------------------------------------------------------
/test_all.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | : ${HIP_PLATFORM:="hcc"}
 4 | 
 5 | # vector_add
 6 | echo
 7 | echo "==== vectorAdd ===="
 8 | cd vectorAdd
 9 | make clean
10 | make
11 | cd ..
12 | 
13 | # gpu-burn
14 | echo
15 | echo "==== gpu-burn ===="
16 | cd gpu-burn
17 | make clean
18 | make
19 | ./build/gpuburn-hip -t 5
20 | cd ..
21 | 
22 | # strided-access
23 | echo
24 | echo "==== strided-access ===="
25 | cd strided-access
26 | make clean
27 | make
28 | ./strided-access
29 | cd ..
30 | 
31 | 
32 | # rtm8
33 | echo
34 | echo "==== rtm8 ===="
35 | cd rtm8
36 | ./build_hip.sh
37 | ./rtm8_hip
38 | cd ..
39 | 
40 | # reduction
41 | echo
42 | echo "==== reduction ===="
43 | cd reduction
44 | make clean
45 | make
46 | ./run.sh
47 | cd ..
48 | 
49 | # mini-nbody
50 | echo
51 | echo "==== mini-nbody ===="
52 | cd mini-nbody/hip
53 | ./HIP-nbody-orig.sh
54 | ./HIP-nbody-soa.sh
55 | ./HIP-nbody-block.sh
56 | cd ../..
57 | 
58 | # add4
59 | echo
60 | echo "==== add4 ===="
61 | cd add4
62 | ./buildit.sh
63 | ./runhip.sh
64 | cd ..
65 | 
66 | # cuda-stream
67 | echo
68 | echo "==== cuda-stream ===="
69 | cd cuda-stream
70 | make clean
71 | make
72 | ./stream
73 | cd ..
74 | 
75 | # openmp-helloworld
76 | echo
77 | echo "==== OpenMP Hello World ===="
78 | cd openmp-helloworld
79 | mkdir -p build
80 | cd build
81 | cmake ..
82 | make
83 | ./test_openmp_helloworld
84 | cd ../..
85 | 
86 | 


--------------------------------------------------------------------------------
/vectorAdd/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_PATH?= $(wildcard /opt/rocm)
 2 | HIPCC=$(HIP_PATH)/bin/hipcc
 3 | # Example sources and the object files derived from them.
 4 | SOURCES = vectoradd_hip.cpp 
 5 | OBJECTS = $(SOURCES:.cpp=.o)
 6 | 
 7 | EXECUTABLE=./vectoradd_hip.exe
 8 | 
 9 | # None of these targets names a file, so make must never consider them up to date.
10 | .PHONY: all test clean
11 | 
12 | all: $(EXECUTABLE) test
13 | 
14 | CXXFLAGS =-g
15 | 
16 | # The built-in .cpp -> .o rule compiles with $(CXX), i.e. with hipcc.
17 | CXX=$(HIPCC)
18 | 
19 | $(EXECUTABLE): $(OBJECTS) 
20 | 	$(HIPCC) $(OBJECTS) -o $@
21 | 
22 | # Runs the freshly built example.
23 | test: $(EXECUTABLE)
24 | 	$(EXECUTABLE)
25 | 
26 | # NOTE(review): the last rm targets the HIP installation, not this directory - confirm it is still needed.
27 | clean:
28 | 	rm -f $(EXECUTABLE)
29 | 	rm -f $(OBJECTS)
30 | 	rm -f $(HIP_PATH)/src/*.o
31 | 


--------------------------------------------------------------------------------
/vectorAdd/README:
--------------------------------------------------------------------------------
1 | Simple vectorAdd example written directly to the HIP interface.
2 | 


--------------------------------------------------------------------------------
/vectorAdd/vectoradd_hip.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
  3 | 
  4 | Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | of this software and associated documentation files (the "Software"), to deal
  6 | in the Software without restriction, including without limitation the rights
  7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | copies of the Software, and to permit persons to whom the Software is
  9 | furnished to do so, subject to the following conditions:
 10 | 
 11 | The above copyright notice and this permission notice shall be included in
 12 | all copies or substantial portions of the Software.
 13 | 
 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | THE SOFTWARE.
 21 | */
 22 | #include <assert.h>
 23 | #include <stdio.h>
 24 | #include <algorithm>
 25 | #include <stdlib.h>
 26 | #include<iostream>
 27 | #include "hip/hip_runtime.h"
 28 | 
 29 | // HIP_ASSERT(x) evaluates the HIP call x and, unless NDEBUG is defined, asserts that it returned hipSuccess.
 30 | #ifdef NDEBUG
 31 | #define HIP_ASSERT(x) x
 32 | #else
 33 | #define HIP_ASSERT(x) (assert((x)==hipSuccess))
 34 | #endif
 35 | 
 36 | // Problem size: NUM floats, indexed by the kernel as a WIDTH x HEIGHT grid.
 37 | #define WIDTH     1024
 38 | #define HEIGHT    1024
 39 | 
 40 | #define NUM       (WIDTH*HEIGHT)
 41 | // Thread block dimensions; main() uses the X and Y values for the kernel launch.
 42 | #define THREADS_PER_BLOCK_X  16
 43 | #define THREADS_PER_BLOCK_Y  16
 44 | #define THREADS_PER_BLOCK_Z  1
 45 | // Element-wise a = b + c; each thread handles element (x, y) of a row-major width x height grid.
 46 | __global__ void 
 47 | vectoradd_float(float* __restrict__ a, const float* __restrict__ b, const float* __restrict__ c, int width, int height) 
 48 | {
 49 |   // Global 2D coordinates of this thread.
 50 |   const int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 51 |   const int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 52 | 
 53 |   // Guard each coordinate separately. Testing only the flattened index lets
 54 |   // threads with x >= width pass whenever the launch grid is wider than the
 55 |   // data; they would then redundantly recompute an element of a later row
 56 |   // that another thread already owns.
 57 |   if (x < width && y < height) {
 58 |     const int i = y * width + x;
 59 |     a[i] = b[i] + c[i];
 60 |   }
 61 | }
 62 | // Disabled draft of the kernel: the #if 0 below keeps it out of the build.
 63 | #if 0
 64 | __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {
 65 | 
 66 |   
 67 |   int x = blockDimX * blockIdx.x + threadIdx.x;
 68 |   int y = blockDimY * blockIdy.y + threadIdx.y;
 69 | 
 70 |   int i = y * width + x;
 71 |   if ( i < (width * height)) {
 72 |     a[i] = b[i] + c[i];
 73 |   }
 74 | }
 75 | #endif
 76 | 
 77 | using namespace std;
 78 | 
 79 | /**
 80 |  * Adds two vectors of NUM floats on the device with vectoradd_float and checks
 81 |  * the result against a host-side sum.
 82 |  *
 83 |  * @return EXIT_SUCCESS if every element matches, EXIT_FAILURE if any element
 84 |  *         differs or the host buffers cannot be allocated.
 85 |  */
 86 | int main() {
 87 | 
 88 |   float* hostA;
 89 |   float* hostB;
 90 |   float* hostC;
 91 | 
 92 |   float* deviceA;
 93 |   float* deviceB;
 94 |   float* deviceC;
 95 | 
 96 |   hipDeviceProp_t devProp;
 97 |   HIP_ASSERT(hipGetDeviceProperties(&devProp, 0));
 98 |   cout << " System minor " << devProp.minor << endl;
 99 |   cout << " System major " << devProp.major << endl;
100 |   cout << " agent prop name " << devProp.name << endl;
101 | 
102 |   cout << "hip Device prop succeeded " << endl ;
103 | 
104 |   int i;
105 |   int errors;
106 | 
107 |   hostA = (float*)malloc(NUM * sizeof(float));
108 |   hostB = (float*)malloc(NUM * sizeof(float));
109 |   hostC = (float*)malloc(NUM * sizeof(float));
110 |   if (hostA == NULL || hostB == NULL || hostC == NULL) {
111 |     printf("FAILED: could not allocate host memory\n");
112 |     free(hostA);
113 |     free(hostB);
114 |     free(hostC);
115 |     return EXIT_FAILURE;
116 |   }
117 | 
118 |   // initialize the input data
119 |   for (i = 0; i < NUM; i++) {
120 |     hostB[i] = (float)i;
121 |     hostC[i] = (float)i*100.0f;
122 |   }
123 | 
124 |   HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(float)));
125 |   HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(float)));
126 |   HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(float)));
127 | 
128 |   HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(float), hipMemcpyHostToDevice));
129 |   HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(float), hipMemcpyHostToDevice));
130 | 
131 |   // one thread per element: a grid of 16x16 blocks covering WIDTH x HEIGHT
132 |   hipLaunchKernelGGL(vectoradd_float, 
133 |                   dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
134 |                   dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
135 |                   0, 0,
136 |                   deviceA ,deviceB ,deviceC ,WIDTH ,HEIGHT);
137 |   // the launch itself returns no status; a rejected launch is reported by hipGetLastError()
138 |   HIP_ASSERT(hipGetLastError());
139 | 
140 |   HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(float), hipMemcpyDeviceToHost));
141 | 
142 |   // verify the results
143 |   errors = 0;
144 |   for (i = 0; i < NUM; i++) {
145 |     if (hostA[i] != (hostB[i] + hostC[i])) {
146 |       errors++;
147 |     }
148 |   }
149 |   if (errors!=0) {
150 |     printf("FAILED: %d errors\n",errors);
151 |   } else {
152 |       printf ("PASSED!\n");
153 |   }
154 | 
155 |   HIP_ASSERT(hipFree(deviceA));
156 |   HIP_ASSERT(hipFree(deviceB));
157 |   HIP_ASSERT(hipFree(deviceC));
158 | 
159 |   free(hostA);
160 |   free(hostB);
161 |   free(hostC);
162 | 
163 |   //hipResetDefaultAccelerator();
164 | 
165 |   // Do not return the raw error count: only the low 8 bits of an exit status
166 |   // reach the parent process, so NUM = 1048576 mismatches would read as success.
167 |   return errors == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
168 | }
169 | 


--------------------------------------------------------------------------------