├── .gitignore ├── MD ├── MD.cpp ├── MD.hpp └── CMakeLists.txt ├── SPMV ├── SPMV.h ├── SPMV.cpp └── CMakeLists.txt ├── HCFFT ├── FFT.cpp ├── FFT.hpp └── CMakeLists.txt ├── include └── SDKUtil │ ├── HCUtil.hpp │ └── SDKUtil.hpp ├── ArrayBandwidth ├── ArrayBandwidth.cpp ├── ArrayBandwidth.hpp └── CMakeLists.txt ├── SyncVsAsyncArrayCopy ├── SyncVsAsyncArrayCopy.cpp ├── SyncVsAsyncArrayCopy.hpp └── CMakeLists.txt ├── README.md ├── BitonicSort-CL-from-HCC ├── embed_hsaco.sh ├── Makefile ├── hsa_utils.hpp ├── README.md ├── BitonicSort_hcc.hpp ├── BitonicSort_Kernels.cl ├── hsa_utils.cpp └── BitonicSort_hcc.cpp └── CMakeLists.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | *.o 3 | *.exe 4 | *.swp 5 | *.Po 6 | *~ 7 | 8 | -------------------------------------------------------------------------------- /MD/MD.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/MD/MD.cpp -------------------------------------------------------------------------------- /MD/MD.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/MD/MD.hpp -------------------------------------------------------------------------------- /SPMV/SPMV.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/SPMV/SPMV.h -------------------------------------------------------------------------------- /HCFFT/FFT.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/HCFFT/FFT.cpp -------------------------------------------------------------------------------- /HCFFT/FFT.hpp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/HCFFT/FFT.hpp -------------------------------------------------------------------------------- /SPMV/SPMV.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/SPMV/SPMV.cpp -------------------------------------------------------------------------------- /include/SDKUtil/HCUtil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/include/SDKUtil/HCUtil.hpp -------------------------------------------------------------------------------- /include/SDKUtil/SDKUtil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/include/SDKUtil/SDKUtil.hpp -------------------------------------------------------------------------------- /ArrayBandwidth/ArrayBandwidth.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/ArrayBandwidth/ArrayBandwidth.cpp -------------------------------------------------------------------------------- /ArrayBandwidth/ArrayBandwidth.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/ArrayBandwidth/ArrayBandwidth.hpp -------------------------------------------------------------------------------- /SyncVsAsyncArrayCopy/SyncVsAsyncArrayCopy.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/SyncVsAsyncArrayCopy/SyncVsAsyncArrayCopy.cpp -------------------------------------------------------------------------------- /SyncVsAsyncArrayCopy/SyncVsAsyncArrayCopy.hpp: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/HCC-Example-Application/HEAD/SyncVsAsyncArrayCopy/SyncVsAsyncArrayCopy.hpp -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Getting Started ## 3 | 4 | First, follow the instructions to install [ROCm](https://github.com/RadeonOpenCompute/ROCm), which provides the HCC compiler, the ROCr runtime and the toolchains to compile the samples. 5 | 6 | ### How to Build the Samples ### 7 | 8 | #### HC C++ samples #### 9 | 10 | 1. Make sure the HCC compiler is in your path. The default installation of HCC is /opt/rocm/bin. 11 | 2. In the HCC-Example-Applications directory, create a build directory. 12 | 3. Go into the build directory, then type `CXX=hcc cmake ..` to generate the makefiles. 13 | 4. Type `make` to compile the samples. 14 | -------------------------------------------------------------------------------- /BitonicSort-CL-from-HCC/embed_hsaco.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This shell script converts an .hsaco file into a C++ file containing a "serialized" 3 | # version of the hsaco file which is accessible as a global string. 4 | # The serialized version contains all of the sections defined in the hsaco file, 5 | # including the code and symbols. 6 | # The resulting .cpp file can be compiled with a C compiler (gcc) and linked into 7 | # an application, and the .hsaco file accessed with the global string.
8 | #usage embed_hsaco.sh INFILE OUTFILE HSACO_SYMBOL 9 | 10 | HSACO_INFILE=$1 11 | HSACO_OUTFILE=$2 12 | SYMBOLNAME=$(basename -s .hsaco $1) 13 | 14 | echo "#include " > $HSACO_OUTFILE 15 | echo "char _${SYMBOLNAME}_HSA_CodeObjMem[] = {" >> $HSACO_OUTFILE 16 | hexdump -v -e '"0x" 1/1 "%02X" ","' $HSACO_INFILE >> $HSACO_OUTFILE 17 | echo "};" >> $HSACO_OUTFILE 18 | echo "size_t _${SYMBOLNAME}_HSA_CodeObjMemSz = sizeof(_${SYMBOLNAME}_HSA_CodeObjMem);" >> $HSACO_OUTFILE 19 | -------------------------------------------------------------------------------- /BitonicSort-CL-from-HCC/Makefile: -------------------------------------------------------------------------------- 1 | ROCM_PATH?=/opt/rocm 2 | 3 | OPT=-O3 4 | CXXFLAGS=-Wall 5 | 6 | HSACO= BitonicSort_Kernels.hsaco 7 | EMBEDDED_HSACO= BitonicSort_Kernels.hsaco.o 8 | 9 | TARGET0 = BitonicSort_hcc 10 | SOURCES0 = BitonicSort_hcc.cpp hsa_utils.cpp 11 | OBJECTS0=${SOURCES0:.cpp=.o} 12 | 13 | 14 | 15 | $(TARGET0): $(OBJECTS0) $(HSACO) $(EMBEDDED_HSACO) 16 | hcc `hcc-config --ldflags` -lhc_am $(OPT) -L${ROCM_PATH}/lib -lhsa-runtime64 $(OBJECTS0) $(EMBEDDED_HSACO) -o $@ 17 | 18 | clean: 19 | rm -rf *.o $(HSACO) $(HSACO).cpp 20 | rm -rf $(TARGET0) 21 | rm -rf $(TARGET1) 22 | 23 | .cpp.o: 24 | hcc `hcc-config --cxxflags` -I$(ROCM_PATH)/include -c $(CXXFLAGS) $(OPT) $< -o $@ 25 | 26 | 27 | %.hsaco : %.cl 28 | $(ROCM_PATH)/cloc/bin/cloc.sh $< -o $@ 29 | 30 | %.hsaco.o : %.hsaco 31 | ./embed_hsaco.sh $< $<.cpp BitonicSort 32 | gcc -c $<.cpp -o $@ 33 | 34 | 35 | 36 | 37 | 38 | BitonicSort_hcc.o: BitonicSort_hcc.cpp BitonicSort_hcc.hpp 39 | 40 | -------------------------------------------------------------------------------- /BitonicSort-CL-from-HCC/hsa_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | #pragma once 23 | 24 | 25 | #include 26 | 27 | extern uint64_t load_hsa_code_object_from_file(const char *fileName, const char *kernelName, hsa_agent_t agent); 28 | extern uint64_t load_hsa_code_object(const char *raw_code_object, size_t code_object_size, const char *kernelName, hsa_agent_t agent); 29 | -------------------------------------------------------------------------------- /BitonicSort-CL-from-HCC/README.md: -------------------------------------------------------------------------------- 1 | # Intro 2 | 3 | This project demonstrates how to call an OpenCL kernel from HCC, using the HSA runtime. 4 | The intent is to allow OpenCL kernels to be combined with the HCC runtime, to allow 5 | existing OpenCL kernels to be leveraged and also to provide more developer options.
6 | 7 | The code also shows several optimizations that are bracketed with "p_opt\*" variables and 8 | can be enabled with command-line switches. 9 | 10 | See associated blog here: http://gpuopen.com/rocm-with-harmony-combining-opencl-hcc-hsa-in-a-single-program/ 11 | 12 | # Setup 13 | This example requires a ROCM installation + the CLOC (CL Offline Compiler) tool. 14 | https://github.com/HSAFoundation/CLOC 15 | 16 | 17 | 18 | # Run 19 | // Run baseline configuration 20 | $./BitonicSort_hcc 21 | 22 | // Run with all optimizations enabled: 23 | $./BitonicSort_hcc --opt 24 | 25 | // Show help 26 | $ ./BitonicSort_hcc --help 27 | usage: BitonicSort [options] 28 | --opt Enable all optimizations. 29 | --optPreallocSignal Pre-allocate the signal in setup. 30 | --optPreallocKernarg Pre-allocate the kernarg in setup. 31 | --optAvoidHostSync Don't synchronize to host after each kernel launch 32 | --optFence Don't fence/flush after each kernel submission. 33 | --optPinnedHost Use pinned host memory for allocations. 34 | 35 | --printProgress Print progress messages for each stage. Will impact timing measurements. 36 | --useHcArray Use hc::array<> to allocate memory (default uses hc::am_alloc). 37 | --loadKernelFromFile Load HSACO from file (rather than use embedded HSACO string) 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /BitonicSort-CL-from-HCC/BitonicSort_hcc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | #include 23 | 24 | typedef std::chrono::high_resolution_clock myclock; 25 | 26 | class BitonicSort 27 | { 28 | public: 29 | BitonicSort(hc::accelerator acc); 30 | 31 | void setup(); 32 | myclock::duration run(); 33 | void verifyResults(); 34 | void cleanup(); 35 | 36 | private: 37 | void bitonicSortGPU(myclock::duration *d); 38 | void bitonicSortGPU_opt(myclock::duration *d); 39 | void bitonicSortCPUReference( uint32_t * input, const uint32_t length, const bool sortIncreasing); 40 | private: 41 | uint32_t _seed; 42 | uint32_t _sortIncreasing; 43 | uint32_t *_input; 44 | hc::array *_inputArray; 45 | uint32_t *_inputAccPtr; 46 | 47 | uint32_t *_verificationInput; 48 | int _length; 49 | int _numStages; 50 | 51 | int _iterations; 52 | 53 | // Which accelerator to launch the kernel on: 54 | hc::accelerator _acc; 55 | 56 | 57 | // HSA info for launching kernel: 58 | uint64_t _codeHandle; 59 | hsa_signal_t _signal; 60 | char * _kernargPointer; // base of kernarg region. 61 | 62 | friend void init_kernel(BitonicSort *bs, hc::accelerator acc); 63 | }; 64 | void init_kernel(BitonicSort *bs, hc::accelerator acc); 65 | 66 | 67 | BitonicSort::BitonicSort(hc::accelerator acc) 68 | :_inputArray(0), _acc(acc) 69 | { 70 | _seed = 123; 71 | _sortIncreasing = 0; 72 | _input = NULL; 73 | _verificationInput = NULL; 74 | _length = 32768; 75 | _numStages = 0; 76 | _iterations = 1; // TODO 77 | _signal.handle = -1; 78 | _kernargPointer = NULL; 79 | } 80 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################################# 2 | # Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. 
3 | # 4 | # Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | # 6 | # • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | # • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or 8 | # other materials provided with the distribution. 9 | # 10 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 11 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY 12 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 13 | # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 14 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#################################################################################

# Top-level build script: configures the HCC (Kalmar) compile/link flags once
# and then adds every sample sub-directory.

# Must come first so policies are established before any other command runs.
cmake_minimum_required( VERSION 2.6.0 )

set( FOLDER_NAME C++Amp )
set( SUBDIRECTORIES ArrayBandwidth
                    HCFFT
                    MD
                    SPMV
                    SyncVsAsyncArrayCopy )

############################################################################

set(CMAKE_SUPPRESS_REGENERATION TRUE)
project( ${FOLDER_NAME} )

# Group samples by folder
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
set( FOLDER_GROUP ${FOLDER_GROUP}/${FOLDER_NAME} )

if( MSVC)
    set(SUBDIRECTORIES ${SUBDIRECTORIES} ${SUBDIRECTORIES_WIN})
elseif (UNIX)
    include_directories( include/SDKUtil )

    # set the compile options for Kalmar
    # OUTPUT_STRIP_TRAILING_WHITESPACE drops the newline hcc-config prints;
    # left in place it ends up inside COMPILE_FLAGS and can split the
    # generated compiler command line.
    execute_process(COMMAND hcc-config --cxxflags
                    RESULT_VARIABLE KALMAR_CXXFLAGS_RESULT
                    OUTPUT_VARIABLE KALMAR_COMPILE_FLAGS
                    OUTPUT_STRIP_TRAILING_WHITESPACE)
    if( NOT "${KALMAR_CXXFLAGS_RESULT}" STREQUAL "0" )
        message( WARNING "hcc-config --cxxflags failed (${KALMAR_CXXFLAGS_RESULT}); is HCC in your PATH?" )
    endif()
    set( COMPILER_FLAGS "${COMPILER_FLAGS} ${KALMAR_COMPILE_FLAGS}" )

    set( COMPILER_FLAGS "${COMPILER_FLAGS} -mcmodel=small" )

    # set the link options for Kalmar
    execute_process(COMMAND hcc-config --ldflags
                    RESULT_VARIABLE KALMAR_LDFLAGS_RESULT
                    OUTPUT_VARIABLE KALMAR_LINKER_FLAGS
                    OUTPUT_STRIP_TRAILING_WHITESPACE)
    if( NOT "${KALMAR_LDFLAGS_RESULT}" STREQUAL "0" )
        message( WARNING "hcc-config --ldflags failed (${KALMAR_LDFLAGS_RESULT}); is HCC in your PATH?" )
    endif()
    set( LINKER_FLAGS "${LINKER_FLAGS} ${KALMAR_LINKER_FLAGS}")

    set( LINKER_FLAGS "${LINKER_FLAGS} -mcmodel=small -lm")
endif()

# Auto-select bitness based on platform
if( NOT BITNESS )
    if (CMAKE_SIZEOF_VOID_P EQUAL 8)
        set(BITNESS 64)
    else()
        set(BITNESS 32)
    endif()
endif()

# Derive the suffix separately so it is also defined when the user passes
# -DBITNESS=... on the command line (it used to stay empty in that case).
if( NOT BITNESS_SUFFIX )
    if( BITNESS EQUAL 64 )
        set(BITNESS_SUFFIX x86_64)
    else()
        set(BITNESS_SUFFIX x86)
    endif()
endif()

# Set output directory to bin (done once, after BITNESS_SUFFIX is known)
set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin/${BITNESS_SUFFIX})

set( SUBDIRECTORIES_WIN "")
foreach( subdir ${SUBDIRECTORIES} )
    add_subdirectory( ${subdir} )
endforeach( subdir )
/BitonicSort-CL-from-HCC/BitonicSort_Kernels.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | /* 24 | * For a description of the algorithm and the terms used, please see the 25 | * documentation for this sample. 26 | * 27 | * One invocation of this kernel, i.e one work thread writes two output values. 28 | * Since every pass of this algorithm does width/2 comparisons, each compare 29 | * operation is done by one work thread. 30 | * 31 | * Depending of the direction of sort for the work thread, the output values 32 | * are written either as greater value to left element or lesser value to the 33 | * left element. 
Right element and left element are the two elements we are 34 | * comparing and "left" is the element with a smaller index into the array. 35 | * 36 | * if direction is CL_TRUE, i.e evaluates to non zero, it means "increasing". 37 | * 38 | * For an explanation of the terms "blockWidth", "sameDirectionBlockWidth", 39 | * stage, pass, pairDistance please go through the document shipped with this 40 | * sample. 41 | * 42 | * Since an explanation of the terms and the code here would be quite lengthy, 43 | * confusing and will greatly reduce the readability of this kernel, the code 44 | * has been explained in detail in the document mentioned above. 45 | */ 46 | 47 | __kernel 48 | void bitonicSort(__global uint * theArray, 49 | const uint stage, 50 | const uint passOfStage, 51 | const uint direction) 52 | { 53 | uint sortIncreasing = direction; 54 | uint threadId = get_global_id(0); 55 | 56 | uint pairDistance = 1 << (stage - passOfStage); 57 | uint blockWidth = 2 * pairDistance; 58 | 59 | uint leftId = (threadId % pairDistance) 60 | + (threadId / pairDistance) * blockWidth; 61 | 62 | uint rightId = leftId + pairDistance; 63 | 64 | uint leftElement = theArray[leftId]; 65 | uint rightElement = theArray[rightId]; 66 | 67 | uint sameDirectionBlockWidth = 1 << stage; 68 | 69 | if((threadId/sameDirectionBlockWidth) % 2 == 1) 70 | sortIncreasing = 1 - sortIncreasing; 71 | 72 | uint greater; 73 | uint lesser; 74 | if(leftElement > rightElement) 75 | { 76 | greater = leftElement; 77 | lesser = rightElement; 78 | } 79 | else 80 | { 81 | greater = rightElement; 82 | lesser = leftElement; 83 | } 84 | 85 | if(sortIncreasing) 86 | { 87 | theArray[leftId] = lesser; 88 | theArray[rightId] = greater; 89 | } 90 | else 91 | { 92 | theArray[leftId] = greater; 93 | theArray[rightId] = lesser; 94 | } 95 | } 96 | 97 | 98 | -------------------------------------------------------------------------------- /BitonicSort-CL-from-HCC/hsa_utils.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "hsa_utils.hpp" 30 | 31 | uint64_t 32 | load_hsa_code_object(const char *raw_code_object, size_t code_object_size, const char *kernelName, hsa_agent_t agent) 33 | { 34 | hsa_status_t hsa_status = HSA_STATUS_SUCCESS; 35 | 36 | // Deserialize code object. 37 | hsa_code_object_t code_object = {0}; 38 | hsa_status = hsa_code_object_deserialize((void*)raw_code_object, code_object_size, NULL, &code_object); 39 | assert(HSA_STATUS_SUCCESS == hsa_status); 40 | assert(0 != code_object.handle); 41 | 42 | // Create executable. 
43 | hsa_executable_t hsaExecutable; 44 | hsa_status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &hsaExecutable); 45 | assert(HSA_STATUS_SUCCESS == hsa_status); 46 | 47 | // Load code object. 48 | hsa_status = hsa_executable_load_code_object(hsaExecutable, agent, code_object, NULL); 49 | assert(HSA_STATUS_SUCCESS == hsa_status); 50 | 51 | // Freeze executable. 52 | hsa_status = hsa_executable_freeze(hsaExecutable, NULL); 53 | assert(HSA_STATUS_SUCCESS == hsa_status); 54 | 55 | // Get symbol handle. 56 | hsa_executable_symbol_t kernelSymbol; 57 | hsa_status = hsa_executable_get_symbol(hsaExecutable, NULL, kernelName, agent, 0, &kernelSymbol); 58 | assert(HSA_STATUS_SUCCESS == hsa_status); 59 | 60 | // Get code handle. 61 | uint64_t codeHandle; 62 | hsa_status = hsa_executable_symbol_get_info(kernelSymbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &codeHandle); 63 | assert(HSA_STATUS_SUCCESS == hsa_status); 64 | 65 | return codeHandle; 66 | } 67 | 68 | 69 | 70 | /* 71 | * Extract the code handle for the specified kernelName from the specified fileName 72 | * Returns a 64-bit code object which can be used with an AQL packet 73 | */ 74 | uint64_t 75 | load_hsa_code_object_from_file(const char *fileName, const char *kernelName, hsa_agent_t agent) 76 | { 77 | // Open file. 78 | std::ifstream file(fileName, std::ios::in | std::ios::binary); 79 | assert(file.is_open() && file.good()); 80 | 81 | // Find out file size. 82 | file.seekg(0, file.end); 83 | size_t size = file.tellg(); 84 | file.seekg(0, file.beg); 85 | 86 | // Allocate memory for raw code object. 87 | char *raw_code_object = (char*)malloc(size); 88 | assert(raw_code_object); 89 | 90 | // Read file contents. 91 | file.read(raw_code_object, size); 92 | 93 | // Close file. 94 | file.close(); 95 | 96 | uint64_t codeHandle = load_hsa_code_object(raw_code_object, size, kernelName, agent); 97 | 98 | // Free raw code object memory. 
99 | free((void*)raw_code_object); 100 | 101 | return codeHandle; 102 | }; 103 | 104 | 105 | -------------------------------------------------------------------------------- /HCFFT/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################################# 2 | # Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | # 6 | # • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | # • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or 8 | # other materials provided with the distribution. 9 | # 10 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 11 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY 12 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 13 | # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 14 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#################################################################################

# Build script for the HCFFT sample. COMPILER_FLAGS, LINKER_FLAGS,
# EXECUTABLE_OUTPUT_PATH and FOLDER_GROUP are read here but not set in this
# file on non-Windows builds; they are expected from the enclosing build.

set( SAMPLE_NAME HCFFT )
set( SOURCE_FILES FFT.cpp )

############################################################################

set(CMAKE_SUPPRESS_REGENERATION TRUE)
cmake_minimum_required( VERSION 2.6.0 )
project( ${SAMPLE_NAME} )

file(GLOB INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp"  "${CMAKE_CURRENT_SOURCE_DIR}/*.h" )

add_executable( ${SAMPLE_NAME} ${SOURCE_FILES} ${INCLUDE_FILES} ${EXTRA_FILES})

if (WIN32)
    include_directories( ../../include/SDKUtil $ENV{AMDAPPSDKROOT}/include/SDKUtil )

    # Samples can specify additional libs/flags using EXTRA* defines
    add_definitions( "/W3 /D_CRT_SECURE_NO_WARNINGS /wd4005 /wd4996 /nologo" )
    if(${SAMPLE_NAME} MATCHES "HCFFT")
        # Debug builds of this sample use /O1 instead of /Od ...
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        # Fixed: this branch used to test for "/RTC1" and substitute "O1"
        # (no slash), which produced an invalid C compiler flag.
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
        # ... and drop /RTC1 from both the C++ and C debug flags.
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
    endif()
    set( COMPILER_FLAGS "${COMPILER_FLAGS} ${EXTRA_COMPILER_FLAGS_MSVC} " )
    set( LINKER_FLAGS "${LINKER_FLAGS} ${EXTRA_LINKER_FLAGS_MSVC} " )
    set( ADDITIONAL_LIBRARIES ${ADDITIONAL_LIBRARIES} ${EXTRA_LIBRARIES_MSVC} )

endif()

# Quoted so that an empty flag string still counts as one argument;
# unquoted, an empty value disappears and set_target_properties() is called
# with the wrong number of arguments.
set_target_properties( ${SAMPLE_NAME} PROPERTIES
                        COMPILE_FLAGS "${COMPILER_FLAGS}"
                        LINK_FLAGS "${LINKER_FLAGS}"
                     )
target_link_libraries( ${SAMPLE_NAME} ${ADDITIONAL_LIBRARIES} )

# Copy extra files to binary directory
foreach( extra_file ${EXTRA_FILES} )
    add_custom_command(
        TARGET ${SAMPLE_NAME} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ${EXECUTABLE_OUTPUT_PATH}/${CMAKE_CFG_INTDIR}
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ./
        )
endforeach( extra_file )

# Group sample based on FOLDER_GROUP defined in parent folder
if( FOLDER_GROUP )
    set_target_properties(${SAMPLE_NAME} PROPERTIES FOLDER ${FOLDER_GROUP})
endif( )
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#################################################################################

# Build script for the MD sample. COMPILER_FLAGS, LINKER_FLAGS,
# EXECUTABLE_OUTPUT_PATH and FOLDER_GROUP are read here but not set in this
# file on non-Windows builds; they are expected from the enclosing build.

set( SAMPLE_NAME MD )
set( SOURCE_FILES MD.cpp )

############################################################################

set(CMAKE_SUPPRESS_REGENERATION TRUE)
cmake_minimum_required( VERSION 2.6.0 )
project( ${SAMPLE_NAME} )

file(GLOB INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp"  "${CMAKE_CURRENT_SOURCE_DIR}/*.h" )

add_executable( ${SAMPLE_NAME} ${SOURCE_FILES} ${INCLUDE_FILES} ${EXTRA_FILES})

if (WIN32)
    include_directories( ../../include/SDKUtil $ENV{AMDAPPSDKROOT}/include/SDKUtil )

    # Samples can specify additional libs/flags using EXTRA* defines
    add_definitions( "/W3 /D_CRT_SECURE_NO_WARNINGS /wd4005 /wd4996 /nologo" )
    # This guard only matches the HCFFT sample, so the flag tweaks below do
    # not run for MD; the section is kept in step with the other samples.
    if(${SAMPLE_NAME} MATCHES "HCFFT")
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        # Fixed: this branch used to test for "/RTC1" and substitute "O1"
        # (no slash), which produced an invalid C compiler flag.
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
    endif()
    set( COMPILER_FLAGS "${COMPILER_FLAGS} ${EXTRA_COMPILER_FLAGS_MSVC} " )
    set( LINKER_FLAGS "${LINKER_FLAGS} ${EXTRA_LINKER_FLAGS_MSVC} " )
    set( ADDITIONAL_LIBRARIES ${ADDITIONAL_LIBRARIES} ${EXTRA_LIBRARIES_MSVC} )

endif()

# Quoted so that an empty flag string still counts as one argument;
# unquoted, an empty value disappears and set_target_properties() is called
# with the wrong number of arguments.
set_target_properties( ${SAMPLE_NAME} PROPERTIES
                        COMPILE_FLAGS "${COMPILER_FLAGS}"
                        LINK_FLAGS "${LINKER_FLAGS}"
                     )
target_link_libraries( ${SAMPLE_NAME} ${ADDITIONAL_LIBRARIES} )

# Copy extra files to binary directory
foreach( extra_file ${EXTRA_FILES} )
    add_custom_command(
        TARGET ${SAMPLE_NAME} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ${EXECUTABLE_OUTPUT_PATH}/${CMAKE_CFG_INTDIR}
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ./
        )
endforeach( extra_file )

# Group sample based on FOLDER_GROUP defined in parent folder
if( FOLDER_GROUP )
    set_target_properties(${SAMPLE_NAME} PROPERTIES FOLDER ${FOLDER_GROUP})
endif( )
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#################################################################################

# Build script for the SPMV sample: one executable built from SPMV.cpp.
set( SAMPLE_NAME SPMV )
set( SOURCE_FILES SPMV.cpp )

############################################################################

set(CMAKE_SUPPRESS_REGENERATION TRUE)
cmake_minimum_required( VERSION 2.6.0 )
project( ${SAMPLE_NAME} )

# Collect the sample's headers so they are passed to add_executable alongside the sources.
file(GLOB INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h" )

add_executable( ${SAMPLE_NAME} ${SOURCE_FILES} ${INCLUDE_FILES} ${EXTRA_FILES})

if (WIN32)
    include_directories( ../../include/SDKUtil $ENV{AMDAPPSDKROOT}/include/SDKUtil )

    # Samples can specify additional libs/flags using EXTRA* defines
    add_definitions( "/W3 /D_CRT_SECURE_NO_WARNINGS /wd4005 /wd4996 /nologo" )
    if(${SAMPLE_NAME} MATCHES "HCFFT")
        # For the HCFFT sample, debug builds use /O1 instead of /Od and drop /RTC1.
        # The C and C++ flag variables are rewritten the same way.
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        # Guard on the flag that is actually replaced, and keep the leading slash on
        # the replacement so the result is still a valid compiler option.
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
    endif()
    set( COMPILER_FLAGS "${COMPILER_FLAGS} ${EXTRA_COMPILER_FLAGS_MSVC} " )
    set( LINKER_FLAGS "${LINKER_FLAGS} ${EXTRA_LINKER_FLAGS_MSVC} " )
    set( ADDITIONAL_LIBRARIES ${ADDITIONAL_LIBRARIES} ${EXTRA_LIBRARIES_MSVC} )

endif()

# Quote the values: when COMPILER_FLAGS / LINKER_FLAGS are empty (e.g. outside the
# WIN32 branch) an unquoted expansion disappears and the property/value pairs shift.
set_target_properties( ${SAMPLE_NAME} PROPERTIES
                       COMPILE_FLAGS "${COMPILER_FLAGS}"
                       LINK_FLAGS "${LINKER_FLAGS}"
                     )
target_link_libraries( ${SAMPLE_NAME} ${ADDITIONAL_LIBRARIES} )

# Copy extra files to binary directory
foreach( extra_file ${EXTRA_FILES} )
    add_custom_command(
        TARGET ${SAMPLE_NAME} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ${EXECUTABLE_OUTPUT_PATH}/${CMAKE_CFG_INTDIR}
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ./
        )
endforeach( extra_file )

# Group sample based on FOLDER_GROUP defined in parent folder
if( FOLDER_GROUP )
    set_target_properties(${SAMPLE_NAME} PROPERTIES FOLDER ${FOLDER_GROUP})
endif( )
--------------------------------------------------------------------------------
/ArrayBandwidth/CMakeLists.txt:
--------------------------------------------------------------------------------
#################################################################################
# Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
# • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 | # • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or 8 | # other materials provided with the distribution. 9 | # 10 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 11 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY 12 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 13 | # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 14 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#################################################################################

# Build script for the ArrayBandwidth sample: one executable built from ArrayBandwidth.cpp.
set( SAMPLE_NAME ArrayBandwidth )
set( SOURCE_FILES ArrayBandwidth.cpp )

############################################################################

set(CMAKE_SUPPRESS_REGENERATION TRUE)
cmake_minimum_required( VERSION 2.6.0 )
project( ${SAMPLE_NAME} )

# Collect the sample's headers so they are passed to add_executable alongside the sources.
file(GLOB INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h" )

add_executable( ${SAMPLE_NAME} ${SOURCE_FILES} ${INCLUDE_FILES} ${EXTRA_FILES})

if (WIN32)
    include_directories( ../../include/SDKUtil $ENV{AMDAPPSDKROOT}/include/SDKUtil )

    # Samples can specify additional libs/flags using EXTRA* defines
    add_definitions( "/W3 /D_CRT_SECURE_NO_WARNINGS /wd4005 /wd4996 /nologo" )
    if(${SAMPLE_NAME} MATCHES "HCFFT")
        # For the HCFFT sample, debug builds use /O1 instead of /Od and drop /RTC1.
        # The C and C++ flag variables are rewritten the same way.
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        # Guard on the flag that is actually replaced, and keep the leading slash on
        # the replacement so the result is still a valid compiler option.
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
    endif()
    set( COMPILER_FLAGS "${COMPILER_FLAGS} ${EXTRA_COMPILER_FLAGS_MSVC} " )
    set( LINKER_FLAGS "${LINKER_FLAGS} ${EXTRA_LINKER_FLAGS_MSVC} " )
    set( ADDITIONAL_LIBRARIES ${ADDITIONAL_LIBRARIES} ${EXTRA_LIBRARIES_MSVC} )

endif()

# Quote the values: when COMPILER_FLAGS / LINKER_FLAGS are empty (e.g. outside the
# WIN32 branch) an unquoted expansion disappears and the property/value pairs shift.
set_target_properties( ${SAMPLE_NAME} PROPERTIES
                       COMPILE_FLAGS "${COMPILER_FLAGS}"
                       LINK_FLAGS "${LINKER_FLAGS}"
                     )
target_link_libraries( ${SAMPLE_NAME} ${ADDITIONAL_LIBRARIES} )

# Copy extra files to binary directory
foreach( extra_file ${EXTRA_FILES} )
    add_custom_command(
        TARGET ${SAMPLE_NAME} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ${EXECUTABLE_OUTPUT_PATH}/${CMAKE_CFG_INTDIR}
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ./
        )
endforeach( extra_file )

# Group sample based on FOLDER_GROUP defined in parent folder
if( FOLDER_GROUP )
    set_target_properties(${SAMPLE_NAME} PROPERTIES FOLDER ${FOLDER_GROUP})
endif( )
--------------------------------------------------------------------------------
/SyncVsAsyncArrayCopy/CMakeLists.txt:
--------------------------------------------------------------------------------
#################################################################################
# Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
# • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
# • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#################################################################################

# Build script for the SyncVsAsyncArrayCopy sample: one executable built from SyncVsAsyncArrayCopy.cpp.
set( SAMPLE_NAME SyncVsAsyncArrayCopy )
set( SOURCE_FILES SyncVsAsyncArrayCopy.cpp )

############################################################################

set(CMAKE_SUPPRESS_REGENERATION TRUE)
cmake_minimum_required( VERSION 2.6.0 )
project( ${SAMPLE_NAME} )

# Collect the sample's headers so they are passed to add_executable alongside the sources.
file(GLOB INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h" )

add_executable( ${SAMPLE_NAME} ${SOURCE_FILES} ${INCLUDE_FILES} ${EXTRA_FILES})

if (WIN32)
    include_directories( ../../include/SDKUtil $ENV{AMDAPPSDKROOT}/include/SDKUtil )

    # Samples can specify additional libs/flags using EXTRA* defines
    add_definitions( "/W3 /D_CRT_SECURE_NO_WARNINGS /wd4005 /wd4996 /nologo" )
    if(${SAMPLE_NAME} MATCHES "HCFFT")
        # For the HCFFT sample, debug builds use /O1 instead of /Od and drop /RTC1.
        # The C and C++ flag variables are rewritten the same way.
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        # Guard on the flag that is actually replaced, and keep the leading slash on
        # the replacement so the result is still a valid compiler option.
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/Od")
            string(REPLACE "/Od" "/O1" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
        if(CMAKE_CXX_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
        if(CMAKE_C_FLAGS_DEBUG MATCHES "/RTC1")
            string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
        endif()
    endif()
    set( COMPILER_FLAGS "${COMPILER_FLAGS} ${EXTRA_COMPILER_FLAGS_MSVC} " )
    set( LINKER_FLAGS "${LINKER_FLAGS} ${EXTRA_LINKER_FLAGS_MSVC} " )
    set( ADDITIONAL_LIBRARIES ${ADDITIONAL_LIBRARIES} ${EXTRA_LIBRARIES_MSVC} )

endif()

# Quote the values: when COMPILER_FLAGS / LINKER_FLAGS are empty (e.g. outside the
# WIN32 branch) an unquoted expansion disappears and the property/value pairs shift.
set_target_properties( ${SAMPLE_NAME} PROPERTIES
                       COMPILE_FLAGS "${COMPILER_FLAGS}"
                       LINK_FLAGS "${LINKER_FLAGS}"
                     )
target_link_libraries( ${SAMPLE_NAME} ${ADDITIONAL_LIBRARIES} )

# Copy extra files to binary directory
foreach( extra_file ${EXTRA_FILES} )
    add_custom_command(
        TARGET ${SAMPLE_NAME} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ${EXECUTABLE_OUTPUT_PATH}/${CMAKE_CFG_INTDIR}
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/${extra_file} ./
        )
endforeach( extra_file )

# Group sample based on FOLDER_GROUP defined in parent folder
if( FOLDER_GROUP )
    set_target_properties(${SAMPLE_NAME} PROPERTIES FOLDER ${FOLDER_GROUP})
endif( )
--------------------------------------------------------------------------------
/BitonicSort-CL-from-HCC/BitonicSort_hcc.cpp:
--------------------------------------------------------------------------------
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | #include "hsa_utils.hpp" 32 | #include "BitonicSort_hcc.hpp" 33 | 34 | 35 | bool p_verify = true; // Run a CPU version and verify results. 36 | bool p_printArray = false; // Print array. 37 | bool p_printProgress = false; // print progress message at each kernel launch. 38 | 39 | // This program uses the "hc" C++ runtime to manage memory. 40 | // Use hc::array<> if true or use am_alloc pointer if false. 41 | bool p_useHcArray = false; 42 | 43 | 44 | int p_iterations = 10; // How many iterations to run. 45 | 46 | 47 | // The kernel is stored in HSACO (HSA Code Object) form. 48 | // The sample demonstrates two ways to load the hsaco file: 49 | // 1. from an exernal file using load_hsa_code_object_from_file. In this case, 50 | // the makefile runs cloc to generate the hsaco file, and the app loads this 51 | // file into memory using file I/O commands. 52 | // 53 | // 2. from a serialized internal string. In this case the Makefile serializes 54 | // the hsaco into a global string which is then linked into the executable, and 55 | // can be accessed via a global string. 56 | // 57 | // Most applications use the second approach since it eliminates the need to locate the 58 | // external hsaco file at runtime. 
59 | bool p_loadKernelFromFile = false; 60 | 61 | 62 | /* 63 | * Optimizations : 64 | */ 65 | bool p_optPreallocSignal = false; // pre-allocate the signal in setup. 66 | bool p_optPreallocKernarg = false; // pre-allocate the kernarg in setup. 67 | bool p_optAvoidHostSync = false; // Don't synchronize to host after each kernel launch 68 | bool p_optFence = false; // Don't fence/flush after each kernel submission. 69 | bool p_optPinnedHost = false; // Use pinned host memory for allocations. 70 | 71 | 72 | // Filename for .hsaco file, only used if p_loadKernelFromFile=true. 73 | #define HSACO_FILENAME "BitonicSort_Kernels.hsaco" 74 | 75 | // Global symbols for embedded BitonicSort - only used if p_loadKernelFromFile=false 76 | extern char _BitonicSort_Kernels_HSA_CodeObjMem[]; 77 | extern size_t _BitonicSort_Kernels_HSA_CodeObjMemSz; 78 | 79 | // Name of kernel stored in the HSACO file: 80 | #define HSACO_KERNELNAME "bitonicSort" 81 | 82 | 83 | template 84 | void printArray( 85 | const std::string header, 86 | const T * data, 87 | const int width, 88 | const int height) 89 | { 90 | std::cout<<"\n"<, 139 | // which is a typed class representing device memory. 140 | 141 | _inputArray = new hc::array (_length); 142 | _inputAccPtr = _inputArray->accelerator_pointer(); 143 | 144 | 145 | if (p_printProgress) { 146 | printf ("info: allocated hc::array<>, size=%zu, accelerator_pointer=%p\n", sizeBytes, _inputAccPtr); 147 | } 148 | } else { 149 | // Allocate the array to sort using am_alloc, which returns a pointer. 
150 | _inputAccPtr = hc::am_alloc(sizeBytes, _acc, 0); 151 | 152 | if (p_printProgress) { 153 | printf ("info: allocated hc::am_alloc, size=%zu, accelerator_pointer=%p\n", sizeBytes, _inputAccPtr); 154 | } 155 | } 156 | 157 | 158 | if(p_verify) { 159 | _verificationInput = (uint32_t *) malloc(sizeBytes); 160 | assert(_verificationInput); 161 | 162 | memcpy(_verificationInput, _input, sizeBytes); 163 | } 164 | 165 | 166 | if (p_printArray) { 167 | printArray( "Unsorted Input", _input, _length, 1); 168 | } 169 | 170 | 171 | // Load kernel from file: 172 | hsa_agent_t *agent = static_cast (_acc.get_hsa_agent()); 173 | assert(agent); 174 | 175 | if (p_loadKernelFromFile) { 176 | _codeHandle = load_hsa_code_object_from_file(HSACO_FILENAME, HSACO_KERNELNAME, *agent); 177 | } else { 178 | _codeHandle = load_hsa_code_object(_BitonicSort_Kernels_HSA_CodeObjMem, 179 | _BitonicSort_Kernels_HSA_CodeObjMemSz, 180 | HSACO_KERNELNAME, *agent); 181 | } 182 | 183 | /* 184 | * Determine how many stages and kernels we will need: 185 | */ 186 | _numStages = 0; 187 | for(int temp = _length; temp > 1; temp >>= 1) 188 | { 189 | ++_numStages; 190 | } 191 | 192 | /* Signal creation involves a call into the kernel driver and can be an expensive 193 | * operation. This code creates the signal once at init time and then re-uses it for 194 | * each kernel dispatch. 195 | */ 196 | if (p_optPreallocSignal) { 197 | hsa_status_t hsa_status = hsa_signal_create(1, 0, NULL, &_signal); 198 | assert(HSA_STATUS_SUCCESS == hsa_status); 199 | } 200 | 201 | 202 | /* 203 | * pre-allocate the kernarg buffer to remove a moderately expensive operation from inside the kernel dispatch loop. 204 | * Also since we will be have many kernels in-flight at same time, each kernel needs its own kernarg buffer. 205 | */ 206 | if (p_optPreallocKernarg) { 207 | /* Compute number of kernels that will be launched. 
To count: 208 | * - pair the first stage (1 kernel) and last stages (_numStages) == _numStages + 1 209 | * - pair the second stage (2 kernels) and the second-to-last stage (_numStages-1) == _numStages + 1 210 | * - pair the third stage (3 kernels) and the third-to-last stage (_numStages-2) == _numStages + 1 211 | * - and so on, creating _numStages/2 pairs each with _numStages+1 kernels: 212 | */ 213 | int numKernels = _numStages * (_numStages + 1) / 2; 214 | 215 | /* 216 | * Allocate the kernel argument buffer from the correct region. 217 | * We use HCC's handy get_hsa_kernarg_region accessor: 218 | */ 219 | int alignedArgSize = (sizeof(BitonicSort_args_t) + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE * CACHE_LINE_SIZE; 220 | 221 | if (p_printProgress) { 222 | printf ("numStages=%d, numKernels=%d, alignedArgSize=%d\n", _numStages, numKernels, alignedArgSize); 223 | } 224 | 225 | hsa_region_t kernarg_region = *(static_cast (_acc.get_hsa_kernarg_region())); 226 | hsa_status_t hsa_status = hsa_memory_allocate(kernarg_region, alignedArgSize * numKernels, (void**)(&_kernargPointer)); 227 | assert(HSA_STATUS_SUCCESS == hsa_status); 228 | } 229 | } 230 | 231 | 232 | /* 233 | * Use the HSA Runtime API to run the BitonicSort kernel. 234 | * 235 | * Input: Must be called after setup. 236 | * Notably _codeHandle must point to the BitonicSort kernel loaded from the .hsaco file. 
237 | */ 238 | void BitonicSort::bitonicSortGPU(myclock::duration *d) 239 | { 240 | size_t sizeBytes = _length * sizeof(uint32_t); 241 | 242 | hc::am_copy(_inputAccPtr, _input, sizeBytes); 243 | 244 | /* 245 | * Extract the hsaQueue from the HCC acclerator_view: 246 | */ 247 | hc::accelerator_view av = _acc.get_default_view(); 248 | hsa_queue_t *hsaQueue = static_cast (av.get_hsa_queue()); 249 | 250 | hsa_status_t hsa_status = HSA_STATUS_SUCCESS; 251 | 252 | 253 | /* 254 | * Get a signal 255 | */ 256 | hsa_signal_t signal; 257 | if (p_optPreallocSignal) { 258 | signal = _signal; 259 | hsa_signal_store_relaxed(signal, 1); 260 | } else { 261 | hsa_status = hsa_signal_create(1, 0, NULL, &signal); 262 | assert(HSA_STATUS_SUCCESS == hsa_status); 263 | } 264 | 265 | 266 | /* 267 | * Setup dispatch packet. 268 | */ 269 | hsa_kernel_dispatch_packet_t aql; 270 | memset(&aql, 0, sizeof(aql)); 271 | 272 | const int kNumDimension = 1; 273 | aql.completion_signal = signal; 274 | aql.setup = kNumDimension << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; 275 | aql.workgroup_size_x = (uint16_t) 256; 276 | aql.workgroup_size_y = 1; 277 | aql.workgroup_size_z = 1; 278 | aql.grid_size_x = (uint32_t) (_length / 2); 279 | aql.grid_size_y = 1; 280 | aql.grid_size_z = 1; 281 | aql.header = 282 | (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | 283 | (1 << HSA_PACKET_HEADER_BARRIER) | 284 | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | 285 | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); 286 | aql.group_segment_size = 0; 287 | aql.private_segment_size = 0; 288 | aql.kernel_object = _codeHandle; // set at initialization time. 289 | 290 | 291 | 292 | if (p_optPreallocKernarg) { 293 | aql.kernarg_address = _kernargPointer; 294 | } else { 295 | /* 296 | * Allocate the kernel argument buffer from the correct region. 
297 | * We use HCC's handy get_hsa_kernarg_region accessor: 298 | */ 299 | hsa_region_t kernarg_region = *(static_cast (_acc.get_hsa_kernarg_region())); 300 | hsa_status = hsa_memory_allocate(kernarg_region, sizeof(BitonicSort_args_t), (void**)(&aql.kernarg_address)); 301 | assert(HSA_STATUS_SUCCESS == hsa_status); 302 | 303 | } 304 | 305 | 306 | /* 307 | * Write the args directly into the kernargs buffer: 308 | * Typecast the kernarg pointer to BitonicSort_args_t so arg-setting code can use the structure fields: 309 | */ 310 | BitonicSort_args_t * args = (BitonicSort_args_t*) (aql.kernarg_address); 311 | args->theArray = _inputAccPtr; 312 | args->stage = 0; 313 | args->passOfStage = 0; 314 | args->direction = _sortIncreasing; 315 | 316 | 317 | const uint32_t queueSize = hsaQueue->size; 318 | const uint32_t queueMask = queueSize - 1; 319 | 320 | 321 | auto kernel_start_time = myclock::now(); 322 | 323 | for(int stage = 0; stage < _numStages; ++stage) { 324 | args->stage = stage; 325 | for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) { 326 | args->passOfStage = passOfStage; 327 | 328 | if (p_printProgress) { 329 | printf (" launching kernel for stage=%d pass=%d\n", stage, passOfStage); 330 | } 331 | 332 | // Write AQL packet to queue to launch the kernel: 333 | { 334 | uint64_t writeIndex = hsa_queue_load_write_index_relaxed(hsaQueue); 335 | uint64_t readIndex = hsa_queue_load_read_index_relaxed(hsaQueue); 336 | 337 | if ((writeIndex - readIndex) != queueMask) { 338 | ((hsa_kernel_dispatch_packet_t*)(hsaQueue->base_address))[writeIndex & queueMask] = aql; 339 | 340 | hsa_queue_store_write_index_relaxed(hsaQueue, writeIndex + 1); 341 | 342 | // Ringdoor bell. 
343 | hsa_signal_store_relaxed(hsaQueue->doorbell_signal, writeIndex); 344 | 345 | if (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), 346 | HSA_WAIT_STATE_ACTIVE) != 0) { 347 | printf("Signal wait returned unexpected value\n"); 348 | assert(0); 349 | } 350 | 351 | hsa_signal_store_relaxed(signal, 1); 352 | } else { 353 | printf ("Error - queue full!\n"); 354 | assert(0); 355 | } 356 | } 357 | } 358 | }; 359 | 360 | auto kernel_end_time = myclock::now(); 361 | *d = (kernel_end_time - kernel_start_time); 362 | 363 | if (!p_optPreallocSignal) { 364 | hsa_signal_destroy(signal); 365 | } 366 | if (!p_optPreallocKernarg) { 367 | hsa_memory_free(aql.kernarg_address); 368 | } 369 | 370 | hc::am_copy(_input, _inputAccPtr, sizeBytes); 371 | } 372 | 373 | /* 374 | * Use the HSA Runtime API to run the BitonicSort kernel. 375 | * 376 | * Input: Must be called after setup. 377 | * Notably _codeHandle must point to the BitonicSort kernel loaded from the .hsaco file. 378 | * 379 | * This version is optimized to avoid host-side waiting between each kernel. 380 | * All kernels are launched up-front and the dependencies are resolved on the GPU. 381 | * Also, we optimize the AQL acquire/release fences since we know the data does not need to be visible 382 | * at the system scope - this can eliminate cache flushing between kernels. 
383 | */ 384 | void BitonicSort::bitonicSortGPU_opt(myclock::duration *d) 385 | { 386 | size_t sizeBytes = _length * sizeof(uint32_t); 387 | 388 | hc::am_copy(_inputAccPtr, _input, sizeBytes); 389 | 390 | /* 391 | * Extract the hsaQueue from the HCC acclerator_view: 392 | */ 393 | hc::accelerator_view av = _acc.get_default_view(); 394 | hsa_queue_t *hsaQueue = static_cast (av.get_hsa_queue()); 395 | 396 | hsa_status_t hsa_status = HSA_STATUS_SUCCESS; 397 | 398 | 399 | /* 400 | * Get a signal 401 | */ 402 | hsa_signal_t signal; 403 | if (p_optPreallocSignal) { 404 | signal = _signal; 405 | } else { 406 | hsa_status = hsa_signal_create(1, 0, NULL, &signal); 407 | assert(HSA_STATUS_SUCCESS == hsa_status); 408 | } 409 | 410 | 411 | /* 412 | * Setup dispatch packet. 413 | */ 414 | hsa_kernel_dispatch_packet_t aql; 415 | memset(&aql, 0, sizeof(aql)); 416 | 417 | const int kNumDimension = 1; 418 | aql.completion_signal.handle = 0x0; 419 | aql.setup = kNumDimension << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; 420 | aql.workgroup_size_x = (uint16_t) 256; 421 | aql.workgroup_size_y = 1; 422 | aql.workgroup_size_z = 1; 423 | aql.grid_size_x = (uint32_t) (_length / 2); 424 | aql.grid_size_y = 1; 425 | aql.grid_size_z = 1; 426 | if (!p_optFence) { 427 | aql.header = 428 | (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | 429 | (1 << HSA_PACKET_HEADER_BARRIER) | 430 | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | 431 | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); 432 | } 433 | aql.group_segment_size = 0; 434 | aql.private_segment_size = 0; 435 | aql.kernel_object = _codeHandle; // set at initialization time. 
436 | 437 | 438 | // Require this be pre-allocated: 439 | assert(p_optPreallocKernarg); 440 | 441 | hsa_signal_store_relaxed(signal, 1); 442 | 443 | 444 | const uint32_t queueSize = hsaQueue->size; 445 | const uint32_t queueMask = queueSize - 1; 446 | 447 | 448 | auto kernel_start_time = myclock::now(); 449 | 450 | const int alignedArgSize = (sizeof(BitonicSort_args_t) + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE * CACHE_LINE_SIZE; 451 | int kernelCount = 0; 452 | int numKernels = _numStages * (_numStages + 1) / 2; 453 | 454 | // Make sure we have room in the queue for all of the kernels: 455 | 456 | uint64_t writeIndex = hsa_queue_load_write_index_relaxed(hsaQueue); 457 | uint64_t readIndex = hsa_queue_load_read_index_relaxed(hsaQueue); 458 | 459 | // Check once to make sure we have room for all the kernels: 460 | uint32_t availQueueEntries = (queueSize - (writeIndex - readIndex)); 461 | if (availQueueEntries < numKernels) { 462 | printf ("Error - queue full! queueSize=%d readIndex=%lu writeIndex=%lu\n", queueSize, readIndex, writeIndex); 463 | assert(0); 464 | } 465 | 466 | 467 | for(int stage = 0; stage < _numStages; ++stage) { 468 | for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) { 469 | /* 470 | * Write the args directly into the kernargs buffer: 471 | * Typecast the kernarg pointer to BitonicSort_args_t so arg-setting code can use the structure fields: 472 | */ 473 | aql.kernarg_address = static_cast (& (_kernargPointer[kernelCount * alignedArgSize ])); 474 | BitonicSort_args_t * args = (BitonicSort_args_t*) (aql.kernarg_address); 475 | args->theArray = _inputAccPtr; 476 | args->stage = stage; 477 | args->passOfStage = passOfStage; 478 | args->direction = _sortIncreasing; 479 | 480 | kernelCount++; 481 | 482 | assert (kernelCount <= numKernels); 483 | 484 | if (p_optFence) { 485 | aql.header = 486 | (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | 487 | (1 << HSA_PACKET_HEADER_BARRIER); 488 | bool setFence=false; 489 | if 
(kernelCount == 1) { 490 | // first packet needs to acquire from system to make sure it gets the host->device copy: 491 | aql.header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE); 492 | aql.header |= (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); 493 | setFence = true; 494 | } 495 | if (kernelCount == numKernels) { 496 | // last packet needs to release to system to make sure data is visible for device->host copy: 497 | aql.header |= (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE); 498 | aql.header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); 499 | setFence = true; 500 | } 501 | if (!setFence) { 502 | // fences at agent scope: 503 | aql.header |= (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE); 504 | aql.header |= (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); 505 | } 506 | } 507 | 508 | 509 | if (p_printProgress) { 510 | printf (" launching kernel#%d for stage=%d pass=%d\n", kernelCount, stage, passOfStage); 511 | } 512 | 513 | if (kernelCount == numKernels) { 514 | aql.completion_signal = signal; 515 | if (p_printProgress) { 516 | printf (" base_address))[writeIndex & queueMask] = aql; 524 | 525 | hsa_queue_store_write_index_relaxed(hsaQueue, writeIndex + 1); 526 | 527 | // Ring door bell. 
528 | hsa_signal_store_relaxed(hsaQueue->doorbell_signal, writeIndex); 529 | } 530 | } 531 | }; 532 | 533 | // Host wait for last kernel to finish: 534 | if (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), 535 | HSA_WAIT_STATE_ACTIVE) != 0) { 536 | printf("Signal wait returned unexpected value\n"); 537 | assert(0); 538 | } 539 | 540 | /* Take the end timestamp once the wait has returned; *d receives the interval since kernel_start_time. */ auto kernel_end_time = myclock::now(); 541 | *d = (kernel_end_time - kernel_start_time); 542 | 543 | /* The signal and the kernarg buffer are released here unless the matching prealloc option is set. */ if (!p_optPreallocSignal) { 544 | hsa_signal_destroy(signal); 545 | } 546 | if (!p_optPreallocKernarg) { 547 | hsa_memory_free(aql.kernarg_address); 548 | } 549 | 550 | /* Copies sizeBytes between _inputAccPtr and _input; presumably a readback of the sorted data into _input - confirm the argument order against the hc::am_copy declaration. */ hc::am_copy(_input, _inputAccPtr, sizeBytes); 551 | } 552 | 553 | 554 | /* Returns a myclock::duration; total_kernel_time starts at zero. NOTE(review): in this copy the text following the loop header below is missing up to the tail of printTime (inner lines 557 to 606). It looks as if everything between a less-than sign and the next greater-than sign was stripped (the duration_cast further down has also lost its template argument). Restore from the upstream file before relying on this section. */ myclock::duration BitonicSort::run() 555 | { 556 | myclock::duration total_kernel_time(0), oneiter_kernel_time; 557 | for (int i=0; i (d).count() 607 | << " us"; 608 | 609 | /* When printPerIter is set, d divided by p_iterations is printed as well. NOTE(review): no check that p_iterations is non-zero is visible before this division - confirm. */ if (printPerIter) { 610 | cout << " (" 611 | << chrono::duration_cast (d/p_iterations).count() 612 | << " us/iteration)"; 613 | } 614 | 615 | cout << endl; 616 | } 617 | 618 | // Helper function for CPU implementation: /* Exchanges *a and *b only when *a > *b, so that *a <= *b afterwards. */ 619 | void 620 | swapIfFirstIsGreater(uint32_t *a, uint32_t *b) 621 | { 622 | if(*a > *b) 623 | { 624 | uint32_t temp = *a; 625 | *a = *b; 626 | *b = temp; 627 | } 628 | } 629 | 630 | 631 | /* 632 | * sorts the input array (in place) using the bitonic sort algorithm 633 | * sorts in increasing order if sortIncreasing is true 634 | * else sorts in decreasing order 635 | * length specifies the length of the array 636 | */ 637 | void 638 | BitonicSort::bitonicSortCPUReference( 639 | uint32_t * input, 640 | const uint32_t length, 641 | const bool sortIncreasing) 642 | { 643 | const uint32_t halfLength = length/2; 644 | 645 | /* i doubles from 2 up to length; for each i, j halves from i down to 2. NOTE(review): presumably length must be a power of two, otherwise input[l + half_j] below can index past length - TODO confirm with callers. */ uint32_t i; 646 | for(i = 2; i <= length; i *= 2) 647 | { 648 | uint32_t j; 649 | for(j = i; j > 1; j /= 2) 650 | { 651 | /* The direction is reset to sortIncreasing at the start of every j pass. */ bool increasing = sortIncreasing; 652 | const uint32_t half_j = j/2; 653 | 654 | /* k walks the array in chunks of j elements. */ uint32_t k; 655 | for(k = 0; k < length; k += j) 656 | { 657 | const
uint32_t k_plus_half_j = k + half_j; 658 | uint32_t l; 659 | 660 | /* While i < length, the direction is flipped when k == i, or when k is a multiple of i other than halfLength. The flag lives outside the k loop, so a flip carries over to the following chunks of this j pass. */ if(i < length) 661 | { 662 | if((k == i) || (((k % i) == 0) && (k != halfLength))) 663 | { 664 | increasing = !increasing; 665 | } 666 | } 667 | 668 | /* Each element in the lower half of the chunk is compared with its partner half_j positions above; passing the pair in reversed order produces the opposite ordering. */ for(l = k; l < k_plus_half_j; ++l) 669 | { 670 | if(increasing) 671 | { 672 | swapIfFirstIsGreater(&input[l], &input[l + half_j]); 673 | } 674 | else 675 | { 676 | swapIfFirstIsGreater(&input[l + half_j], &input[l]); 677 | } 678 | } 679 | } 680 | } 681 | } 682 | } 683 | 684 | 685 | /* NOTE(review): only the head of verifyResults survives in this copy. The text following its loop header is missing up to the last lines of the printHelp usage text (inner lines 689 to 719). Restore from the upstream file. */ void BitonicSort::verifyResults() 686 | { 687 | // Run on CPU: 688 | auto cpu_start_time = myclock::now(); 689 | for (int i=0; i to allocate memory (default uses hc::am_alloc).\n"); 720 | printf ("--loadKernelFromFile Load HSACO from file (rather than use embedded HSACO string)\n"); 721 | /* After the usage text the process ends with status 0. */ exit(0); 722 | } 723 | 724 | 725 | /* Converts str with strtol using base 0 and stores the value through output. Returns nonzero when nothing is left unparsed after the number, zero otherwise. NOTE(review): an empty string leaves nothing unparsed either, so it is reported as success with value 0; the long result is also narrowed to int with no range check. */ int parseInt(const char *str, int *output) 726 | { 727 | char *next; 728 | *output = strtol(str, &next, 0); 729 | return !strlen(next); 730 | } 731 | 732 | 733 | /* Scans argv[1] onward and sets the p_* option variables (declared outside this view). --opt switches on all five optimization flags at once; --iterations reads its value from the following argument. NOTE(review): bad input is handled by printf followed by assert(0); if the build defines NDEBUG the assert disappears and parsing just continues - confirm this is acceptable. */ void parseArguments(int argc, char *argv[]) 734 | { 735 | for (int i = 1; i < argc; i++) { 736 | const char *arg = argv[i]; 737 | 738 | if (!strcmp(arg, "--help")) { 739 | printHelp(); 740 | } else if (!strcmp(arg, "--opt")) { 741 | p_optPreallocSignal = true; 742 | p_optPreallocKernarg = true; 743 | p_optAvoidHostSync = true; 744 | p_optFence = true; 745 | p_optPinnedHost = true; 746 | } else if (!strcmp(arg, "--optPreallocSignal")) { 747 | p_optPreallocSignal = true; 748 | } else if (!strcmp(arg, "--optPreallocKernarg")) { 749 | p_optPreallocKernarg = true; 750 | } else if (!strcmp(arg, "--optAvoidHostSync")) { 751 | p_optAvoidHostSync = true; 752 | } else if (!strcmp(arg, "--optFence")) { 753 | p_optFence = true; 754 | } else if (!strcmp(arg, "--optPinnedHost")) { 755 | p_optPinnedHost = true; 756 | } else if (!strcmp(arg, "--printProgress")) { 757 | p_printProgress = true; 758 | } else if (!strcmp(arg, "--useHcArray")) { 759 | p_useHcArray = true; 760 | } else if (!strcmp(arg,
"--loadKernelFromFile")) { 761 | p_loadKernelFromFile = true; 762 | } else if (!strcmp(arg, "--iterations")) { 763 | /* Step to the value argument; it is an error if none follows or parseInt rejects it. */ if (++i >= argc || !parseInt(argv[i], &p_iterations)) { 764 | printf("Bad iterations argument\n"); 765 | assert(0); 766 | } 767 | } else { 768 | /* Anything not matched above is reported as a bad argument. */ printf ("error: bad argument '%s'\n", arg); 769 | assert(0); 770 | } 771 | }; 772 | } 773 | 774 | 775 | /* Entry point: parses the options, prints the five optimization flags, times setup() and run(), verifies when p_verify is set, then calls cleanup(). */ int 776 | main(int argc, char * argv[]) 777 | { 778 | parseArguments(argc, argv); 779 | printf ("optimizations: p_optPreallocSignal=%d p_optPreallocKernarg=%d p_optAvoidHostSync=%d p_optFence=%d p_optPinnedHost=%d\n", 780 | p_optPreallocSignal, p_optPreallocKernarg, p_optAvoidHostSync, p_optFence, p_optPinnedHost); 781 | printf ("\n"); 782 | 783 | // Create a class to stack state (kernels, queues, args, etc) between the different functions: 784 | BitonicSort bs((hc::accelerator())); 785 | 786 | /* Three timestamps bracket setup() and run(); run() additionally returns its own kernel-time total. */ auto start_time = myclock::now(); 787 | 788 | bs.setup(); 789 | 790 | auto setup_end_time = myclock::now(); 791 | myclock::duration total_kernel_time = bs.run(); 792 | 793 | auto run_end_time = myclock::now(); 794 | 795 | /* The two printTime calls that pass true presumably request the per-iteration figure (printPerIter) - the printTime signature is not visible in this copy. */ printf ("iterations=%d\n", p_iterations); 796 | printTime("GPU setup time", setup_end_time - start_time); 797 | printTime("GPU run time", run_end_time - setup_end_time, true); 798 | printTime("GPU run(kernel) time", total_kernel_time, true); 799 | printTime("GPU setup+run time", run_end_time - start_time); 800 | 801 | /* The CPU reference check only runs when p_verify is set. */ if (p_verify) { 802 | bs.verifyResults(); 803 | } 804 | 805 | 806 | bs.cleanup(); 807 | } 808 | --------------------------------------------------------------------------------