├── .clang-format ├── .github └── workflows │ └── ci-github-actions.yaml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake ├── DummyOpenMPRuntime.f90 ├── FortranHelpers.cmake ├── TestCXXOpenMPRuntime.cmake └── TestFortranOpenMPRuntime.cmake ├── hands-on ├── CMakeLists.txt ├── README.md ├── cleanup.sh ├── common │ └── timer.h ├── gemm │ ├── 0-gemmNN-serial │ │ ├── 0-gemmNN-serial.cpp │ │ └── Makefile │ ├── 0-gemmNT-serial │ │ ├── Makefile │ │ └── gemmNT-serial.cpp │ ├── 1-gemmNN-omp-thread │ │ ├── 1-gemmNN-omp-thread.cpp │ │ └── Makefile │ ├── 1-gemmNT-omp │ │ ├── Makefile │ │ └── gemmNT-omp.cpp │ ├── 2-gemmNN-omp-target │ │ ├── 2-gemmNN-omp-target.cpp │ │ └── Makefile │ ├── 2-gemmNT-omp-target │ │ ├── Makefile │ │ └── gemmNT-omp-target.cpp │ └── gemmNN │ │ ├── Makefile │ │ └── gemmNN.cpp ├── gemv │ ├── 01-gemv-omp │ │ ├── Makefile │ │ ├── gemv-omp.cpp │ │ └── gemv-omp.f90 │ ├── 02-gemv-omp-target │ │ ├── Makefile │ │ ├── gemv-omp-target.cpp │ │ └── gemv-omp-target.f90 │ ├── 03-gemv-omp-target-teams │ │ ├── Makefile │ │ ├── gemv-omp-target-teams.cpp │ │ └── gemv-omp-target-teams.f90 │ ├── 04-gemv-omp-target-reduction │ │ ├── Makefile │ │ ├── gemv-omp-target-reduction.cpp │ │ └── gemv-omp-target-reduction.f90 │ ├── 05-gemv-omp-target-split-parallel-for-reduction │ │ ├── Makefile │ │ └── gemv-omp-target-split-parallel-for-reduction.cpp │ ├── 51-gemv-omp-many-matrices │ │ ├── Makefile │ │ ├── gemv-omp-many-matrices.cpp │ │ └── gemv-omp-many-matrices.f90 │ ├── 52-gemv-omp-target-many-matrices-no-hierachy │ │ ├── Makefile │ │ ├── gemv-omp-target-many-matrices-no-hierachy.cpp │ │ └── gemv-omp-target-many-matrices-no-hierachy.f90 │ ├── 53-gemv-omp-target-many-matrices │ │ ├── Makefile │ │ ├── gemv-omp-target-many-matrices.cpp │ │ └── gemv-omp-target-many-matrices.f90 │ ├── 54-gemv-omp-target-many-matrices-multi-devices │ │ ├── Makefile │ │ └── gemv-omp-target-many-matrices-multi-devices.cpp │ ├── 55-gemv-omp-target-many-matrices-taskloop │ │ ├── Makefile │ │ └── 
gemv-omp-target-many-matrices-taskloop.cpp │ ├── CMakeLists.txt │ ├── README │ └── build_and_run_all.sh ├── make.aomp.inc ├── make.gcc-nv.inc ├── make.icx.inc ├── make.llvm.inc └── make.xl.inc ├── integration └── crusher_recipe │ ├── README │ └── modules │ └── cray-mpich-afar.lua └── tests ├── CMakeLists.txt ├── allocator ├── CMakeLists.txt └── omp_pteam_mem_alloc.cpp ├── complex ├── CMakeLists.txt ├── complex.cpp ├── complex.f90 ├── complex_reduction.cpp └── complex_reduction_cpu.cpp ├── cudafor_omp ├── README.md ├── noomp.f90 ├── omp_above.f90 └── omp_below.f90 ├── fortran_allocator ├── CMakeLists.txt ├── dualspace.f90 ├── dualspace_array_device.f90 ├── dualspace_array_device_isptr.f90 └── dualspace_array_resize.f90 ├── fortran_use_device_ptr ├── CMakeLists.txt └── use_device_ptr_target.f90 ├── global_variable ├── CMakeLists.txt ├── constexpr │ ├── CMakeLists.txt │ └── constexpr.cpp ├── global_pointer │ ├── CMakeLists.txt │ ├── Makefile │ ├── global.cpp │ ├── global.h │ └── main.cpp └── global_static │ ├── CMakeLists.txt │ ├── Makefile │ ├── data.cpp │ ├── data.hpp │ └── main.cpp ├── implict_async ├── CMakeLists.txt ├── llvm_alloc_host.cpp └── llvm_alloc_host_data.cpp ├── linking ├── CMakeLists.txt ├── link_static_fat_bin │ ├── CMakeLists.txt │ ├── classA.cpp │ ├── classA.h │ ├── compile-amd.sh │ ├── compile-x86.sh │ ├── compile.sh │ └── main.cpp ├── linker_outlined_func │ ├── CMakeLists.txt │ ├── a.cpp │ ├── ab.h │ ├── b.cpp │ ├── compile.sh │ ├── compute.h │ └── main.cpp ├── missing_bundles │ ├── boo.cpp │ ├── compile.sh │ ├── foo.cpp │ └── main.cpp └── two_identical_templates │ ├── CMakeLists.txt │ ├── main.cpp │ ├── test_a.cpp │ └── test_b.cpp ├── map ├── CMakeLists.txt ├── check_transfer.cpp ├── declare_target_global.cpp ├── first_private_this_wrong.cpp ├── implicit_map_alloc.f90 ├── map_class_member.cpp ├── map_delete_inside_data.cpp ├── map_threads.cpp ├── pointer_api.cpp ├── struct_with_const.cpp └── this_with_virtual.cpp ├── math ├── CMakeLists.txt ├── 
FP_ZERO.cpp ├── README ├── header_only.cpp ├── modf.cpp ├── modf_in_branch.cpp ├── modf_team.cpp ├── sin_cos.cpp ├── sin_simd.cpp ├── sincos.cpp ├── sincos_simd.cpp ├── sincos_simd_template.cpp └── sqrt_simd.cpp ├── omphost ├── CMakeLists.txt ├── README.md └── host_bug_libomp.cpp ├── private ├── CMakeLists.txt ├── run_all.sh ├── target__teams__distribute_private.cpp ├── target__teams_distribute_private.cpp ├── target__teams_private__distribute.cpp ├── target_local_block.f90 ├── target_teams__distribute_private.cpp ├── target_teams_distribute_parallel_for_private.cpp ├── target_teams_distribute_parallel_for_private.f90 ├── target_teams_distribute_private.cpp ├── target_teams_distribute_private.f90 └── target_teams_private__distribute.cpp ├── reduction ├── CMakeLists.txt ├── README.md └── array_reduction.cpp ├── sollve_vv └── sollve_vv_aomp.sh ├── target_task ├── CMakeLists.txt ├── omp-task-bug.cpp ├── target_nowait_task.cpp ├── target_nowait_taskwait.cpp ├── target_taskwait.cpp ├── target_update_nowait_taskwait.cpp ├── taskloop.cpp └── taskloop_offload_nowait.cpp └── tasks └── implicit_shared.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -2 4 | AlignAfterOpenBracket: Align 5 | AlignConsecutiveAssignments: true 6 | AlignConsecutiveDeclarations: false 7 | AlignEscapedNewlines: Left 8 | AlignOperands: false 9 | AlignTrailingComments: true 10 | AllowAllParametersOfDeclarationOnNextLine: false 11 | AllowShortBlocksOnASingleLine: true 12 | AllowShortCaseLabelsOnASingleLine: false 13 | AllowShortFunctionsOnASingleLine: All 14 | AllowShortIfStatementsOnASingleLine: false 15 | AllowShortLoopsOnASingleLine: false 16 | AlwaysBreakAfterDefinitionReturnType: None 17 | AlwaysBreakAfterReturnType: None 18 | AlwaysBreakBeforeMultilineStrings: false 19 | AlwaysBreakTemplateDeclarations: true 20 | BinPackArguments: true 21 | BinPackParameters: false 22 | BraceWrapping: 23 | 
AfterClass: true 24 | AfterControlStatement: true 25 | AfterEnum: true 26 | AfterFunction: true 27 | AfterNamespace: true 28 | AfterObjCDeclaration: true 29 | AfterStruct: true 30 | AfterUnion: true 31 | AfterExternBlock: true 32 | BeforeCatch: true 33 | BeforeElse: true 34 | IndentBraces: false 35 | SplitEmptyFunction: false 36 | SplitEmptyRecord: false 37 | SplitEmptyNamespace: false 38 | BreakBeforeBinaryOperators: None 39 | BreakBeforeBraces: Custom 40 | BreakBeforeInheritanceComma: false 41 | BreakBeforeTernaryOperators: true 42 | BreakConstructorInitializersBeforeComma: false 43 | BreakConstructorInitializers: BeforeColon 44 | BreakAfterJavaFieldAnnotations: false 45 | BreakStringLiterals: true 46 | ColumnLimit: 120 47 | CommentPragmas: '^ IWYU pragma:' 48 | CompactNamespaces: false 49 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 50 | ConstructorInitializerIndentWidth: 4 51 | ContinuationIndentWidth: 4 52 | Cpp11BracedListStyle: true 53 | DerivePointerAlignment: false 54 | DisableFormat: false 55 | ExperimentalAutoDetectBinPacking: false 56 | FixNamespaceComments: true 57 | ForEachMacros: 58 | - foreach 59 | - Q_FOREACH 60 | - BOOST_FOREACH 61 | IncludeBlocks: Preserve 62 | IncludeCategories: 63 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 64 | Priority: 2 65 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 66 | Priority: 3 67 | - Regex: '.*' 68 | Priority: 1 69 | IncludeIsMainRegex: '(Test)?$' 70 | IndentCaseLabels: false 71 | IndentPPDirectives: None 72 | IndentWidth: 2 73 | IndentWrappedFunctionNames: true 74 | JavaScriptQuotes: Leave 75 | JavaScriptWrapImports: true 76 | KeepEmptyLinesAtTheStartOfBlocks: false 77 | MacroBlockBegin: '' 78 | MacroBlockEnd: '' 79 | MaxEmptyLinesToKeep: 2 80 | NamespaceIndentation: None 81 | ObjCBlockIndentWidth: 2 82 | ObjCSpaceAfterProperty: false 83 | ObjCSpaceBeforeProtocolList: true 84 | PenaltyBreakAssignment: 2 85 | PenaltyBreakBeforeFirstCallParameter: 30000 86 | PenaltyBreakComment: 300 87 | 
PenaltyBreakFirstLessLess: 120 88 | PenaltyBreakString: 1000 89 | PenaltyExcessCharacter: 1000000 90 | PenaltyReturnTypeOnItsOwnLine: 10000 91 | PointerAlignment: Left 92 | ReflowComments: false 93 | SortIncludes: false 94 | SortUsingDeclarations: true 95 | SpaceAfterCStyleCast: false 96 | SpaceAfterTemplateKeyword: false 97 | SpaceBeforeAssignmentOperators: true 98 | SpaceBeforeParens: ControlStatements 99 | SpaceInEmptyParentheses: false 100 | SpacesBeforeTrailingComments: 1 101 | SpacesInAngles: false 102 | SpacesInContainerLiterals: true 103 | SpaceBeforeCtorInitializerColon: true 104 | SpaceBeforeInheritanceColon: true 105 | SpaceBeforeRangeBasedForLoopColon: true 106 | SpaceInEmptyParentheses: false 107 | SpacesInCStyleCastParentheses: false 108 | SpacesInParentheses: false 109 | SpacesInSquareBrackets: false 110 | Standard: Cpp11 111 | TabWidth: 8 112 | UseTab: Never 113 | ... 114 | 115 | -------------------------------------------------------------------------------- /.github/workflows/ci-github-actions.yaml: -------------------------------------------------------------------------------- 1 | name: GitHub Actions CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | 13 | linux: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | 18 | steps: 19 | - name: Checkout Action 20 | uses: actions/checkout@v3 21 | 22 | - name: Setup Dependencies 23 | run: | 24 | sudo apt install g++-10 libgomp1 25 | 26 | - name: Configure 27 | run: | 28 | mkdir build_gcc_noomp; cd build_gcc_noomp 29 | cmake -DCMAKE_CXX_COMPILER=g++-10 -DCMAKE_Fortran_COMPILER=gfortran-10 .. 
30 | 31 | - name: Build 32 | run: cd build_gcc_noomp; make -j2 -k 33 | 34 | - name: Test 35 | run: cd build_gcc_noomp; ctest --output-on-failure 36 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | 3 | # set the project name 4 | project(openmp-target LANGUAGES NONE) 5 | 6 | OPTION(ENABLE_CXX "Enable/disable C++ tests" ON) 7 | OPTION(ENABLE_Fortran "Enable/disable Fortran tests" ON) 8 | OPTION(ENABLE_EXPERIMENTAL "Enable/disable experimental tests" OFF) 9 | 10 | if (NOT CMAKE_BUILD_TYPE) 11 | set(CMAKE_BUILD_TYPE Release) 12 | endif() 13 | 14 | if (ENABLE_CXX) 15 | enable_language(CXX) 16 | # requires C++14 standard 17 | set(CMAKE_CXX_STANDARD 14) 18 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 19 | set(CMAKE_CXX_EXTENSIONS OFF) 20 | include(cmake/TestCXXOpenMPRuntime.cmake) 21 | endif() 22 | 23 | if (ENABLE_Fortran) 24 | enable_language(Fortran) 25 | include(cmake/FortranHelpers.cmake) 26 | include(cmake/TestFortranOpenMPRuntime.cmake) 27 | endif() 28 | 29 | enable_testing() 30 | 31 | if (CMAKE_CXX_COMPILER_ID MATCHES "PGI" OR CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") 32 | add_definitions(-D__NO_UDR) 33 | endif() 34 | 35 | add_subdirectory(hands-on) 36 | add_subdirectory(tests) 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The 3-Clause BSD License 2 | Copyright (c) 2019-2021 Ye Luo. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
7 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A collection of OpenMP tests for C++ and Fortran compilers 2 | 3 | Recipe example 4 | ``` 5 | mkdir build_gcc_omp 6 | cd build_gcc_omp 7 | cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_Fortran_COMPILER=gfortran \ 8 | -DCMAKE_CXX_FLAGS=-fopenmp -DCMAKE_Fortran_FLAGS=-fopenmp \ 9 | .. 
10 | make -k -j 16 11 | ctest 12 | ``` 13 | 14 | ### 15 | OpenMP offload compiler options 16 | https://github.com/ye-luo/openmp-target/wiki/OpenMP-offload-compilers 17 | -------------------------------------------------------------------------------- /cmake/DummyOpenMPRuntime.f90: -------------------------------------------------------------------------------- 1 | module omp_lib 2 | contains 3 | function omp_get_num_threads() 4 | implicit none 5 | integer omp_get_num_threads 6 | omp_get_num_threads = 1 7 | end function omp_get_num_threads 8 | 9 | function omp_get_num_teams() 10 | implicit none 11 | integer omp_get_num_teams 12 | omp_get_num_teams = 1 13 | end function omp_get_num_teams 14 | 15 | function omp_get_thread_num() 16 | implicit none 17 | integer omp_get_thread_num 18 | omp_get_thread_num = 0 19 | end function omp_get_thread_num 20 | 21 | function omp_get_team_num() 22 | implicit none 23 | integer omp_get_team_num 24 | omp_get_team_num = 0 25 | end function omp_get_team_num 26 | end module 27 | -------------------------------------------------------------------------------- /cmake/FortranHelpers.cmake: -------------------------------------------------------------------------------- 1 | function(fix_fortran_modules TGT) 2 | set(targets ${TGT} ${ARGN}) 3 | foreach(tgt IN LISTS targets) 4 | get_target_property(tgt_type ${tgt} TYPE) 5 | # All of the following target modifications make 6 | # sense on non-interfaces only 7 | if(NOT ${tgt_type} STREQUAL "INTERFACE_LIBRARY") 8 | get_target_property(tgt_module_dir ${tgt} Fortran_MODULE_DIRECTORY) 9 | # set module path to tgt_binary_dir/mod 10 | get_target_property(tgt_binary_dir ${tgt} BINARY_DIR) 11 | set_target_properties(${tgt} 12 | PROPERTIES 13 | Fortran_MODULE_DIRECTORY ${tgt_binary_dir}/mod/${TGT}) 14 | # make module directory available for clients of TGT 15 | target_include_directories(${tgt} 16 | PUBLIC 17 | $ 18 | INTERFACE 19 | $) 20 | endif() 21 | endforeach() 22 | endfunction(fix_fortran_modules) 23 | 
-------------------------------------------------------------------------------- /cmake/TestCXXOpenMPRuntime.cmake: -------------------------------------------------------------------------------- 1 | include(CheckCXXSourceCompiles) 2 | 3 | list(PREPEND OPENMP_CXX_COMPILE_OPTIONS ${OPENMP_CXX_FLAGS} ${OPENMP_FLAGS}) 4 | list(PREPEND OPENMP_CXX_LINK_OPTIONS ${OPENMP_CXX_FLAGS} ${OPENMP_FLAGS}) 5 | 6 | set(CMAKE_REQUIRED_FLAGS ${OPENMP_CXX_COMPILE_OPTIONS}) 7 | #set(CMAKE_REQUIRED_LINK_OPTIONS ${OPENMP_CXX_LINK_OPTIONS}) 8 | 9 | check_cxx_source_compiles( 10 | "#include <omp.h> 11 | int main() 12 | { int a = omp_get_num_threads(); }" 13 | CXX_OPENMP_RUNTIME_OKAY 14 | ) 15 | 16 | add_library(qmc_openmp_cxx INTERFACE) 17 | 18 | # Attach the OpenMP options to the interface target only when the runtime check passes. 19 | if (CXX_OPENMP_RUNTIME_OKAY) 20 | message(STATUS "C++ OpenMP functionality check pass") 21 | target_compile_options(qmc_openmp_cxx INTERFACE "${OPENMP_CXX_COMPILE_OPTIONS}") 22 | target_link_options(qmc_openmp_cxx INTERFACE "${OPENMP_CXX_LINK_OPTIONS}") 23 | else() 24 | message(STATUS "C++ OpenMP functionality check failed!") 25 | endif() 26 | 27 | check_cxx_source_compiles( 28 | "#include <omp.h> 29 | int main() 30 | { int a = omp_target_is_present(nullptr, 0); }" 31 | CXX_OFFLOAD_RUNTIME_OKAY 32 | ) 33 | 34 | if (CXX_OFFLOAD_RUNTIME_OKAY) 35 | message(STATUS "CXX compiler has OpenMP offload runtime library.") 36 | else() 37 | message(STATUS "CXX compiler doesn't have OpenMP offload runtime library.") 38 | endif() 39 | -------------------------------------------------------------------------------- /cmake/TestFortranOpenMPRuntime.cmake: -------------------------------------------------------------------------------- 1 | set(TEST_OPENMP_RUNTIME_SOURCE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/try_openmp_runtime.f90) 2 | file(WRITE ${TEST_OPENMP_RUNTIME_SOURCE} 3 | "program test_open_runtime 4 | use omp_lib 5 | implicit none 6 | integer :: num 7 | num = omp_get_thread_num() 8 | end program 9 | ") 10 | 11 | 12 | try_compile(Fortran_OPENMP_RUNTIME_OKAY
${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp 13 | ${TEST_OPENMP_RUNTIME_SOURCE} 14 | OUTPUT_VARIABLE COMPILE_OUTPUT) 15 | 16 | if (NOT Fortran_OPENMP_RUNTIME_OKAY) 17 | set(COMPILE_FAIL_OUTPUT fortran_openmp_runtime_compile_fail.txt) 18 | file(WRITE "${CMAKE_BINARY_DIR}/${COMPILE_FAIL_OUTPUT}" "${COMPILE_OUTPUT}") 19 | message(STATUS "Fortran OpenMP functionality check failed!" 20 | "See compiler output at ${COMPILE_FAIL_OUTPUT}") 21 | add_library(dummy_openmp_runtime cmake/DummyOpenMPRuntime.f90) 22 | fix_fortran_modules(dummy_openmp_runtime) 23 | else() 24 | add_library(dummy_openmp_runtime INTERFACE) 25 | message(STATUS "Fortran OpenMP functionality check pass") 26 | endif() 27 | -------------------------------------------------------------------------------- /hands-on/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(gemv) 2 | -------------------------------------------------------------------------------- /hands-on/README.md: -------------------------------------------------------------------------------- 1 | Before building any executable, create make.inc under hands-on based on your machine. 2 | Use one of the provided templates (e.g. make.llvm.inc) as an example.
3 | -------------------------------------------------------------------------------- /hands-on/cleanup.sh: -------------------------------------------------------------------------------- 1 | find -name "*.x" -exec rm {} \; 2 | find -name "fetchnode.*" -exec rm {} \; 3 | -------------------------------------------------------------------------------- /hands-on/common/timer.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | class Timer 6 | { 7 | const std::chrono::time_point start; 8 | const std::string name; 9 | 10 | public: 11 | Timer(const std::string& name_in): start(std::chrono::system_clock::now()), name(name_in) {}; 12 | ~Timer() 13 | { 14 | auto end = std::chrono::system_clock::now(); 15 | std::cout << "Function " << name 16 | << " takes " << std::chrono::duration_cast>(end - start).count() 17 | << " us" << std::endl; 18 | } 19 | }; 20 | -------------------------------------------------------------------------------- /hands-on/gemm/0-gemmNN-serial/0-gemmNN-serial.cpp: -------------------------------------------------------------------------------- 1 | #define N 3000 2 | #include "timer.h" 3 | 4 | /* 5 | Multiplies two matrices of dimension n x n and passes back resulting matrix. 6 | */ 7 | template 8 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ B, T* __restrict__ result) 9 | { 10 | for (int row = 0; row < n; row++) 11 | for (int col = 0; col < n; col++) 12 | { 13 | const T* __restrict__ A_row = A + row * n; 14 | T sum(0); 15 | const T* __restrict__ B_col = B + col; 16 | for(int i = 0; i < n; i++) 17 | { 18 | sum += A_row[i] * B_col[i * n]; 19 | } 20 | const int index = (row * n) + col; 21 | result[index] = sum * alpha; 22 | } 23 | } 24 | 25 | /* 26 | Prints 1 dimensional matrix of dimension n x n. 
27 | */ 28 | template 29 | void printMatrix(int n, T* __restrict__ A) 30 | { 31 | for(int i = 0; i < n * n; i++) 32 | std::cout << A[i]; 33 | } 34 | 35 | /* 36 | Creates 1 dimensional matrix of size n and fills with T(1). 37 | */ 38 | template 39 | T* allocate(size_t n) 40 | { 41 | T* ptr = new T[n]; 42 | std::fill_n(ptr, n, T(1)); 43 | return ptr; 44 | } 45 | 46 | /* 47 | Frees up space from 1 dimensional matrix. 48 | */ 49 | template 50 | void deallocate(T* ptr, size_t n) 51 | { 52 | delete[] ptr; 53 | } 54 | 55 | void testtbt() 56 | { 57 | std::cout << "Testing 3x3 matrix multiplication.\n"; 58 | int dim = 3; 59 | auto* C = allocate(dim * dim); 60 | auto* D = allocate(dim * dim); 61 | auto* R = allocate(dim * dim); 62 | std::cout << "Result calculated by hand: 010202010\n"; 63 | for(int i = 0; i < dim * dim; i++) 64 | { 65 | if( i % 2 == 0) 66 | { 67 | C[i] = 0; 68 | D[i] = 1; 69 | } else 70 | { 71 | C[i] = 1; 72 | D[i] = 0; 73 | } 74 | } 75 | 76 | gemv(dim, 1.0f, C, D, R); 77 | 78 | std::cout << "Matrix C: "; printMatrix(dim, C); std::cout << "\n"; 79 | std::cout << "Matrix D: "; printMatrix(dim, D); std::cout << "\n"; 80 | std::cout << "Matrix R: "; printMatrix(dim, R); std::cout << "\n"; 81 | 82 | deallocate(C, dim * dim); 83 | deallocate(D, dim * dim); 84 | deallocate(R, dim * dim); 85 | } 86 | 87 | int main() 88 | { 89 | auto* A = allocate(N * N); 90 | auto* B = allocate(N * N); 91 | auto* result = allocate(N * N); 92 | 93 | // Debugging 94 | //std::cout << "Matrix A: "; printMatrix(N, A); std::cout << "\n"; 95 | //std::cout << "Matrix B: "; printMatrix(N, B); std::cout << "\n"; 96 | 97 | Timer local("GEMV"); 98 | gemv(N, 1.0f, A, B, result); 99 | 100 | // testtbt(); 101 | 102 | // Debugging 103 | //std::cout << "Matrix Result: "; printMatrix(N, result); std::cout << "\n"; 104 | 105 | deallocate(A, N * N); 106 | deallocate(B, N * N); 107 | deallocate(result, N * N); 108 | 109 | } 110 | 
-------------------------------------------------------------------------------- /hands-on/gemm/0-gemmNN-serial/Makefile: -------------------------------------------------------------------------------- 1 | name=0-gemmNN-serial 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvida-cuda -Xopenmp-target -march=sm_60 -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | -------------------------------------------------------------------------------- /hands-on/gemm/0-gemmNT-serial/Makefile: -------------------------------------------------------------------------------- 1 | name=gemmNT-serial 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -march=native -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | -------------------------------------------------------------------------------- /hands-on/gemm/0-gemmNT-serial/gemmNT-serial.cpp: -------------------------------------------------------------------------------- 1 | #define N 1024 2 | #include "timer.h" 3 | 4 | template 5 | void gemmT(int n, const T* restrict A, const T* restrict B, T* restrict C) 6 | { 7 | for(int rowA=0; rowA 20 | T* allocate(int n) 21 | { 22 | T* ptr = new T[n]; 23 | std::fill_n(ptr, n, T(1)); 24 | return ptr; 25 | } 26 | 27 | template 28 | void deallocate(T* ptr, int n) 29 | { 30 | delete[] ptr; 31 | } 32 | 33 | int main() 34 | { 35 | auto* A = allocate(N*N); 36 | auto* B = allocate(N*N); 37 | auto* C = allocate(N*N); 38 | 39 | { 40 | Timer local("GEMMT"); 41 | gemmT(N, A, B, C); 42 | } 43 | 44 | deallocate(A, N*N); 45 | deallocate(B, N*N); 46 | deallocate(C, N*N); 47 | } 48 | -------------------------------------------------------------------------------- /hands-on/gemm/1-gemmNN-omp-thread/1-gemmNN-omp-thread.cpp: -------------------------------------------------------------------------------- 1 | #define N 3000 2 | #include "timer.h" 3 | 4 | /* 5 | 
Multiplies two matrices of dimension n x n and passes back resulting matrix. 6 | */ 7 | template 8 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ B, T* __restrict__ result) 9 | { 10 | #pragma omp parallel for collapse(2) 11 | // target works with teams and map to offload data to GPU 12 | // teams distribute breaks execution of loops into teams of threads 13 | // map:to offloads data to GPU 14 | // map:from writes date from GPU to devide 15 | for (int row = 0; row < n; row++) 16 | for (int col = 0; col < n; col++) 17 | { 18 | const T* __restrict__ A_row = A + row * n; 19 | T sum(0); 20 | const T* __restrict__ B_col = B + col; 21 | // can move pragma for here 22 | for(int i = 0; i < n; i++) 23 | { 24 | sum += A_row[i] * B_col[i * n]; 25 | } 26 | const int index = (row * n) + col; 27 | result[index] = sum * alpha; 28 | } 29 | } 30 | 31 | /* 32 | Prints 1 dimensional matrix of dimension n x n. 33 | */ 34 | template 35 | void printMatrix(int n, T* __restrict__ A) 36 | { 37 | for(int i = 0; i < n * n; i++) 38 | std::cout << A[i]; 39 | } 40 | 41 | /* 42 | Creates 1 dimensional matrix of size n and fills with T(1). 43 | */ 44 | template 45 | T* allocate(size_t n) 46 | { 47 | T* ptr = new T[n]; 48 | std::fill_n(ptr, n, T(1)); 49 | return ptr; 50 | } 51 | 52 | /* 53 | Frees up space from 1 dimensional matrix. 
54 | */ 55 | template 56 | void deallocate(T* ptr, size_t n) 57 | { 58 | delete[] ptr; 59 | } 60 | 61 | void testtbt() 62 | { 63 | std::cout << "Testing 3x3 matrix multiplication.\n"; 64 | int dim = 3; 65 | auto* C = allocate(dim * dim); 66 | auto* D = allocate(dim * dim); 67 | auto* R = allocate(dim * dim); 68 | std::cout << "Result calculated by hand: 010202010\n"; 69 | for(int i = 0; i < dim * dim; i++) 70 | { 71 | if( i % 2 == 0) 72 | { 73 | C[i] = 0; 74 | D[i] = 1; 75 | } else 76 | { 77 | C[i] = 1; 78 | D[i] = 0; 79 | } 80 | } 81 | 82 | gemv(dim, 1.0f, C, D, R); 83 | 84 | std::cout << "Matrix C: "; printMatrix(dim, C); std::cout << "\n"; 85 | std::cout << "Matrix D: "; printMatrix(dim, D); std::cout << "\n"; 86 | std::cout << "Matrix R: "; printMatrix(dim, R); std::cout << "\n"; 87 | 88 | deallocate(C, dim * dim); 89 | deallocate(D, dim * dim); 90 | deallocate(R, dim * dim); 91 | } 92 | 93 | int main() 94 | { 95 | auto* A = allocate(N * N); 96 | auto* B = allocate(N * N); 97 | auto* result = allocate(N * N); 98 | 99 | // Debugging 100 | //std::cout << "Matrix A: "; printMatrix(N, A); std::cout << "\n"; 101 | //std::cout << "Matrix B: "; printMatrix(N, B); std::cout << "\n"; 102 | 103 | Timer local("GEMV"); 104 | gemv(N, 1.0f, A, B, result); 105 | 106 | // testtbt(); 107 | 108 | // Debugging 109 | //std::cout << "Matrix Result: "; printMatrix(N, result); std::cout << "\n"; 110 | 111 | deallocate(A, N * N); 112 | deallocate(B, N * N); 113 | deallocate(result, N * N); 114 | 115 | } 116 | -------------------------------------------------------------------------------- /hands-on/gemm/1-gemmNN-omp-thread/Makefile: -------------------------------------------------------------------------------- 1 | name=1-gemmNN-omp-thread 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvida-cuda -Xopenmp-target -march=sm_60 -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | 
-------------------------------------------------------------------------------- /hands-on/gemm/1-gemmNT-omp/Makefile: -------------------------------------------------------------------------------- 1 | name=gemmNT-omp 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -march=native -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | -------------------------------------------------------------------------------- /hands-on/gemm/1-gemmNT-omp/gemmNT-omp.cpp: -------------------------------------------------------------------------------- 1 | #define N 1024 2 | #include "timer.h" 3 | 4 | template 5 | void gemmT(int n, const T* restrict A, const T* restrict B, T* restrict C) 6 | { 7 | #pragma omp parallel for 8 | for(int rowA=0; rowA 21 | T* allocate(int n) 22 | { 23 | T* ptr = new T[n]; 24 | std::fill_n(ptr, n, T(1)); 25 | return ptr; 26 | } 27 | 28 | template 29 | void deallocate(T* ptr, int n) 30 | { 31 | delete[] ptr; 32 | } 33 | 34 | int main() 35 | { 36 | auto* A = allocate(N*N); 37 | auto* B = allocate(N*N); 38 | auto* C = allocate(N*N); 39 | 40 | { 41 | Timer local("GEMMT"); 42 | gemmT(N, A, B, C); 43 | } 44 | 45 | deallocate(A, N*N); 46 | deallocate(B, N*N); 47 | deallocate(C, N*N); 48 | } 49 | -------------------------------------------------------------------------------- /hands-on/gemm/2-gemmNN-omp-target/2-gemmNN-omp-target.cpp: -------------------------------------------------------------------------------- 1 | #define N 3000 2 | #include "timer.h" 3 | 4 | /* 5 | Multiplies two matrices of dimension n x n and passes back resulting matrix. 
6 | */ 7 | template 8 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ B, T* __restrict__ result) 9 | { 10 | #pragma omp target teams distribute collapse(2) map(to:A[:n*n], B[:n*n]) map(from:result[:n*n]) 11 | // target works with teams and map to offload data to GPU 12 | // teams distribute breaks execution of loops into teams of threads 13 | // map:to offloads data to GPU 14 | // map:from writes date from GPU to devide 15 | for (int row = 0; row < n; row++) 16 | for (int col = 0; col < n; col++) 17 | { 18 | const T* __restrict__ A_row = A + row * n; 19 | T sum(0); 20 | const T* __restrict__ B_col = B + col; 21 | // can move pragma for here 22 | #pragma omp parallel for reduction(+:sum) 23 | for(int i = 0; i < n; i++) 24 | { 25 | sum += A_row[i] * B_col[i * n]; 26 | } 27 | const int index = (row * n) + col; 28 | result[index] = sum * alpha; 29 | } 30 | } 31 | 32 | /* 33 | Prints 1 dimensional matrix of dimension n x n. 34 | */ 35 | template 36 | void printMatrix(int n, T* __restrict__ A) 37 | { 38 | for(int i = 0; i < n * n; i++) 39 | std::cout << A[i]; 40 | } 41 | 42 | /* 43 | Creates 1 dimensional matrix of size n and fills with T(1). 44 | */ 45 | template 46 | T* allocate(size_t n) 47 | { 48 | T* ptr = new T[n]; 49 | std::fill_n(ptr, n, T(1)); 50 | return ptr; 51 | } 52 | 53 | /* 54 | Frees up space from 1 dimensional matrix. 
55 | */ 56 | template 57 | void deallocate(T* ptr, size_t n) 58 | { 59 | delete[] ptr; 60 | } 61 | 62 | void testtbt() 63 | { 64 | std::cout << "Testing 3x3 matrix multiplication.\n"; 65 | int dim = 3; 66 | auto* C = allocate(dim * dim); 67 | auto* D = allocate(dim * dim); 68 | auto* R = allocate(dim * dim); 69 | std::cout << "Result calculated by hand: 010202010\n"; 70 | for(int i = 0; i < dim * dim; i++) 71 | { 72 | if( i % 2 == 0) 73 | { 74 | C[i] = 0; 75 | D[i] = 1; 76 | } else 77 | { 78 | C[i] = 1; 79 | D[i] = 0; 80 | } 81 | } 82 | 83 | gemv(dim, 1.0f, C, D, R); 84 | 85 | std::cout << "Matrix C: "; printMatrix(dim, C); std::cout << "\n"; 86 | std::cout << "Matrix D: "; printMatrix(dim, D); std::cout << "\n"; 87 | std::cout << "Matrix R: "; printMatrix(dim, R); std::cout << "\n"; 88 | 89 | deallocate(C, dim * dim); 90 | deallocate(D, dim * dim); 91 | deallocate(R, dim * dim); 92 | } 93 | 94 | int main() 95 | { 96 | auto* A = allocate(N * N); 97 | auto* B = allocate(N * N); 98 | auto* result = allocate(N * N); 99 | 100 | // Debugging 101 | //std::cout << "Matrix A: "; printMatrix(N, A); std::cout << "\n"; 102 | //std::cout << "Matrix B: "; printMatrix(N, B); std::cout << "\n"; 103 | 104 | Timer local("GEMV"); 105 | gemv(N, 1.0f, A, B, result); 106 | 107 | // testtbt(); 108 | 109 | // Debugging 110 | //std::cout << "Matrix Result: "; printMatrix(N, result); std::cout << "\n"; 111 | 112 | deallocate(A, N * N); 113 | deallocate(B, N * N); 114 | deallocate(result, N * N); 115 | 116 | } 117 | -------------------------------------------------------------------------------- /hands-on/gemm/2-gemmNN-omp-target/Makefile: -------------------------------------------------------------------------------- 1 | name=2-gemmNN-omp-target 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvida-cuda -Xopenmp-target -march=sm_60 -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | 
--------------------------------------------------------------------------------
/hands-on/gemm/2-gemmNT-omp-target/Makefile:
--------------------------------------------------------------------------------
1 | name=gemmNT-omp-target
2 | ${name}.x: ${name}.cpp
3 | 	clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -O3 -g -o $@ -I ../../common $<
4 | 
5 | .PHONY : clean
6 | clean :
7 | 	rm -f *.x
8 | 
--------------------------------------------------------------------------------
/hands-on/gemm/2-gemmNT-omp-target/gemmNT-omp-target.cpp:
--------------------------------------------------------------------------------
1 | #define N 1024
2 | #include "timer.h"
3 | 
4 | template<typename T>
5 | void gemmT(int n, const T* restrict A, const T* restrict B, T* restrict C)
6 | {
7 | #pragma omp target teams distribute parallel for collapse(2) map(to:A[:n*n], B[:n*n]) map(tofrom:C[:n*n])
8 | for(int rowA=0; rowA<n; rowA++)
9 | for(int rowB=0; rowB<n; rowB++)
10 | {
11 | T sum(0);
12 | for(int i=0; i<n; i++)
13 | sum += A[rowA*n+i]*B[rowB*n+i];
14 | C[rowA*n+rowB] = sum;
15 | }
16 | }
17 | 
18 | // NOTE(review): lines 8-19 of this file were lost to angle-bracket stripping in the
19 | // dump; the NT (B accessed by row, i.e. transposed) kernel above is a reconstruction — verify upstream.
20 | template<typename T>
21 | T* allocate(int n)
22 | {
23 | T* ptr = new T[n];
24 | std::fill_n(ptr, n, T(1));
25 | #pragma omp target enter data map(to:ptr[:n])
26 | return ptr;
27 | }
28 | 
29 | template<typename T>
30 | void deallocate(T* ptr, int n)
31 | {
32 | #pragma omp target exit data map(delete:ptr[:n])
33 | delete[] ptr;
34 | }
35 | 
36 | int main()
37 | {
38 | auto* A = allocate<float>(N*N);
39 | auto* B = allocate<float>(N*N);
40 | auto* C = allocate<float>(N*N);
41 | 
42 | {
43 | Timer local("GEMMT");
44 | gemmT(N, A, B, C);
45 | }
46 | 
47 | deallocate(A, N*N);
48 | deallocate(B, N*N);
49 | deallocate(C, N*N);
50 | }
51 | 
--------------------------------------------------------------------------------
/hands-on/gemm/gemmNN/Makefile:
--------------------------------------------------------------------------------
1 | name=gemmNN
2 | ${name}.x: ${name}.cpp
3 | 	clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 -O3 -g -o $@ -I ../../common $<
4 | 
5 | .PHONY : clean
6 | clean :
7 | 	rm -f *.x
8 | 
--------------------------------------------------------------------------------
/hands-on/gemm/gemmNN/gemmNN.cpp:
--------------------------------------------------------------------------------
1 | #define N 1000
2 | #include "timer.h"
3 | 
4 | /*
5 | Multiplies two matrices of dimension n x n and passes back resulting matrix.
6 | */
7 | template<typename T>
8 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ B, T* __restrict__ result)
9 | {
10 | #pragma omp target teams distribute collapse(2) map(to:A[:n*n], B[:n*n]) map(from:result[:n*n])
11 | // target works with teams and map to offload data to GPU
12 | // teams distribute breaks execution of loops into teams of threads
13 | // map:to offloads data to GPU
14 | // map:from writes data from the GPU back to the host
15 | for (int row = 0; row < n; row++)
16 | for (int col = 0; col < n; col++)
17 | {
18 | const T* __restrict__ A_row = A + row * n;
19 | T sum(0);
20 | const T* __restrict__ B_col = B + col;
21 | // can move pragma for here
22 | #pragma omp parallel for reduction(+:sum)
23 | for(int i = 0; i < n; i++)
24 | {
25 | sum += A_row[i] * B_col[i * n];
26 | }
27 | const int index = (row * n) + col;
28 | result[index] = sum * alpha;
29 | }
30 | }
31 | 
32 | /*
33 | Prints 1 dimensional matrix of dimension n x n.
34 | */
35 | template<typename T>
36 | void printMatrix(int n, T* __restrict__ A)
37 | {
38 | for(int i = 0; i < n * n; i++)
39 | std::cout << A[i];
40 | }
41 | 
42 | /*
43 | Creates 1 dimensional matrix of size n and fills with T(1).
44 | */
45 | template<typename T>
46 | T* allocate(size_t n)
47 | {
48 | T* ptr = new T[n];
49 | std::fill_n(ptr, n, T(1));
50 | return ptr;
51 | }
52 | 
53 | /*
54 | Frees up space from 1 dimensional matrix.
55 | */
56 | template<typename T>
57 | void deallocate(T* ptr, size_t n)
58 | {
59 | delete[] ptr;
60 | }
61 | 
62 | void testtbt()
63 | {
64 | std::cout << "Testing 3x3 matrix multiplication.\n";
65 | int dim = 3;
66 | auto* C = allocate<float>(dim * dim);
67 | auto* D = allocate<float>(dim * dim);
68 | auto* R = allocate<float>(dim * dim);
69 | std::cout << "Result calculated by hand: 010202010\n";
70 | for(int i = 0; i < dim * dim; i++)
71 | {
72 | if( i % 2 == 0)
73 | {
74 | C[i] = 0;
75 | D[i] = 1;
76 | } else
77 | {
78 | C[i] = 1;
79 | D[i] = 0;
80 | }
81 | }
82 | 
83 | gemv(dim, 1.0f, C, D, R);
84 | 
85 | std::cout << "Matrix C: "; printMatrix(dim, C); std::cout << "\n";
86 | std::cout << "Matrix D: "; printMatrix(dim, D); std::cout << "\n";
87 | std::cout << "Matrix R: "; printMatrix(dim, R); std::cout << "\n";
88 | 
89 | deallocate(C, dim * dim);
90 | deallocate(D, dim * dim);
91 | deallocate(R, dim * dim);
92 | }
93 | 
94 | int main()
95 | {
96 | auto* A = allocate<float>(N * N);
97 | auto* B = allocate<float>(N * N);
98 | auto* result = allocate<float>(N * N);
99 | 
100 | // Debugging
101 | //std::cout << "Matrix A: "; printMatrix(N, A); std::cout << "\n";
102 | //std::cout << "Matrix B: "; printMatrix(N, B); std::cout << "\n";
103 | 
104 | Timer local("GEMV");
105 | gemv(N, 1.0f, A, B, result);
106 | 
107 | // testtbt();
108 | 
109 | // Debugging
110 | //std::cout << "Matrix Result: "; printMatrix(N, result); std::cout << "\n";
111 | 
112 | deallocate(A, N * N);
113 | deallocate(B, N * N);
114 | deallocate(result, N * N);
115 | 
116 | }
117 | 
--------------------------------------------------------------------------------
/hands-on/gemv/01-gemv-omp/Makefile:
--------------------------------------------------------------------------------
1 | include ../../make.inc
2 | 
3 | name=gemv-omp
4 | 
5 | all_targets=
6 | ifdef ENABLE_C
7 | all_targets += ${name}.c.x
8 | endif
9 | ifdef ENABLE_CXX
10 | all_targets += ${name}.cpp.x
11 | endif
12 | ifdef ENABLE_FC
13 | all_targets += ${name}.f.x
14 | endif
15 | 
16 | all:
${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/01-gemv-omp/gemv-omp.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp parallel for 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | for (int col = 0; col < n; col++) 13 | sum += A_row[col] * V[col]; 14 | Vout[row] = sum * alpha; 15 | } 16 | } 17 | 18 | template 19 | T* allocate(size_t n) 20 | { 21 | T* ptr = new T[n]; 22 | std::fill_n(ptr, n, T(1)); 23 | return ptr; 24 | } 25 | 26 | template 27 | void deallocate(T* ptr, size_t n) 28 | { 29 | delete[] ptr; 30 | } 31 | 32 | int main() 33 | { 34 | auto* A = allocate(N * N); 35 | auto* V = allocate(N); 36 | auto* Vout = allocate(N); 37 | 38 | { 39 | Timer local("GEMV"); 40 | gemv(N, 1.0f, A, V, Vout); 41 | } 42 | 43 | for (int i = 0; i < N; i++) 44 | if (Vout[i] != N) 45 | { 46 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 47 | #if defined(THROW_FAIL) 48 | throw; 49 | #else 50 | break; 51 | #endif 52 | } 53 | 54 | deallocate(A, N * N); 55 | deallocate(V, N); 56 | deallocate(Vout, N); 57 | } 58 | -------------------------------------------------------------------------------- /hands-on/gemv/01-gemv-omp/gemv-omp.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:),V(:),Vout(:) 8 | real(8) :: 
alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: val 12 | 13 | !!starts here 14 | call system_clock(ti,tk) 15 | allocate(A(1:N*N),stat=err) 16 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 17 | allocate(V(1:N),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 19 | allocate(Vout(1:N),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 21 | 22 | A(:) = 1.0 23 | V(:) = 1.0 24 | call gemv(N,alpha,A,V,Vout) 25 | do val=1,N 26 | if (int(Vout(val)) .NE. N) then 27 | write(*,*) "Value does not match at",val,int(Vout(val)) 28 | end if 29 | end do 30 | 31 | 32 | deallocate(A) 33 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 34 | deallocate(V) 35 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 36 | deallocate(Vout) 37 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 38 | call system_clock(tj,tk) 39 | 40 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 41 | 42 | stop 43 | end 44 | 45 | !------------------------------------------------------- 46 | subroutine gemv(nval,alpha,A,V,Vout) 47 | 48 | USE OMP_LIB 49 | implicit none 50 | 51 | integer:: row,col,A_row 52 | integer:: nval,tid 53 | real(8) :: alpha,sum_val 54 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 55 | real(8),intent(out):: Vout(1:nval) 56 | 57 | !$omp parallel do default(shared) private(tid,row,col,A_row,sum_val) 58 | do row=1,nval 59 | !tid=OMP_GET_THREAD_NUM() 60 | sum_val = 0.0 61 | A_row =(row-1)*nval 62 | !write(*,*) "total number of threads: ",tid,A_row,A_row+nval,nval*nval 63 | do col=1,nval 64 | sum_val = sum_val + A(A_row+col)*V(col) 65 | end do 66 | Vout(row) = sum_val * alpha 67 | end do 68 | !$omp end parallel do 69 | 70 | end subroutine 71 | -------------------------------------------------------------------------------- /hands-on/gemv/02-gemv-omp-target/Makefile: 
-------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/02-gemv-omp-target/gemv-omp-target.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp target parallel for map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | for (int col = 0; col < n; col++) 13 | sum += A_row[col] * V[col]; 14 | Vout[row] = sum * alpha; 15 | } 16 | } 17 | 18 | template 19 | T* allocate(size_t n) 20 | { 21 | T* ptr = new T[n]; 22 | std::fill_n(ptr, n, T(1)); 23 | return ptr; 24 | } 25 | 26 | template 27 | void deallocate(T* ptr, size_t n) 28 | { 29 | delete[] ptr; 30 | } 31 | 32 | int main() 33 | { 34 | auto* A = allocate(N * N); 35 | auto* V = allocate(N); 36 | auto* Vout = allocate(N); 37 | 38 | { 39 | Timer local("GEMV"); 40 | gemv(N, 1.0f, A, V, Vout); 41 | } 42 | 43 | for (int i = 0; i < N; i++) 44 | if (Vout[i] != N) 45 | { 46 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 47 | #if defined(THROW_FAIL) 48 | throw; 49 | #else 50 | break; 
51 | #endif 52 | } 53 | 54 | deallocate(A, N * N); 55 | deallocate(V, N); 56 | deallocate(Vout, N); 57 | } 58 | -------------------------------------------------------------------------------- /hands-on/gemv/02-gemv-omp-target/gemv-omp-target.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:),V(:),Vout(:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: val 12 | 13 | !!starts here 14 | call system_clock(ti,tk) 15 | allocate(A(1:N*N),stat=err) 16 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 17 | allocate(V(1:N),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 19 | allocate(Vout(1:N),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 21 | 22 | A(:) = 1.0 23 | V(:) = 1.0 24 | call gemv(N,alpha,A,V,Vout) 25 | do val=1,N 26 | if (int(Vout(val)) .NE. 
N) then 27 | write(*,*) "Value does not match at",val,int(Vout(val)) 28 | end if 29 | end do 30 | 31 | 32 | deallocate(A) 33 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 34 | deallocate(V) 35 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 36 | deallocate(Vout) 37 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 38 | call system_clock(tj,tk) 39 | 40 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 41 | 42 | stop 43 | end 44 | 45 | !------------------------------------------------------- 46 | subroutine gemv(nval,alpha,A,V,Vout) 47 | 48 | USE OMP_LIB 49 | implicit none 50 | 51 | integer:: row,col,A_row 52 | integer:: nval,tid 53 | real(8) :: alpha,sum_val 54 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 55 | real(8),intent(out):: Vout(1:nval) 56 | 57 | !$omp target map(to:A,V) map(from:Vout) 58 | !$omp parallel do default(shared) private(tid,row,col,A_row,sum_val) 59 | do row=1,nval 60 | !tid=OMP_GET_THREAD_NUM() 61 | sum_val = 0.0 62 | A_row =(row-1)*nval 63 | !write(*,*) "total number of threads: ",tid,A_row,A_row+nval,nval*nval 64 | do col=1,nval 65 | sum_val = sum_val + A(A_row+col)*V(col) 66 | end do 67 | Vout(row) = sum_val * alpha 68 | end do 69 | !$omp end parallel do 70 | !$omp end target 71 | end subroutine 72 | -------------------------------------------------------------------------------- /hands-on/gemv/03-gemv-omp-target-teams/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-teams 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | 
${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/03-gemv-omp-target-teams/gemv-omp-target-teams.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp target teams distribute parallel for map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | for (int col = 0; col < n; col++) 13 | sum += A_row[col] * V[col]; 14 | Vout[row] = sum * alpha; 15 | } 16 | } 17 | 18 | template 19 | T* allocate(size_t n) 20 | { 21 | T* ptr = new T[n]; 22 | std::fill_n(ptr, n, T(1)); 23 | //#pragma omp target enter data map(to:ptr[:n]) 24 | return ptr; 25 | } 26 | 27 | template 28 | void deallocate(T* ptr, size_t n) 29 | { 30 | //#pragma omp target exit data map(delete:ptr[:n]) 31 | delete[] ptr; 32 | } 33 | 34 | int main() 35 | { 36 | auto* A = allocate(N * N); 37 | auto* V = allocate(N); 38 | auto* Vout = allocate(N); 39 | 40 | { 41 | Timer local("GEMV"); 42 | gemv(N, 1.0f, A, V, Vout); 43 | } 44 | 45 | for (int i = 0; i < N; i++) 46 | if (Vout[i] != N) 47 | { 48 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 49 | #if defined(THROW_FAIL) 50 | throw; 51 | #else 52 | break; 53 | #endif 54 | } 55 | 56 | deallocate(A, N * N); 57 | deallocate(V, N); 58 | deallocate(Vout, N); 59 | } 60 | -------------------------------------------------------------------------------- /hands-on/gemv/03-gemv-omp-target-teams/gemv-omp-target-teams.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 
| integer,parameter:: N=64 7 | real(8),allocatable :: A(:),V(:),Vout(:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: val 12 | 13 | !!starts here 14 | call system_clock(ti,tk) 15 | allocate(A(1:N*N),stat=err) 16 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 17 | allocate(V(1:N),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 19 | allocate(Vout(1:N),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 21 | 22 | A(:) = 1.0 23 | V(:) = 1.0 24 | call gemv(N,alpha,A,V,Vout) 25 | do val=1,N 26 | if (int(Vout(val)) .NE. N) then 27 | write(*,*) "Value does not match at",val,int(Vout(val)) 28 | end if 29 | end do 30 | 31 | 32 | deallocate(A) 33 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 34 | deallocate(V) 35 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 36 | deallocate(Vout) 37 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 38 | call system_clock(tj,tk) 39 | 40 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 41 | 42 | stop 43 | end 44 | 45 | !------------------------------------------------------- 46 | subroutine gemv(nval,alpha,A,V,Vout) 47 | 48 | USE OMP_LIB 49 | implicit none 50 | 51 | integer:: row,col,A_row 52 | integer:: nval,tid 53 | real(8) :: alpha,sum_val 54 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 55 | real(8),intent(out):: Vout(1:nval) 56 | 57 | !$omp target teams distribute parallel do & 58 | !$omp map(to:A,V) map(from:Vout) & 59 | !$omp default(shared) private(tid,row,col,A_row,sum_val) 60 | do row=1,nval 61 | !tid=OMP_GET_THREAD_NUM() 62 | sum_val = 0.0 63 | A_row =(row-1)*nval 64 | !write(*,*) "total number of threads: ",tid,A_row,A_row+nval,nval*nval 65 | do col=1,nval 66 | sum_val = sum_val + A(A_row+col)*V(col) 67 | end do 68 | Vout(row) = sum_val * alpha 69 | end do 70 | !$omp end target teams distribute parallel do 71 | end subroutine 72 | 
-------------------------------------------------------------------------------- /hands-on/gemv/04-gemv-omp-target-reduction/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-reduction 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/04-gemv-omp-target-reduction/gemv-omp-target-reduction.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | #pragma omp parallel for reduction(+ : sum) 13 | for (int col = 0; col < n; col++) 14 | sum += A_row[col] * V[col]; 15 | Vout[row] = sum * alpha; 16 | } 17 | } 18 | 19 | template 20 | T* allocate(size_t n) 21 | { 22 | T* ptr = new T[n]; 23 | std::fill_n(ptr, n, T(1)); 24 | #pragma omp target enter data map(to : ptr[:n]) 25 | return ptr; 26 | } 27 | 28 | template 29 | void deallocate(T* ptr, size_t n) 30 | { 31 | #pragma omp target exit data map(delete : ptr[:n]) 32 | delete[] ptr; 33 | } 34 | 35 | int main() 36 | { 37 | auto* A = allocate(N * N); 38 | auto* V = allocate(N); 39 | 
auto* Vout = allocate(N); 40 | 41 | { 42 | Timer local("GEMV"); 43 | gemv(N, 1.0f, A, V, Vout); 44 | } 45 | 46 | #pragma omp target update from(Vout[:N]) 47 | for (int i = 0; i < N; i++) 48 | if (Vout[i] != N) 49 | { 50 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 51 | #if defined(THROW_FAIL) 52 | throw; 53 | #else 54 | break; 55 | #endif 56 | } 57 | 58 | deallocate(A, N * N); 59 | deallocate(V, N); 60 | deallocate(Vout, N); 61 | } 62 | -------------------------------------------------------------------------------- /hands-on/gemv/04-gemv-omp-target-reduction/gemv-omp-target-reduction.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:),V(:),Vout(:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: val 12 | 13 | !!starts here 14 | call system_clock(ti,tk) 15 | allocate(A(1:N*N),stat=err) 16 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 17 | allocate(V(1:N),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 19 | allocate(Vout(1:N),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 21 | 22 | A(:) = 1.0 23 | V(:) = 1.0 24 | call gemv(N,alpha,A,V,Vout) 25 | do val=1,N 26 | if (int(Vout(val)) .NE. 
N) then 27 | write(*,*) "Value does not match at",val,int(Vout(val)) 28 | end if 29 | end do 30 | 31 | 32 | deallocate(A) 33 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 34 | deallocate(V) 35 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 36 | deallocate(Vout) 37 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 38 | call system_clock(tj,tk) 39 | 40 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 41 | 42 | stop 43 | end 44 | 45 | !------------------------------------------------------- 46 | subroutine gemv(nval,alpha,A,V,Vout) 47 | 48 | USE OMP_LIB 49 | implicit none 50 | 51 | integer:: row,col,A_row 52 | integer:: nval,tid 53 | real(8) :: alpha,sum_val 54 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 55 | real(8),intent(out):: Vout(1:nval) 56 | 57 | !$omp target teams distribute map(to:A,V) map(from:Vout) private(sum_val) 58 | do row=1,nval 59 | sum_val = 0.0 60 | A_row =(row-1)*nval 61 | !$omp parallel do reduction(+:sum_val) 62 | do col=1,nval 63 | sum_val = sum_val + A(A_row+col)*V(col) 64 | end do 65 | Vout(row) = sum_val * alpha 66 | end do 67 | !$omp end target teams distribute 68 | end subroutine 69 | -------------------------------------------------------------------------------- /hands-on/gemv/05-gemv-omp-target-split-parallel-for-reduction/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-split-parallel-for-reduction 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | #all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | 
.PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/05-gemv-omp-target-split-parallel-for-reduction/gemv-omp-target-split-parallel-for-reduction.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | #pragma omp parallel 13 | { 14 | #pragma omp for reduction(+ : sum) 15 | for (int col = 0; col < n; col++) 16 | sum += A_row[col] * V[col]; 17 | } 18 | Vout[row] = sum * alpha; 19 | } 20 | } 21 | 22 | template 23 | T* allocate(size_t n) 24 | { 25 | T* ptr = new T[n]; 26 | std::fill_n(ptr, n, T(1)); 27 | #pragma omp target enter data map(to : ptr[:n]) 28 | return ptr; 29 | } 30 | 31 | template 32 | void deallocate(T* ptr, size_t n) 33 | { 34 | #pragma omp target exit data map(delete : ptr[:n]) 35 | delete[] ptr; 36 | } 37 | 38 | int main() 39 | { 40 | auto* A = allocate(N * N); 41 | auto* V = allocate(N); 42 | auto* Vout = allocate(N); 43 | 44 | { 45 | Timer local("GEMV"); 46 | gemv(N, 1.0f, A, V, Vout); 47 | } 48 | 49 | #pragma omp target update from(Vout[:N]) 50 | for (int i = 0; i < N; i++) 51 | if (Vout[i] != N) 52 | { 53 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 54 | #if defined(THROW_FAIL) 55 | throw; 56 | #else 57 | break; 58 | #endif 59 | } 60 | 61 | deallocate(A, N * N); 62 | deallocate(V, N); 63 | deallocate(Vout, N); 64 | } 65 | -------------------------------------------------------------------------------- /hands-on/gemv/51-gemv-omp-many-matrices/Makefile: 
-------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-many-matrices 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/51-gemv-omp-many-matrices/gemv-omp-many-matrices.cpp: -------------------------------------------------------------------------------- 1 | #define N 4096 2 | #include 3 | #include "timer.h" 4 | 5 | template 6 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 7 | { 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | for (int col = 0; col < n; col++) 13 | sum += A_row[col] * V[col]; 14 | Vout[row] = sum * alpha; 15 | } 16 | } 17 | 18 | template 19 | T* allocate(size_t n) 20 | { 21 | T* ptr = new T[n]; 22 | std::fill_n(ptr, n, T(1)); 23 | return ptr; 24 | } 25 | 26 | template 27 | void deallocate(T* ptr, size_t n) 28 | { 29 | delete[] ptr; 30 | } 31 | 32 | int main() 33 | { 34 | std::vector manyA; 35 | std::vector manyV; 36 | std::vector manyVout; 37 | 38 | const int Num_calc = 8; 39 | for (int i = 0; i < Num_calc; i++) 40 | { 41 | manyA.push_back(allocate(N * N)); 42 | manyV.push_back(allocate(N)); 43 | manyVout.push_back(allocate(N)); 44 | } 45 | 46 | { 47 | Timer local("multiGEMV"); 48 | #pragma omp parallel for 49 | for (int i = 0; i < Num_calc; i++) 50 | gemv(N, 1.0f, manyA[i], manyV[i], manyVout[i]); 51 | } 52 | 53 | for (int i = 0; i < 
Num_calc; i++) 54 | { 55 | for (int j = 0; j < N; j++) 56 | if (manyVout[i][j] != N) 57 | { 58 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << manyVout[i][j] 59 | << std::endl; 60 | #if defined(THROW_FAIL) 61 | throw; 62 | #else 63 | break; 64 | #endif 65 | } 66 | 67 | deallocate(manyA[i], N * N); 68 | deallocate(manyV[i], N); 69 | deallocate(manyVout[i], N); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /hands-on/gemv/51-gemv-omp-many-matrices/gemv-omp-many-matrices.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:,:),V(:,:),Vout(:,:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: i,val 12 | integer,parameter :: Num_calc=8 13 | 14 | !!starts here 15 | call system_clock(ti,tk) 16 | 17 | allocate(A(1:N*N,1:Num_calc),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 19 | allocate(V(1:N,1:Num_calc),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 21 | allocate(Vout(1:N,1:Num_calc),stat=err) 22 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 23 | 24 | 25 | !$omp parallel do 26 | do i=1,Num_calc 27 | A(:,i) = 1.0 28 | V(:,i) = 1.0 29 | call gemv(N,alpha,A(:,i),V(:,i),Vout(:,i)) 30 | end do 31 | !$omp end parallel do 32 | 33 | do i=1,Num_calc 34 | do val=1,N 35 | if (int(Vout(val,i)) .NE. 
N) then 36 | write(*,*) "Value does not match at",val,i,int(Vout(val,i)) 37 | end if 38 | end do 39 | end do 40 | 41 | 42 | deallocate(A) 43 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 44 | deallocate(V) 45 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 46 | deallocate(Vout) 47 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 48 | call system_clock(tj,tk) 49 | 50 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 51 | 52 | stop 53 | end 54 | 55 | !------------------------------------------------------- 56 | subroutine gemv(nval,alpha,A,V,Vout) 57 | 58 | USE OMP_LIB 59 | implicit none 60 | 61 | integer:: row,col,A_row 62 | integer:: nval,tid 63 | real(8) :: alpha,sum_val 64 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 65 | real(8),intent(out):: Vout(1:nval) 66 | 67 | do row=1,nval 68 | !tid=OMP_GET_THREAD_NUM() 69 | sum_val = 0.0 70 | A_row =(row-1)*nval 71 | !write(*,*) "total number of threads: ",tid,A_row,A_row+nval,nval*nval 72 | do col=1,nval 73 | sum_val = sum_val + A(A_row+col)*V(col) 74 | end do 75 | Vout(row) = sum_val * alpha 76 | end do 77 | end subroutine 78 | -------------------------------------------------------------------------------- /hands-on/gemv/52-gemv-omp-target-many-matrices-no-hierachy/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-many-matrices-no-hierachy 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 
| -------------------------------------------------------------------------------- /hands-on/gemv/52-gemv-omp-target-many-matrices-no-hierachy/gemv-omp-target-many-matrices-no-hierachy.cpp: -------------------------------------------------------------------------------- 1 | #define N 4096 2 | #include 3 | #include "timer.h" 4 | 5 | template 6 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 7 | { 8 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 9 | for (int row = 0; row < n; row++) 10 | { 11 | T sum = T(0); 12 | const T* __restrict__ A_row = A + row * n; 13 | for (int col = 0; col < n; col++) 14 | sum += A_row[col] * V[col]; 15 | Vout[row] = sum * alpha; 16 | } 17 | } 18 | 19 | template 20 | T* allocate(size_t n) 21 | { 22 | T* ptr = new T[n]; 23 | std::fill_n(ptr, n, T(1)); 24 | #pragma omp target enter data map(to : ptr[:n]) 25 | return ptr; 26 | } 27 | 28 | template 29 | void deallocate(T* ptr, size_t n) 30 | { 31 | #pragma omp target exit data map(delete : ptr[:n]) 32 | delete[] ptr; 33 | } 34 | 35 | int main() 36 | { 37 | std::vector manyA; 38 | std::vector manyV; 39 | std::vector manyVout; 40 | 41 | const int Num_calc = 8; 42 | for (int i = 0; i < Num_calc; i++) 43 | { 44 | manyA.push_back(allocate(N * N)); 45 | manyV.push_back(allocate(N)); 46 | manyVout.push_back(allocate(N)); 47 | } 48 | 49 | { 50 | Timer local("multiGEMV"); 51 | #pragma omp parallel for 52 | for (int i = 0; i < Num_calc; i++) 53 | gemv(N, 1.0f, manyA[i], manyV[i], manyVout[i]); 54 | } 55 | 56 | for (int i = 0; i < Num_calc; i++) 57 | { 58 | auto* __restrict__ Vout = manyVout[i]; 59 | #pragma omp target update from(Vout[:N]) 60 | for (int j = 0; j < N; j++) 61 | if (Vout[j] != N) 62 | { 63 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << Vout[j] 64 | << std::endl; 65 | #if defined(THROW_FAIL) 66 | throw; 67 | #else 68 | break; 69 | #endif 70 | } 71 | 72 | 
deallocate(manyA[i], N * N); 73 | deallocate(manyV[i], N); 74 | deallocate(manyVout[i], N); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /hands-on/gemv/52-gemv-omp-target-many-matrices-no-hierachy/gemv-omp-target-many-matrices-no-hierachy.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:,:),V(:,:),Vout(:,:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: i,val 12 | integer,parameter :: Num_calc=8 13 | 14 | !!starts here 15 | call system_clock(ti,tk) 16 | 17 | allocate(A(1:N*N,1:Num_calc),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 19 | allocate(V(1:N,1:Num_calc),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 21 | allocate(Vout(1:N,1:Num_calc),stat=err) 22 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 23 | 24 | 25 | !$omp parallel do 26 | do i=1,Num_calc 27 | A(:,i) = 1.0 28 | V(:,i) = 1.0 29 | call gemv(N,alpha,A(:,i),V(:,i),Vout(:,i)) 30 | end do 31 | !$omp end parallel do 32 | 33 | do i=1,Num_calc 34 | !$omp target update from(Vout(:,i)) 35 | do val=1,N 36 | if (int(Vout(val,i)) .NE. 
N) then 37 | write(*,*) "Value does not match at",val,i,int(Vout(val,i)) 38 | end if 39 | end do 40 | end do 41 | 42 | 43 | deallocate(A) 44 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 45 | deallocate(V) 46 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 47 | deallocate(Vout) 48 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 49 | call system_clock(tj,tk) 50 | 51 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 52 | 53 | stop 54 | end 55 | 56 | !------------------------------------------------------- 57 | subroutine gemv(nval,alpha,A,V,Vout) 58 | 59 | USE OMP_LIB 60 | implicit none 61 | 62 | integer:: row,col,A_row 63 | integer:: nval,tid 64 | real(8) :: alpha,sum_val 65 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 66 | real(8),intent(out):: Vout(1:nval) 67 | 68 | !$omp target teams distribute map(to:A,V) map(from:Vout) 69 | do row=1,nval 70 | sum_val = 0.0 71 | A_row =(row-1)*nval 72 | do col=1,nval 73 | sum_val = sum_val + A(A_row+col)*V(col) 74 | end do 75 | Vout(row) = sum_val * alpha 76 | end do 77 | !$omp end target teams distribute 78 | end subroutine 79 | -------------------------------------------------------------------------------- /hands-on/gemv/53-gemv-omp-target-many-matrices/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-many-matrices 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | 
-------------------------------------------------------------------------------- /hands-on/gemv/53-gemv-omp-target-many-matrices/gemv-omp-target-many-matrices.cpp: -------------------------------------------------------------------------------- 1 | #define N 4096 2 | #include 3 | #include "timer.h" 4 | 5 | template 6 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 7 | { 8 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 9 | for (int row = 0; row < n; row++) 10 | { 11 | T sum = T(0); 12 | const T* __restrict__ A_row = A + row * n; 13 | #pragma omp parallel for reduction(+ : sum) 14 | for (int col = 0; col < n; col++) 15 | sum += A_row[col] * V[col]; 16 | Vout[row] = sum * alpha; 17 | } 18 | } 19 | 20 | template 21 | T* allocate(size_t n) 22 | { 23 | T* ptr = new T[n]; 24 | std::fill_n(ptr, n, T(1)); 25 | #pragma omp target enter data map(to : ptr[:n]) 26 | return ptr; 27 | } 28 | 29 | template 30 | void deallocate(T* ptr, size_t n) 31 | { 32 | #pragma omp target exit data map(delete : ptr[:n]) 33 | delete[] ptr; 34 | } 35 | 36 | int main() 37 | { 38 | std::vector manyA; 39 | std::vector manyV; 40 | std::vector manyVout; 41 | 42 | const int Num_calc = 8; 43 | for (int i = 0; i < Num_calc; i++) 44 | { 45 | manyA.push_back(allocate(N * N)); 46 | manyV.push_back(allocate(N)); 47 | manyVout.push_back(allocate(N)); 48 | } 49 | 50 | { 51 | Timer local("multiGEMV"); 52 | #pragma omp parallel for 53 | for (int i = 0; i < Num_calc; i++) 54 | gemv(N, 1.0f, manyA[i], manyV[i], manyVout[i]); 55 | } 56 | 57 | for (int i = 0; i < Num_calc; i++) 58 | { 59 | auto* __restrict__ Vout = manyVout[i]; 60 | #pragma omp target update from(Vout[:N]) 61 | for (int j = 0; j < N; j++) 62 | if (Vout[j] != N) 63 | { 64 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << Vout[j] 65 | << std::endl; 66 | #if defined(THROW_FAIL) 67 | throw; 68 | #else 69 | break; 70 | 
#endif 71 | } 72 | 73 | deallocate(manyA[i], N * N); 74 | deallocate(manyV[i], N); 75 | deallocate(manyVout[i], N); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /hands-on/gemv/53-gemv-omp-target-many-matrices/gemv-omp-target-many-matrices.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:,:),V(:,:),Vout(:,:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: i,val 12 | integer,parameter :: Num_calc=8 13 | 14 | !!starts here 15 | call system_clock(ti,tk) 16 | 17 | allocate(A(1:N*N,1:Num_calc),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 19 | allocate(V(1:N,1:Num_calc),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 21 | allocate(Vout(1:N,1:Num_calc),stat=err) 22 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 23 | 24 | 25 | !$omp parallel do 26 | do i=1,Num_calc 27 | A(:,i) = 1.0 28 | V(:,i) = 1.0 29 | call gemv(N,alpha,A(:,i),V(:,i),Vout(:,i)) 30 | end do 31 | !$omp end parallel do 32 | 33 | do i=1,Num_calc 34 | !$omp target update from(Vout(:,i)) 35 | do val=1,N 36 | if (int(Vout(val,i)) .NE. 
N) then 37 | write(*,*) "Value does not match at",val,i,int(Vout(val,i)) 38 | end if 39 | end do 40 | end do 41 | 42 | 43 | deallocate(A) 44 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 45 | deallocate(V) 46 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 47 | deallocate(Vout) 48 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 49 | call system_clock(tj,tk) 50 | 51 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 52 | 53 | stop 54 | end 55 | 56 | !------------------------------------------------------- 57 | subroutine gemv(nval,alpha,A,V,Vout) 58 | 59 | USE OMP_LIB 60 | implicit none 61 | 62 | integer:: row,col,A_row 63 | integer:: nval,tid 64 | real(8) :: alpha,sum_val 65 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 66 | real(8),intent(out):: Vout(1:nval) 67 | 68 | !$omp target teams distribute map(to:A,V) map(from:Vout) 69 | do row=1,nval 70 | sum_val = 0.0 71 | A_row =(row-1)*nval 72 | !$omp parallel do default(shared) private(A_row) reduction(+:sum_val) 73 | do col=1,nval 74 | sum_val = sum_val + A(A_row+col)*V(col) 75 | end do 76 | Vout(row) = sum_val * alpha 77 | end do 78 | !$omp end target teams distribute 79 | end subroutine 80 | -------------------------------------------------------------------------------- /hands-on/gemv/54-gemv-omp-target-many-matrices-multi-devices/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-many-matrices-multi-devices 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | #all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} 
-o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/54-gemv-omp-target-many-matrices-multi-devices/gemv-omp-target-many-matrices-multi-devices.cpp: -------------------------------------------------------------------------------- 1 | #define N 4096 2 | #include 3 | #include "timer.h" 4 | #if defined(_OPEMMP) 5 | #include 6 | #endif 7 | 8 | template 9 | void gemv(int deviceID, int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 10 | { 11 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) device(deviceID) 12 | for (int row = 0; row < n; row++) 13 | { 14 | T sum = T(0); 15 | const T* __restrict__ A_row = A + row * n; 16 | #pragma omp parallel for reduction(+ : sum) 17 | for (int col = 0; col < n; col++) 18 | sum += A_row[col] * V[col]; 19 | Vout[row] = sum * alpha; 20 | } 21 | } 22 | 23 | template 24 | T* allocate(int deviceID, size_t n) 25 | { 26 | T* ptr = new T[n]; 27 | std::fill_n(ptr, n, T(1)); 28 | #pragma omp target enter data map(to : ptr[:n]) device(deviceID) 29 | return ptr; 30 | } 31 | 32 | template 33 | void deallocate(int deviceID, T* ptr, size_t n) 34 | { 35 | #pragma omp target exit data map(delete : ptr[:n]) device(deviceID) 36 | delete[] ptr; 37 | } 38 | 39 | int main() 40 | { 41 | #if defined(_OPEMMP) 42 | const int num_devices = omp_get_num_devices(); 43 | #else 44 | const int num_devices = 1; 45 | #endif 46 | std::cout << "Found " << num_devices << " devices." 
<< std::endl; 47 | 48 | std::vector manyA; 49 | std::vector manyV; 50 | std::vector manyVout; 51 | 52 | const int Num_calc = 8; 53 | for (int i = 0; i < Num_calc; i++) 54 | { 55 | manyA.push_back(allocate(i % num_devices, N * N)); 56 | manyV.push_back(allocate(i % num_devices, N)); 57 | manyVout.push_back(allocate(i % num_devices, N)); 58 | } 59 | 60 | { 61 | Timer local("multiGEMV"); 62 | #pragma omp parallel for 63 | for (int i = 0; i < Num_calc; i++) 64 | gemv(i % num_devices, N, 1.0f, manyA[i], manyV[i], manyVout[i]); 65 | } 66 | 67 | for (int i = 0; i < Num_calc; i++) 68 | { 69 | auto* __restrict__ Vout = manyVout[i]; 70 | #pragma omp target update from(Vout[:N]) device(i % num_devices) 71 | for (int j = 0; j < N; j++) 72 | if (Vout[j] != N) 73 | { 74 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << Vout[j] 75 | << std::endl; 76 | #if defined(THROW_FAIL) 77 | throw; 78 | #else 79 | break; 80 | #endif 81 | } 82 | 83 | deallocate(i % num_devices, manyA[i], N * N); 84 | deallocate(i % num_devices, manyV[i], N); 85 | deallocate(i % num_devices, manyVout[i], N); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /hands-on/gemv/55-gemv-omp-target-many-matrices-taskloop/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-many-matrices-taskloop 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | 
-------------------------------------------------------------------------------- /hands-on/gemv/55-gemv-omp-target-many-matrices-taskloop/gemv-omp-target-many-matrices-taskloop.cpp: -------------------------------------------------------------------------------- 1 | #define N 256 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | template 7 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 8 | { 9 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) nowait 10 | for (int row = 0; row < n; row++) 11 | { 12 | T sum = T(0); 13 | const T* __restrict__ A_row = A + row * n; 14 | #pragma omp parallel for reduction(+ : sum) 15 | for (int col = 0; col < n; col++) 16 | sum += A_row[col] * V[col]; 17 | Vout[row] = sum * alpha; 18 | } 19 | } 20 | 21 | template 22 | void gemv_host(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 23 | { 24 | for (int row = 0; row < n; row++) 25 | { 26 | T sum = T(0); 27 | const T* __restrict__ A_row = A + row * n; 28 | for (int col = 0; col < n; col++) 29 | sum += A_row[col] * V[col]; 30 | Vout[row] = sum * alpha; 31 | } 32 | } 33 | 34 | template 35 | T* allocate(size_t n) 36 | { 37 | T* ptr = new T[n]; 38 | std::fill_n(ptr, n, T(1)); 39 | #pragma omp target enter data map(to : ptr[:n]) 40 | return ptr; 41 | } 42 | 43 | template 44 | void deallocate(T* ptr, size_t n) 45 | { 46 | #pragma omp target exit data map(delete : ptr[:n]) 47 | delete[] ptr; 48 | } 49 | 50 | int main() 51 | { 52 | std::vector manyA; 53 | std::vector manyV; 54 | std::vector manyVout; 55 | 56 | const int Num_calc = 512; 57 | for (int i = 0; i < Num_calc; i++) 58 | { 59 | manyA.push_back(allocate(N * N)); 60 | manyV.push_back(allocate(N)); 61 | manyVout.push_back(allocate(N)); 62 | } 63 | 64 | // warm up 65 | #pragma omp parallel 66 | { 67 | #pragma omp target nowait 68 | { int a = 1; } 69 | } 70 | 71 | { 72 | Timer local("multiGEMV parallel 
taskloop"); 73 | #pragma omp parallel 74 | #pragma omp single 75 | #pragma omp taskloop 76 | for (int i = 0; i < Num_calc; i++) 77 | if (i%2) 78 | gemv(N, 1.0f, manyA[i], manyV[i], manyVout[i]); 79 | else 80 | gemv_host(N/16, 1.0f, manyA[i], manyV[i], manyVout[i]); 81 | } 82 | 83 | for (int i = 0; i < Num_calc; i++) 84 | { 85 | auto* __restrict__ Vout = manyVout[i]; 86 | if (i%2) 87 | { 88 | #pragma omp target update from(Vout[:N]) 89 | for (int j = 0; j < N; j++) 90 | if (Vout[j] != N) 91 | { 92 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << Vout[j] 93 | << std::endl; 94 | #if defined(THROW_FAIL) 95 | throw; 96 | #else 97 | break; 98 | #endif 99 | } 100 | } 101 | 102 | deallocate(manyA[i], N * N); 103 | deallocate(manyV[i], N); 104 | deallocate(manyVout[i], N); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /hands-on/gemv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(CXX_SRCS 3 | 01-gemv-omp/gemv-omp.cpp 4 | 02-gemv-omp-target/gemv-omp-target.cpp 5 | 03-gemv-omp-target-teams/gemv-omp-target-teams.cpp 6 | 04-gemv-omp-target-reduction/gemv-omp-target-reduction.cpp 7 | 05-gemv-omp-target-split-parallel-for-reduction/gemv-omp-target-split-parallel-for-reduction.cpp 8 | 51-gemv-omp-many-matrices/gemv-omp-many-matrices.cpp 9 | 52-gemv-omp-target-many-matrices-no-hierachy/gemv-omp-target-many-matrices-no-hierachy.cpp 10 | 53-gemv-omp-target-many-matrices/gemv-omp-target-many-matrices.cpp 11 | 54-gemv-omp-target-many-matrices-multi-devices/gemv-omp-target-many-matrices-multi-devices.cpp 12 | 55-gemv-omp-target-many-matrices-taskloop/gemv-omp-target-many-matrices-taskloop.cpp 13 | ) 14 | 15 | foreach(full_file_name IN ITEMS ${CXX_SRCS}) 16 | get_filename_component(name_only ${full_file_name} NAME_WE) 17 | set(EXE_NAME cxx.${name_only}) 18 | add_executable(${EXE_NAME} ${full_file_name}) 19 | 
Remember to create a make.inc under hands-on
Use make.clang-ykt.inc as an example
12 | done 13 | -------------------------------------------------------------------------------- /hands-on/make.aomp.inc: -------------------------------------------------------------------------------- 1 | CXX=clang++ 2 | CXX_FLAGS=-g -O3 -fopenmp 3 | CXX_OFFLOAD_FLAGS=-fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 4 | 5 | FC=flang 6 | FC_FLAGS=${CXX_FLAGS} 7 | FC_OFFLOAD_FLAGS=${CXX_OFFLOAD_FLAGS} 8 | -------------------------------------------------------------------------------- /hands-on/make.gcc-nv.inc: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXX_FLAGS=-g -O3 -fopenmp 3 | CXX_OFFLOAD_FLAGS=-foffload=nvptx-none 4 | 5 | FC=gfortran 6 | FC_FLAGS=-fopenmp 7 | FC_OFFLOAD_FLAGS=-foffload=nvptx-none 8 | -------------------------------------------------------------------------------- /hands-on/make.icx.inc: -------------------------------------------------------------------------------- 1 | CXX=icpx 2 | CXX_FLAGS=-g -O2 -fiopenmp 3 | CXX_OFFLOAD_FLAGS=-fopenmp-targets=spir64 4 | 5 | FC=ifx 6 | FC_FLAGS=${CXX_FLAGS} 7 | FC_OFFLOAD_FLAGS=${CXX_OFFLOAD_FLAGS} 8 | 9 | -------------------------------------------------------------------------------- /hands-on/make.llvm.inc: -------------------------------------------------------------------------------- 1 | CXX=clang++ 2 | CXX_FLAGS=-g -O3 -fopenmp 3 | CXX_OFFLOAD_FLAGS=-fopenmp-targets=nvptx64 4 | 5 | FC=flang 6 | FC_FLAGS=${CXX_FLAGS} 7 | FC_OFFLOAD_FLAGS=${CXX_OFFLOAD_FLAGS} 8 | -------------------------------------------------------------------------------- /hands-on/make.xl.inc: -------------------------------------------------------------------------------- 1 | CXX=xlC_r 2 | CXX_FLAGS=-g -O3 -qsmp=omp 3 | CXX_OFFLOAD_FLAGS=-qoffload 4 | 5 | FC=xlf90_r 6 | FC_FLAGS=${CXX_FLAGS} 7 | FC_OFFLOAD_FLAGS=${CXX_OFFLOAD_FLAGS} 8 | -------------------------------------------------------------------------------- 
cd qmcpack && git checkout d029f1f2c976c39486b5122cd81566f93afb2461 && cd -
34 | -DMPIEXEC_EXECUTABLE=`which srun` -DQMC_DATA=$WORKSPACE/QMCDATA ../qmcpack 35 | 36 | # grab a node 37 | salloc -A MAT189 -t 00:60:00 -N 1 38 | 39 | # hdf5 workaround 40 | export HDF5_USE_FILE_LOCKING=FALSE 41 | 42 | # run all the deterministic tests 43 | ctest -R deter -j32 44 | 45 | # run performance tests. test files are inside the build directory 46 | # please run each of them as an individual test because test may hang due to bugs in AMD runtime. 47 | 48 | cd tests/performance/NiO/dmc-a4-e48-DU8-batched_driver 49 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S1-dmc.xml 50 | cd - 51 | 52 | cd tests/performance/NiO/dmc-a8-e96-DU16-batched_driver 53 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S2-dmc.xml 54 | cd - 55 | 56 | cd tests/performance/NiO/dmc-a16-e192-DU16-batched_driver 57 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S4-dmc.xml 58 | cd - 59 | 60 | cd tests/performance/NiO/dmc-a32-e384-DU32-batched_driver 61 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S8-dmc.xml 62 | cd - 63 | 64 | cd tests/performance/NiO/dmc-a64-e768-DU32-batched_driver 65 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S16-dmc.xml 66 | cd - 67 | 68 | cd tests/performance/NiO/dmc-a512-e6144-DU64-cpu_driver 69 | srun --gpus-per-task 1 -c 16 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S128-dmc.xml 70 | cd - 71 | -------------------------------------------------------------------------------- /integration/crusher_recipe/modules/cray-mpich-afar.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | 3 | file cray-mpich-llvm module 4 | This is not from Cray. 
5 | ]]-- 6 | 7 | conflict("mpiwrappers") 8 | unload("PrgEnv-cray") 9 | unload("PrgEnv-gnu") 10 | unload("PrgEnv-amd") 11 | unload("PrgEnv-cray-amd") 12 | 13 | load("gcc") 14 | load("afar") 15 | load("craype") 16 | load("cray-mpich") 17 | load("cray-pals") 18 | 19 | prepend_path("PATH", pathJoin(os.getenv("MPICH_DIR"), "bin"), ":") 20 | 21 | setenv("MPICH_CC", "clang") 22 | setenv("MPICH_CXX", "clang++") 23 | setenv("MPICH_FC", "flang") 24 | setenv("MPICH_F77", "flang") 25 | setenv("MPICH_F90", "flang") 26 | 27 | setenv("ROCM_PATH", pathJoin(os.getenv("OLCF_AFAR_ROOT"))) 28 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TESTS_BINDIR ${CMAKE_CURRENT_BINARY_DIR}/bin) 2 | 3 | add_subdirectory(allocator) 4 | add_subdirectory(complex) 5 | add_subdirectory(global_variable) 6 | add_subdirectory(linking) 7 | add_subdirectory(math) 8 | add_subdirectory(private) 9 | add_subdirectory(target_task) 10 | add_subdirectory(omphost) 11 | add_subdirectory(reduction) 12 | add_subdirectory(implict_async) 13 | add_subdirectory(map) 14 | 15 | if (ENABLE_Fortran) 16 | add_subdirectory(fortran_use_device_ptr) 17 | add_subdirectory(fortran_allocator) 18 | endif() 19 | -------------------------------------------------------------------------------- /tests/allocator/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX AND ENABLE_EXPERIMENTAL) 2 | set(FULLNAME omp_pteam_mem_alloc) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | 
-------------------------------------------------------------------------------- /tests/allocator/omp_pteam_mem_alloc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #define NTEAM 32 4 | #define TEAM_SIZE 128 5 | 6 | template 7 | void compute_prefactor(int team_id, T base[2]) 8 | { 9 | base[0] = team_id; 10 | base[1] = team_id * 2; 11 | } 12 | 13 | bool failed = false; 14 | 15 | template 16 | void test_omp_pteam_mem_alloc() 17 | { 18 | T sum[NTEAM]; 19 | #pragma omp target teams distribute map(from:sum[:NTEAM]) 20 | for(int team_id = 0; team_id < NTEAM; team_id++) 21 | { 22 | T local_sum = 0; 23 | T base[2]; 24 | #pragma omp allocate(base) allocator(omp_pteam_mem_alloc) 25 | compute_prefactor(team_id, base); 26 | #pragma omp parallel for reduction(+: local_sum) 27 | for(int tid = 0; tid < TEAM_SIZE; tid++) 28 | local_sum += base[0] + tid; 29 | sum[team_id] = local_sum; 30 | } 31 | for(int team_id = 0; team_id < NTEAM; team_id++) 32 | if (sum[team_id] != team_id * TEAM_SIZE + (TEAM_SIZE -1) * TEAM_SIZE / 2 ) 33 | { 34 | std::cout << "sum[" << team_id << "] = " << sum[team_id] << " ref " << team_id * TEAM_SIZE + (TEAM_SIZE -1) * TEAM_SIZE / 2 << std::endl; 35 | failed = true; 36 | } 37 | } 38 | 39 | int main() 40 | { 41 | test_omp_pteam_mem_alloc(); 42 | return failed; 43 | } 44 | -------------------------------------------------------------------------------- /tests/complex/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME complex_reduction_cpu complex_reduction complex) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | 
endforeach() 13 | endif() 14 | 15 | if (ENABLE_Fortran) 16 | set(FULLNAME complex) 17 | set(EXE_NAME f.${FULLNAME}) 18 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 19 | set_target_properties(${EXE_NAME} PROPERTIES 20 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 21 | add_test(NAME ${EXE_NAME} 22 | COMMAND $ 23 | WORKING_DIRECTORY ${TESTS_BINDIR}) 24 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS fortran) 25 | endif() 26 | -------------------------------------------------------------------------------- /tests/complex/complex.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | template 7 | void test_map() 8 | { 9 | std::complex a(0.2, 1), a_check; 10 | #pragma omp target map(from:a_check) 11 | { 12 | a_check = a; 13 | } 14 | 15 | if (std::abs(a - a_check) > 1e-6) 16 | { 17 | std::cout << "wrong map value check" << a_check << " correct value " << a << std::endl; 18 | failed = true; 19 | } 20 | } 21 | 22 | template 23 | void test_plus(AT a, BT b) 24 | { 25 | std::complex c, c_host; 26 | 27 | c_host = a + b; 28 | #pragma omp target map(from:c) 29 | { 30 | c = a + b; 31 | } 32 | 33 | if (std::abs(c - c_host) > 1e-6) 34 | { 35 | std::cout << "wrong operator + value check" << c << " correct value " << c_host << std::endl; 36 | failed = true; 37 | } 38 | } 39 | 40 | template 41 | void test_minus(AT a, BT b) 42 | { 43 | std::complex c, c_host; 44 | 45 | c_host = a - b; 46 | #pragma omp target map(from:c) 47 | { 48 | c = a - b; 49 | } 50 | 51 | if (std::abs(c - c_host) > 1e-6) 52 | { 53 | std::cout << "wrong operator - value check" << c << " correct value " << c_host << std::endl; 54 | failed = true; 55 | } 56 | } 57 | 58 | template 59 | void test_mul(AT a, BT b) 60 | { 61 | std::complex c, c_host; 62 | 63 | c_host = a * b; 64 | #pragma omp target map(from:c) 65 | { 66 | c = a * b; 67 | } 68 | 69 | if (std::abs(c - c_host) > 1e-6) 70 | { 71 | std::cout << "wrong operator * 
value check" << c << " correct value " << c_host << std::endl; 72 | failed = true; 73 | } 74 | } 75 | 76 | template 77 | void test_div(AT a, BT b) 78 | { 79 | std::complex c, c_host; 80 | 81 | c_host = a / b; 82 | #pragma omp target map(from:c) 83 | { 84 | c = a / b; 85 | } 86 | 87 | if (std::abs(c - c_host) > 1e-6) 88 | { 89 | std::cout << "wrong operator / value check" << c << " correct value " << c_host << std::endl; 90 | failed = true; 91 | } 92 | } 93 | 94 | template 95 | void test_complex() 96 | { 97 | test_map(); 98 | 99 | test_plus(std::complex(0, 1), std::complex(0.5, 0.3)); 100 | test_plus(std::complex(0, 1), T(0.5)); 101 | test_plus(T(0.5), std::complex(0, 1)); 102 | 103 | test_minus(std::complex(0, 1), std::complex(0.5, 0.3)); 104 | test_minus(std::complex(0, 1), T(0.5)); 105 | test_minus(T(0.5), std::complex(0, 1)); 106 | 107 | test_mul(std::complex(0, 1), std::complex(0.5, 0.3)); 108 | test_mul(std::complex(0, 1), T(0.5)); 109 | test_mul(T(0.5), std::complex(0, 1)); 110 | 111 | test_div(std::complex(0, 1), std::complex(0.5, 0.3)); 112 | test_div(std::complex(0, 1), T(0.5)); 113 | test_div(T(0.5), std::complex(0, 1)); 114 | } 115 | 116 | int main() 117 | { 118 | std::cout << "Testing float" << std::endl; 119 | test_complex(); 120 | std::cout << "Testing double" << std::endl; 121 | test_complex(); 122 | return failed; 123 | } 124 | -------------------------------------------------------------------------------- /tests/complex/complex.f90: -------------------------------------------------------------------------------- 1 | program test_complex 2 | implicit none 3 | complex :: a, b, c 4 | 5 | a = cmplx(0, 1) 6 | b = cmplx(0.5, 0.3) 7 | !$omp target map(from: c) 8 | c = a*b 9 | !$omp end target 10 | 11 | if (abs(c - a*b) > 1e-7) then 12 | print *, "wrong value ", c, "right value ", a*b 13 | stop 1 14 | endif 15 | end program test_complex 16 | -------------------------------------------------------------------------------- 
/tests/complex/complex_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | template 7 | void test_map() 8 | { 9 | std::complex a(0.2, 1), a_check; 10 | #pragma omp target map(from : a_check) 11 | { 12 | a_check = a; 13 | } 14 | 15 | if (std::abs(a - a_check) > 1e-6) 16 | { 17 | std::cout << "wrong map value check" << a_check << " correct value " << a << std::endl; 18 | failed = true; 19 | } 20 | } 21 | 22 | #if !defined(__NO_UDR) 23 | #pragma omp declare reduction(+ : std::complex : omp_out += omp_in) 24 | #pragma omp declare reduction(+ : std::complex : omp_out += omp_in) 25 | #endif 26 | 27 | template 28 | class initiator 29 | { 30 | public: 31 | static T value(int i) { return T(i); } 32 | }; 33 | 34 | template 35 | class initiator> 36 | { 37 | public: 38 | static std::complex value(int i) { return {T(i), T(-i)}; } 39 | }; 40 | 41 | template 42 | void test_reduction() 43 | { 44 | T sum(0), sum_host(0); 45 | const int size = 100; 46 | T array[size]; 47 | for (int i = 0; i < size; i++) 48 | { 49 | array[i] = initiator::value(i); 50 | sum_host += array[i]; 51 | } 52 | 53 | #pragma omp target teams distribute parallel for map(to : array[:size]) reduction(+ : sum) 54 | for (int i = 0; i < size; i++) 55 | sum += array[i]; 56 | 57 | if (std::abs(sum - sum_host) > 1e-6) 58 | { 59 | std::cout << "wrong reduction value check" << sum << " correct value " << sum_host << std::endl; 60 | failed = true; 61 | } 62 | 63 | const int nblock(10), block_size(10); 64 | T block_sum[nblock]; 65 | #pragma omp target teams distribute map(to : array[:size]) map(from : block_sum[:nblock]) 66 | for (int ib = 0; ib < nblock; ib++) 67 | { 68 | T partial_sum(0); 69 | const int istart = ib * block_size; 70 | const int iend = (ib + 1) * block_size; 71 | #pragma omp parallel for reduction(+ : partial_sum) 72 | for (int i = istart; i < iend; i++) 73 | partial_sum += array[i]; 74 | 
block_sum[ib] = partial_sum; 75 | } 76 | 77 | sum = 0; 78 | for (int ib = 0; ib < nblock; ib++) 79 | sum += block_sum[ib]; 80 | if (std::abs(sum - sum_host) > 1e-6) 81 | { 82 | std::cout << "hierarchical parallelism wrong reduction value check" << sum << " correct value " << sum_host 83 | << std::endl; 84 | failed = true; 85 | } 86 | } 87 | 88 | template 89 | void test_real() 90 | { 91 | test_reduction(); 92 | } 93 | 94 | template 95 | void test_complex() 96 | { 97 | test_map(); 98 | test_reduction>(); 99 | } 100 | 101 | int main() 102 | { 103 | std::cout << "Testing real" << std::endl; 104 | std::cout << "Testing float" << std::endl; 105 | test_real(); 106 | std::cout << "Testing double" << std::endl; 107 | test_real(); 108 | 109 | std::cout << "Testing complex" << std::endl; 110 | std::cout << "Testing float" << std::endl; 111 | test_complex(); 112 | std::cout << "Testing double" << std::endl; 113 | test_complex(); 114 | return failed; 115 | } 116 | -------------------------------------------------------------------------------- /tests/complex/complex_reduction_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | #if !defined(__NO_UDR) 7 | #pragma omp declare reduction(+: std::complex: omp_out += omp_in) 8 | #pragma omp declare reduction(+: std::complex: omp_out += omp_in) 9 | #endif 10 | 11 | template 12 | void test_reduction() 13 | { 14 | T sum(0), sum_host(0); 15 | const int size = 100; 16 | T array[size]; 17 | for (int i = 0; i < size; i++) 18 | { 19 | array[i] = T(i); 20 | sum_host += array[i]; 21 | } 22 | 23 | #pragma omp parallel for reduction(+: sum) 24 | for (int i = 0; i < size; i++) 25 | sum += array[i]; 26 | 27 | if (std::abs(sum - sum_host) > 1e-6) 28 | { 29 | std::cout << "wrong reduction value check" << sum << " correct value " << sum_host << std::endl; 30 | failed = true; 31 | } 32 | } 33 | 34 | int main() 35 | { 36 | test_reduction(); 37 | 
test_reduction>(); 38 | test_reduction(); 39 | test_reduction>(); 40 | return failed; 41 | } 42 | -------------------------------------------------------------------------------- /tests/cudafor_omp/README.md: -------------------------------------------------------------------------------- 1 | Case 1 2 | ``` 3 | nvfortran -cuda noomp.f90 4 | ./a.out 5 | ``` 6 | good 7 | 8 | Case 2 9 | ``` 10 | nvfortran -cuda -mp=gpu noomp.f90 11 | ./a.out 12 | ``` 13 | SegFault 14 | 15 | Case 3 16 | ``` 17 | nvfortran -cuda -mp=gpu omp_below.f90 18 | ``` 19 | Compiler error 20 | ``` 21 | NVFORTRAN-S-1050-Non-tightly nested loop in cuf kernels do at nest 1 (omp_below.f90: 18) 22 | NVFORTRAN-S-0155-Kernel region ignored; no parallel loops (omp_below.f90: 17) 23 | 0 inform, 0 warnings, 2 severes, 0 fatal for repro 24 | NVFORTRAN-S-1058-Call to PGI runtime function not supported - pgf90_dev_copyout (omp_below.f90: 28) 25 | ``` 26 | 27 | Case 4 28 | ``` 29 | nvfortran -cuda -mp=gpu omp_above.f90 30 | ``` 31 | Compiler error 32 | ``` 33 | NVFORTRAN-S-1058-Call to PGI runtime function not supported - pgf90_dev_copyout (omp_above.f90: 20) 34 | ``` 35 | -------------------------------------------------------------------------------- /tests/cudafor_omp/noomp.f90: -------------------------------------------------------------------------------- 1 | module test 2 | implicit none 3 | contains 4 | SUBROUTINE repro() 5 | use cudafor 6 | IMPLICIT NONE 7 | integer, device, allocatable :: nh_d2(:),ityp_d2(:) 8 | integer :: na,i 9 | integer :: nat,npw,ierr 10 | integer :: np,nh_np,ih 11 | nat = 1 12 | npw = 1050 13 | allocate(ityp_d2(1:nat)) 14 | ityp_d2 = 5 15 | allocate(nh_d2(1:1000)) 16 | nh_d2 = 3 17 | !$cuf kernel do(2) <<<*,*>>> 18 | DO na =1, nat 19 | DO i = 1, npw 20 | np = ityp_d2(na) 21 | nh_np = nh_d2(np) 22 | ENDDO 23 | ENDDO 24 | 25 | deallocate(ityp_d2,nh_d2) 26 | ! 
27 | END SUBROUTINE repro 28 | end module test 29 | 30 | program main 31 | use test 32 | implicit none 33 | call repro 34 | end program 35 | -------------------------------------------------------------------------------- /tests/cudafor_omp/omp_above.f90: -------------------------------------------------------------------------------- 1 | module test 2 | implicit none 3 | contains 4 | SUBROUTINE repro() 5 | use cudafor 6 | IMPLICIT NONE 7 | integer, device, allocatable :: nh_d2(:),ityp_d2(:) 8 | integer :: na,i 9 | integer :: nat,npw,ierr 10 | integer :: np,nh_np,ih 11 | nat = 1 12 | npw = 1050 13 | allocate(ityp_d2(1:nat)) 14 | ityp_d2 = 5 15 | allocate(nh_d2(1:1000)) 16 | nh_d2 = 3 17 | !$omp target teams distribute parallel do collapse(2) 18 | DO na =1, nat 19 | DO i = 1, npw 20 | np = ityp_d2(na) 21 | nh_np = nh_d2(np) 22 | ENDDO 23 | ENDDO 24 | 25 | !$cuf kernel do(2) <<<*,*>>> 26 | DO na =1, nat 27 | DO i = 1, npw 28 | np = ityp_d2(na) 29 | nh_np = nh_d2(np) 30 | ENDDO 31 | ENDDO 32 | 33 | deallocate(ityp_d2,nh_d2) 34 | ! 
35 | END SUBROUTINE repro 36 | end module test 37 | 38 | program main 39 | use test 40 | implicit none 41 | call repro 42 | end program 43 | -------------------------------------------------------------------------------- /tests/cudafor_omp/omp_below.f90: -------------------------------------------------------------------------------- 1 | module test 2 | implicit none 3 | contains 4 | SUBROUTINE repro() 5 | use cudafor 6 | IMPLICIT NONE 7 | integer, device, allocatable :: nh_d2(:),ityp_d2(:) 8 | integer :: na,i 9 | integer :: nat,npw,ierr 10 | integer :: np,nh_np,ih 11 | nat = 1 12 | npw = 1050 13 | allocate(ityp_d2(1:nat)) 14 | ityp_d2 = 5 15 | allocate(nh_d2(1:1000)) 16 | nh_d2 = 3 17 | !$cuf kernel do(2) <<<*,*>>> 18 | DO na =1, nat 19 | DO i = 1, npw 20 | np = ityp_d2(na) 21 | nh_np = nh_d2(np) 22 | ENDDO 23 | ENDDO 24 | 25 | !$omp target teams distribute parallel do collapse(2) 26 | DO na =1, nat 27 | DO i = 1, npw 28 | np = ityp_d2(na) 29 | nh_np = nh_d2(np) 30 | ENDDO 31 | ENDDO 32 | 33 | deallocate(ityp_d2,nh_d2) 34 | ! 
35 | END SUBROUTINE repro 36 | end module test 37 | 38 | program main 39 | use test 40 | implicit none 41 | call repro 42 | end program 43 | -------------------------------------------------------------------------------- /tests/fortran_allocator/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(array_class dualspace.f90) 2 | 3 | foreach(NAME device device_isptr resize) 4 | set(FULLNAME dualspace_array_${NAME}) 5 | set(EXE_NAME f.${FULLNAME}) 6 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 7 | target_link_libraries(${EXE_NAME} PUBLIC array_class) 8 | set_target_properties(${EXE_NAME} PROPERTIES 9 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 10 | add_test(NAME ${EXE_NAME} 11 | COMMAND $ 12 | WORKING_DIRECTORY ${TESTS_BINDIR}) 13 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS fortran) 14 | endforeach() 15 | -------------------------------------------------------------------------------- /tests/fortran_allocator/dualspace.f90: -------------------------------------------------------------------------------- 1 | module dualspace 2 | use iso_c_binding 3 | use iso_fortran_env 4 | implicit none 5 | 6 | type dualspace_base_type 7 | character, dimension(:), pointer :: data_f => NULL () 8 | integer(int64) :: data_length = 0 9 | type(c_ptr) :: data_ptr = c_null_ptr 10 | contains 11 | procedure :: resize_base 12 | final :: deallocate_data 13 | end type 14 | 15 | type, extends(dualspace_base_type) :: dualspace_double_type 16 | integer :: dims = 0 17 | contains 18 | procedure :: resize => resize_double 19 | procedure :: data => data_double 20 | end type 21 | 22 | type, extends(dualspace_base_type) :: dualspace_cplx_double_type 23 | integer :: dims = 0 24 | contains 25 | procedure :: resize => resize_cplx_double 26 | procedure :: data => data_cplx_double 27 | end type 28 | 29 | private :: resize_base, allocate_data, deallocate_data 30 | private :: resize_double, resize_cplx_double 31 | private :: data_double, 
data_cplx_double 32 | contains 33 | subroutine resize_base(self, bytes) 34 | implicit none 35 | class(dualspace_base_type), intent(inout) :: self 36 | integer(int64), intent(in) :: bytes 37 | 38 | if (self%data_length .ne. bytes) then 39 | call deallocate_data(self) 40 | call allocate_data(self, bytes) 41 | endif 42 | 43 | self%data_ptr = C_LOC(self%data_f) 44 | end subroutine 45 | 46 | subroutine allocate_data(self, bytes) 47 | implicit none 48 | type(dualspace_base_type), intent(inout) :: self 49 | integer(int64), intent(in) :: bytes 50 | if (bytes > 0) then 51 | allocate(self%data_f(bytes)) 52 | !$omp target enter data map(alloc: self%data_f) 53 | write(*,*) "allocate_data size ", bytes 54 | endif 55 | self%data_length = bytes 56 | end subroutine 57 | 58 | subroutine deallocate_data(self) 59 | implicit none 60 | type(dualspace_base_type), intent(inout) :: self 61 | if (self%data_length > 0) then 62 | write(*,*) "deallocate_data size ", self%data_length 63 | !$omp target exit data map(delete: self%data_f) 64 | deallocate(self%data_f) 65 | endif 66 | self%data_length = 0 67 | end subroutine 68 | 69 | subroutine resize_double(self, num) 70 | class(dualspace_double_type), intent(inout) :: self 71 | integer, intent(in) :: num 72 | real(real64) :: dummy 73 | call self%resize_base(num * sizeof(dummy)) 74 | self%dims = num 75 | end subroutine 76 | 77 | subroutine resize_cplx_double(self, num) 78 | class(dualspace_cplx_double_type), intent(inout) :: self 79 | integer, intent(in) :: num 80 | complex(real64) :: dummy 81 | call self%resize_base(num * sizeof(dummy)) 82 | self%dims = num 83 | end subroutine 84 | 85 | function data_double(self) result(res) 86 | class(dualspace_double_type), intent(inout) :: self 87 | real(real64), dimension(:), pointer :: res 88 | 89 | call C_F_POINTER(self%data_ptr, res, shape=[self%dims]) 90 | end function 91 | 92 | function data_cplx_double(self) result(res) 93 | class(dualspace_cplx_double_type), intent(inout) :: self 94 | complex(real64), 
dimension(:), pointer :: res 95 | 96 | call C_F_POINTER(self%data_ptr, res, shape=[self%dims]) 97 | end function 98 | end module 99 | -------------------------------------------------------------------------------- /tests/fortran_allocator/dualspace_array_device.f90: -------------------------------------------------------------------------------- 1 | subroutine test 2 | use dualspace 3 | implicit none 4 | type(dualspace_double_type) :: abc 5 | real(8), dimension(:), pointer :: abc_data 6 | integer, parameter :: Ntotal = 1000 7 | integer :: i, Nsum 8 | 9 | call abc%resize(Ntotal) 10 | 11 | ! initialize values 12 | abc_data => abc%data() 13 | !$omp target teams distribute parallel do map(always, from:abc_data) 14 | do i = 1, Ntotal 15 | abc_data(i) = i 16 | enddo 17 | 18 | ! do a sum 19 | Nsum = 0 20 | !$omp target teams distribute parallel do reduction(+: Nsum) 21 | do i = 1, Ntotal 22 | Nsum = Nsum + abc_data(i) 23 | enddo 24 | 25 | write(*,*) "Nsum = ", Nsum 26 | 27 | if (Nsum /= 500500) stop 1 28 | !write(*,*) "end of subroutine" 29 | end subroutine test 30 | 31 | program main 32 | call test() 33 | !write(*,*) "end of program" 34 | end program main 35 | -------------------------------------------------------------------------------- /tests/fortran_allocator/dualspace_array_device_isptr.f90: -------------------------------------------------------------------------------- 1 | subroutine sum_on_device(array, array_size) 2 | implicit none 3 | integer, intent(in) :: array_size 4 | real(kind = 8), intent(in) :: array(1:array_size) 5 | integer :: i, Nsum 6 | 7 | ! 
do a sum 8 | Nsum = 0 9 | !$omp target teams distribute parallel do reduction(+: Nsum) has_device_addr(array) 10 | do i = 1, array_size 11 | Nsum = Nsum + array(i) 12 | enddo 13 | 14 | write(*,*) "Nsum = ", Nsum 15 | 16 | if (Nsum /= 500500) stop 1 17 | end subroutine 18 | 19 | subroutine test 20 | use dualspace 21 | implicit none 22 | type(dualspace_double_type) :: abc 23 | real(8), dimension(:), pointer :: abc_data 24 | integer, parameter :: Ntotal = 1000 25 | integer :: i 26 | 27 | call abc%resize(Ntotal) 28 | abc_data => abc%data() 29 | 30 | ! initialize values 31 | !$omp target teams distribute parallel do map(always, from:abc_data) 32 | do i = 1, Ntotal 33 | abc_data(i) = i 34 | enddo 35 | 36 | !$omp target data use_device_addr(abc_data) 37 | call sum_on_device(abc_data, size(abc_data)) 38 | !$omp end target data 39 | 40 | !write(*,*) "end of subroutine" 41 | end subroutine test 42 | 43 | program main 44 | call test() 45 | !write(*,*) "end of program" 46 | end program main 47 | -------------------------------------------------------------------------------- /tests/fortran_allocator/dualspace_array_resize.f90: -------------------------------------------------------------------------------- 1 | subroutine fill_density(density) 2 | use dualspace, only: dualspace_double_type, dualspace_cplx_double_type 3 | implicit none 4 | type(dualspace_double_type), intent(inout) :: density 5 | real(8), dimension(:), pointer :: density_data 6 | integer, parameter :: test_size =20 7 | integer :: i 8 | 9 | call density%resize(test_size) 10 | density_data => density%data() 11 | write(*,*) "density_data size ", size(density_data) 12 | do i = 1, test_size 13 | density_data(i) = 1. 14 | enddo 15 | !$omp target update to(density_data) 16 | !$omp target teams distribute parallel do map(always, from: density_data) 17 | do i = 1, test_size 18 | density_data(i) = density_data(i) + i * 1. 19 | enddo 20 | 21 | if (density_data(3).ne.4.) 
stop 1 22 | end subroutine 23 | 24 | program abc 25 | use dualspace, only: dualspace_double_type, dualspace_cplx_double_type 26 | implicit none 27 | 28 | type(dualspace_double_type) :: density, density2 29 | real(8), dimension(:), pointer :: density_data 30 | call density%resize(10) 31 | density_data => density%data() 32 | write(*,*) "density_data size ", size(density_data) 33 | 34 | call fill_density(density2) 35 | density_data => density2%data() 36 | write(*,*) "density_data(3) should be 3. Current value ", density_data(3) 37 | 38 | block 39 | type(dualspace_cplx_double_type) :: wf 40 | complex(8), dimension(:), pointer :: wf_data 41 | call wf%resize(30) 42 | wf_data => wf%data() 43 | write(*,*) "wf_data size ", size(wf_data) 44 | end block 45 | end program 46 | -------------------------------------------------------------------------------- /tests/fortran_use_device_ptr/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(FULLNAME use_device_ptr_target) 2 | set(EXE_NAME f.${FULLNAME}) 3 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 4 | set_target_properties(${EXE_NAME} PROPERTIES 5 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 6 | add_test(NAME ${EXE_NAME} 7 | COMMAND $ 8 | WORKING_DIRECTORY ${TESTS_BINDIR}) 9 | set_tests_properties(${EXE_NAME} PROPERTIES 10 | LABELS fortran) 11 | -------------------------------------------------------------------------------- /tests/fortran_use_device_ptr/use_device_ptr_target.f90: -------------------------------------------------------------------------------- 1 | program test_use_device_ptr 2 | implicit none 3 | double precision :: alpha 4 | integer, parameter :: lda = 10 5 | double precision, allocatable :: mat(:, :) 6 | 7 | allocate(mat(lda, lda)) 8 | call dgemm(lda, mat) 9 | 10 | contains 11 | subroutine dgemm(lda, a) 12 | implicit none 13 | integer :: lda 14 | double precision, target:: a(lda,lda) ! 
need target attribute to use c_loc 15 | !$omp target data use_device_addr(a) map(a) 16 | !call cublas_dgemm('T','N',M,N,K,alpha,c_loc(A),LDA,c_loc(b) +,LDB,beta,c_loc(c),LDC) 17 | !$omp end target data 18 | end subroutine 19 | end program 20 | -------------------------------------------------------------------------------- /tests/global_variable/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(global_static) 2 | add_subdirectory(constexpr) 3 | add_subdirectory(global_pointer) 4 | -------------------------------------------------------------------------------- /tests/global_variable/constexpr/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME constexpr) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/global_variable/constexpr/constexpr.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #define LENGTH 2 3 | int main() 4 | { 5 | constexpr double h_chebyshev_coefs[LENGTH] = { 0, 2.1 }; 6 | #pragma omp target enter data map(to:h_chebyshev_coefs[0:LENGTH]) 7 | #pragma omp target 8 | { 9 | printf("print in target %lf %lf\n", h_chebyshev_coefs[0], h_chebyshev_coefs[1]); 10 | } 11 | return 0; 12 | } 13 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX AND CXX_OFFLOAD_RUNTIME_OKAY) 2 | set(FULLNAME global_pointer) 3 | 
set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} main.cpp global.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/Makefile: -------------------------------------------------------------------------------- 1 | MPILER = amd 2 | 3 | ifeq ($(COMPILER),intel) 4 | CC = icpx 5 | CFLAGS = -Wall -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ 6 | endif 7 | 8 | ifeq ($(COMPILER),ibm) 9 | CC = xlc_r 10 | CFLAGS = -Wall -qsmp=omp -qoffload 11 | endif 12 | 13 | ifeq ($(COMPILER),llvm) 14 | CC = clang++ 15 | CFLAGS = -Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 -g 16 | endif 17 | 18 | ifeq ($(COMPILER),amd) 19 | CC = clang++ 20 | CFLAGS = -Wall -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 21 | endif 22 | 23 | program = test 24 | 25 | source = main.cpp global.cpp 26 | 27 | obj = $(source:.cpp=.o) 28 | 29 | deps = Makefile 30 | 31 | $(program): $(obj) $(deps) 32 | $(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS) 33 | 34 | %.o: %.cpp $(deps) 35 | $(CC) $(CFLAGS) -c $< -o $@ 36 | 37 | clean: 38 | rm -rf $(program) $(obj) 39 | 40 | edit: 41 | vim -p $(source) $(deps) 42 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/global.cpp: -------------------------------------------------------------------------------- 1 | #include "global.h" 2 | 3 | #pragma omp declare target 4 | int * device_arr; 5 | #pragma omp end declare target 6 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/global.h: 
-------------------------------------------------------------------------------- 1 | #ifndef GLOBAL_H 2 | #define GLOBAL_H 3 | 4 | #pragma omp declare target 5 | extern int * device_arr; 6 | #pragma omp end declare target 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "global.h" 5 | 6 | void foo(int i) 7 | { 8 | device_arr[i] *= 2; 9 | } 10 | 11 | int main(void) 12 | { 13 | int host_id = omp_get_initial_device(); 14 | int device_id = omp_get_default_device(); 15 | 16 | int N = 5; 17 | 18 | // Allocate and initialize host array 19 | size_t sz = N * sizeof(int); 20 | int * host_arr = (int *) malloc(sz); 21 | for( int i = 0; i < N; i++ ) 22 | { 23 | host_arr[i] = i; 24 | } 25 | 26 | // Allocate device array and copy data from host -> device 27 | device_arr = (int *) omp_target_alloc(sz, device_id); 28 | omp_target_memcpy(device_arr, host_arr, sz, 0, 0, device_id, host_id); 29 | #pragma omp target update to(device_arr) 30 | 31 | // Execute device kernel 32 | #pragma omp target teams distribute parallel for 33 | for( int i = 0; i < N; i++) 34 | { 35 | foo(i); 36 | } 37 | 38 | // Copy data from device -> host 39 | omp_target_memcpy(host_arr, device_arr, sz, 0, 0, host_id, device_id); 40 | 41 | // Return non-zero error code if we failed 42 | return host_arr[4] != 8; 43 | } 44 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME global_static) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} main.cpp data.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME 
${EXE_NAME} 8 | COMMAND $ 9 | WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/Makefile: -------------------------------------------------------------------------------- 1 | CXX=icpx 2 | OFFLOAD_FLAGS=-fiopenmp -fopenmp-targets=spir64 3 | 4 | .PHONY: clean 5 | 6 | a.out: main.cpp data.o 7 | ${CXX} ${OFFLOAD_FLAGS} main.cpp data.o 8 | data.o: data.cpp data.hpp 9 | ${CXX} ${OFFLOAD_FLAGS} -c data.cpp 10 | 11 | clean: 12 | rm data.o a.out 13 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/data.cpp: -------------------------------------------------------------------------------- 1 | #include "data.hpp" 2 | 3 | #pragma omp declare target 4 | template <> 5 | const float engine::params[4] = {1.0f, 2.0f, 3.0f, 4.0f}; 6 | 7 | template <> 8 | const double engine::params[4] = {1.0, 2.0, 3.0, 4.0}; 9 | #pragma omp end declare target 10 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/data.hpp: -------------------------------------------------------------------------------- 1 | template 2 | class engine 3 | { 4 | public: 5 | static const T params[4]; 6 | }; 7 | 8 | template <> 9 | const float engine::params[4]; 10 | 11 | template <> 12 | const double engine::params[4]; 13 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "data.hpp" 3 | 4 | template 5 | void check() 6 | { 7 | T params_check[4]; 8 | engine mine; 9 | 10 | #pragma omp target map(from:params_check[:4]) 11 | { 12 | for(int i=0; i<4; i++) 13 | params_check[i] = mine.params[i]; 14 | } 15 | assert(params_check[0] == 
T(1.0)); 16 | assert(params_check[1] == T(2.0)); 17 | assert(params_check[2] == T(3.0)); 18 | assert(params_check[3] == T(4.0)); 19 | } 20 | 21 | int main() 22 | { 23 | check(); 24 | check(); 25 | } 26 | -------------------------------------------------------------------------------- /tests/implict_async/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX AND CMAKE_CXX_COMPILER_ID MATCHES "Clang") 2 | foreach(NAME llvm_alloc_host llvm_alloc_host_data) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/implict_async/llvm_alloc_host.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #ifndef __clang_major__ 5 | #error Need clang extension 6 | #endif 7 | 8 | // expose header free extensions 9 | extern "C" { 10 | void *llvm_omp_target_alloc_host(size_t, int); 11 | void llvm_omp_target_free_host(void *, int); 12 | } 13 | 14 | int main() { 15 | const int N = 64; 16 | const auto default_device = omp_get_default_device(); 17 | #pragma omp target device(default_device) 18 | { int a = N; } 19 | 20 | int* hst_ptr = (int*) llvm_omp_target_alloc_host(N * sizeof(int), default_device); 21 | 22 | for (int i = 0; i < N; ++i) 23 | hst_ptr[i] = 2; 24 | 25 | #pragma omp target teams distribute parallel for device(default_device) map(tofrom : hst_ptr[: N]) 26 | for (int i = 0; i < N; ++i) 27 | hst_ptr[i] -= 1; 28 | 29 | int sum = 0; 30 | for (int i = 0; i < N; ++i) 31 | sum += hst_ptr[i]; 32 | 33 | llvm_omp_target_free_host(hst_ptr, 
default_device); 34 | 35 | if (sum == N) 36 | std::cout << "Correct Sum" << std::endl; 37 | else 38 | std::cout << "Wrong Sum " << sum << "! It should be " << N << std::endl; 39 | 40 | return 0; 41 | } 42 | 43 | -------------------------------------------------------------------------------- /tests/implict_async/llvm_alloc_host_data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #ifndef __clang_major__ 5 | #error Need clang extension 6 | #endif 7 | 8 | // expose header free extensions 9 | extern "C" { 10 | void *llvm_omp_target_alloc_host(size_t, int); 11 | void llvm_omp_target_free_host(void *, int); 12 | } 13 | 14 | int main() { 15 | const int N = 64; 16 | const auto default_device = omp_get_default_device(); 17 | #pragma omp target device(default_device) 18 | { int a = N; } 19 | 20 | int* hst_ptr = (int*) llvm_omp_target_alloc_host(N * sizeof(int), default_device); 21 | #pragma omp target enter data device(default_device) map(alloc : hst_ptr[: N]) 22 | 23 | for (int i = 0; i < N; ++i) 24 | hst_ptr[i] = 2; 25 | 26 | #pragma omp target teams distribute parallel for device(default_device) map(always, tofrom : hst_ptr[: N]) 27 | for (int i = 0; i < N; ++i) 28 | hst_ptr[i] -= 1; 29 | 30 | int sum = 0; 31 | for (int i = 0; i < N; ++i) 32 | sum += hst_ptr[i]; 33 | 34 | #pragma omp target exit data device(default_device) map(delete : hst_ptr[: N]) 35 | llvm_omp_target_free_host(hst_ptr, default_device); 36 | 37 | if (sum == N) 38 | std::cout << "Correct Sum" << std::endl; 39 | else 40 | std::cout << "Wrong Sum " << sum << "! 
It should be " << N << std::endl; 41 | 42 | return 0; 43 | } 44 | 45 | -------------------------------------------------------------------------------- /tests/linking/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(link_static_fat_bin) 2 | add_subdirectory(linker_outlined_func) 3 | add_subdirectory(two_identical_templates) 4 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME link_static_fat_bin) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_library(mylib classA.cpp) 5 | add_executable(${EXE_NAME} main.cpp) 6 | target_link_libraries(${EXE_NAME} mylib) 7 | set_target_properties(${EXE_NAME} PROPERTIES 8 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 9 | add_test(NAME ${EXE_NAME} 10 | COMMAND $ 11 | WORKING_DIRECTORY ${TESTS_BINDIR}) 12 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/classA.cpp: -------------------------------------------------------------------------------- 1 | #include "classA.h" 2 | 3 | template 4 | void tester::run() 5 | { 6 | #pragma omp target 7 | { 8 | T a; 9 | } 10 | } 11 | 12 | template class tester; 13 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/classA.h: -------------------------------------------------------------------------------- 1 | template 2 | class tester 3 | { 4 | public: 5 | void run(); 6 | }; 7 | 8 | extern template class tester; 9 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/compile-amd.sh: -------------------------------------------------------------------------------- 1 | clang++ 
-fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 -c classA.cpp 2 | rm -f libmy.a 3 | llvm-ar qc libmy.a classA.o 4 | llvm-ranlib libmy.a 5 | clang++ -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 main.cpp libmy.a 6 | ./a.out 7 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/compile-x86.sh: -------------------------------------------------------------------------------- 1 | clang++ -fopenmp -fopenmp-targets=x86_64-pc-linux-gnu -c classA.cpp 2 | rm -f libmy.a 3 | ar qc libmy.a classA.o 4 | ranlib libmy.a 5 | clang++ -fopenmp -fopenmp-targets=x86_64-pc-linux-gnu main.cpp -L. -lmy 6 | ./a.out 7 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/compile.sh: -------------------------------------------------------------------------------- 1 | clang++ -fopenmp -fopenmp-targets=nvptx64 -c classA.cpp 2 | rm -f libmy.a 3 | ar qc libmy.a classA.o 4 | ranlib libmy.a 5 | clang++ -fopenmp -fopenmp-targets=nvptx64 main.cpp libmy.a 6 | ./a.out 7 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/main.cpp: -------------------------------------------------------------------------------- 1 | #include "classA.h" 2 | 3 | int main() 4 | { 5 | tester A; 6 | A.run(); 7 | return 0; 8 | } 9 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME linker_outlined_function_collision) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} main.cpp a.cpp b.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | 
WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/a.cpp: -------------------------------------------------------------------------------- 1 | #include "ab.h" 2 | #include "compute.h" 3 | void a() 4 | { 5 | const int N = 1000; 6 | #pragma omp target 7 | { 8 | float A[N]; 9 | compute(A, N); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/ab.h: -------------------------------------------------------------------------------- 1 | void a(); 2 | void b(); 3 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/b.cpp: -------------------------------------------------------------------------------- 1 | #include "ab.h" 2 | #include "compute.h" 3 | void b() 4 | { 5 | const int N = 1000; 6 | #pragma omp target 7 | { 8 | float A[N]; 9 | compute(A, N); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/compile.sh: -------------------------------------------------------------------------------- 1 | CXX=xlC_r 2 | CXX_FLAGS="-qsmp=omp -qoffload" 3 | 4 | $CXX $CXX_FLAGS -c a.cpp 5 | $CXX $CXX_FLAGS -c b.cpp 6 | $CXX $CXX_FLAGS main.cpp a.o b.o 7 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/compute.h: -------------------------------------------------------------------------------- 1 | inline void compute(float* a, int size) 2 | { 3 | #pragma omp parallel for 4 | for (int i = 0; i < size; i++) 5 | a[i] *= 2.0f; 6 | } 7 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/main.cpp: 
-------------------------------------------------------------------------------- 1 | #include "ab.h" 2 | int main() 3 | { 4 | a(); 5 | b(); 6 | return 0; 7 | } 8 | -------------------------------------------------------------------------------- /tests/linking/missing_bundles/boo.cpp: -------------------------------------------------------------------------------- 1 | int boo() 2 | { return 0; } 3 | -------------------------------------------------------------------------------- /tests/linking/missing_bundles/compile.sh: -------------------------------------------------------------------------------- 1 | clang++ -fopenmp -fopenmp-targets=nvptx64 -c foo.cpp 2 | clang++ -fopenmp -c boo.cpp 3 | clang++ -fopenmp -fopenmp-targets=nvptx64 main.cpp boo.o foo.o 4 | #nvlink fatal : Could not open input file '/tmp/foo-7e0588.cubin' 5 | #clang-14: error: nvlink command failed with exit code 1 (use -v to see invocation) 6 | -------------------------------------------------------------------------------- /tests/linking/missing_bundles/foo.cpp: -------------------------------------------------------------------------------- 1 | int foo() 2 | { return 0; } 3 | -------------------------------------------------------------------------------- /tests/linking/missing_bundles/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int foo(); 4 | int boo(); 5 | 6 | int main() 7 | { 8 | foo(); 9 | boo(); 10 | } 11 | -------------------------------------------------------------------------------- /tests/linking/two_identical_templates/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME linker_identical_template) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} main.cpp test_a.cpp test_b.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | 
WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/linking/two_identical_templates/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void test_a(); 5 | void test_b(); 6 | 7 | int main() 8 | { 9 | test_a(); 10 | test_b(); 11 | } 12 | -------------------------------------------------------------------------------- /tests/linking/two_identical_templates/test_a.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | void test_map() 6 | { 7 | std::cout << "map(complex<>)" << std::endl; 8 | std::complex a(0.2, 1), a_check; 9 | #pragma omp target map(from : a_check) 10 | { 11 | a_check = a; 12 | } 13 | } 14 | 15 | void test_a() 16 | { 17 | test_map(); 18 | } 19 | -------------------------------------------------------------------------------- /tests/linking/two_identical_templates/test_b.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | void test_map() 6 | { 7 | std::cout << "map(complex<>)" << std::endl; 8 | std::complex a(0.2, 1), a_check; 9 | #pragma omp target map(from : a_check) 10 | { 11 | a_check = a; 12 | } 13 | } 14 | 15 | void test_b() 16 | { 17 | test_map(); 18 | } 19 | -------------------------------------------------------------------------------- /tests/map/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX AND CXX_OFFLOAD_RUNTIME_OKAY) 2 | foreach(NAME pointer_api this_with_virtual struct_with_const first_private_this_wrong) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.map_${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY 
${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | 15 | if (ENABLE_Fortran) 16 | foreach(NAME implicit_map_alloc) 17 | set(FULLNAME ${NAME}) 18 | set(EXE_NAME f.map_${FULLNAME}) 19 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 20 | target_link_libraries(${EXE_NAME} dummy_openmp_runtime) 21 | set_target_properties(${EXE_NAME} PROPERTIES 22 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 23 | add_test(NAME ${EXE_NAME} 24 | COMMAND $ 25 | WORKING_DIRECTORY ${TESTS_BINDIR}) 26 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS fortran) 27 | endforeach() 28 | endif() 29 | -------------------------------------------------------------------------------- /tests/map/check_transfer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() 4 | { 5 | int in = 1; 6 | int out = 0; 7 | int n = 2; 8 | for(int i = 0; i < n; i++) 9 | { 10 | #pragma omp target map(from:out) 11 | { 12 | out = in * 2; 13 | } 14 | } 15 | assert( out == in * 2 ); 16 | } 17 | -------------------------------------------------------------------------------- /tests/map/declare_target_global.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #pragma omp declare target 6 | int * arr; 7 | 8 | void foo(int i) 9 | { 10 | printf("device address %d %p\n", i, arr); 11 | } 12 | #pragma omp end declare target 13 | 14 | int main(void) 15 | { 16 | // Allocate array and set to zero 17 | int len = 3; 18 | arr = (int *) calloc( len, sizeof(int) ); 19 | 20 | printf("arr omp_target_is_present %d\n", omp_target_is_present(arr, 0)); 21 | #pragma omp target data use_device_ptr(arr) 22 | { 23 | printf("arr initial device address %p\n", arr); 24 | } 25 | 26 | #pragma omp target data map(tofrom: arr[:len]) 27 | { 28 | printf("arr host address %p\n", 
arr); 29 | printf("arr omp_target_is_present %d\n", omp_target_is_present(arr, 0)); 30 | #pragma omp target data use_device_ptr(arr) 31 | { 32 | printf("arr device address inside map %p\n", arr); 33 | } 34 | 35 | #pragma omp target teams distribute parallel for 36 | for( int i = 0; i < len; i++) 37 | foo(i); 38 | } 39 | 40 | printf("arr omp_target_is_present %d\n", omp_target_is_present(arr, 0)); 41 | #pragma omp target data use_device_ptr(arr) 42 | { 43 | printf("arr final device address %p\n", arr); 44 | } 45 | 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /tests/map/first_private_this_wrong.cpp: -------------------------------------------------------------------------------- 1 | //#include 2 | #include 3 | 4 | template struct base { 5 | T abc[20]; 6 | T de[200]; 7 | T compute() { return de[0]; } 8 | }; 9 | 10 | template struct foo : protected base { 11 | foo() { 12 | #pragma omp target enter data map(to : this[:1]) 13 | } 14 | ~foo() { 15 | #pragma omp target exit data map(delete : this[:1]) 16 | } 17 | 18 | void target_compute() { 19 | //std::cout << " ***** is this[:1] mapped? 
" << omp_target_is_present(this, omp_get_default_device()) << std::endl; 20 | #pragma omp target teams 21 | #pragma omp parallel 22 | { T a = base::compute(); } 23 | } 24 | }; 25 | 26 | int main() { 27 | foo a; 28 | a.target_compute(); 29 | } 30 | -------------------------------------------------------------------------------- /tests/map/implicit_map_alloc.f90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | type foobar 4 | real(8), dimension(:), pointer :: foo, bar 5 | end type 6 | type(foobar) :: this 7 | integer, parameter :: n = 1024 8 | integer :: i 9 | real(8), dimension(:), pointer :: bar_ptr 10 | 11 | allocate(this%foo(n), this%bar(n)) 12 | this%foo = 1d0 13 | !$omp target enter data map(to:this%foo) map(alloc:this%bar) 14 | 15 | bar_ptr => this%bar 16 | !$omp target teams distribute parallel do 17 | do i = 1,n 18 | bar_ptr(i) = 3d0 19 | enddo 20 | !$omp end target teams distribute parallel do 21 | 22 | !$omp target update from(this%bar) 23 | 24 | if (all(this%foo < this%bar)) then 25 | print *,"Success!" 
26 | else 27 | write(*,*) this%foo(3), this%bar(1), this%bar(2) 28 | stop 1 29 | endif 30 | 31 | end program main 32 | -------------------------------------------------------------------------------- /tests/map/map_class_member.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template 4 | struct maptest 5 | { 6 | constexpr static int size = 6; 7 | T data[size]; 8 | 9 | maptest() 10 | { 11 | std::cout << "before enter data\n"; 12 | #pragma omp target enter data map(alloc:data[:6]) 13 | std::cout << "done with enter data\n"; 14 | } 15 | 16 | ~maptest() 17 | { 18 | std::cout << "before exit data\n"; 19 | #pragma omp target exit data map(delete:data[:6]) 20 | std::cout << "done with exit data\n"; 21 | } 22 | }; 23 | 24 | int main() 25 | { 26 | maptest a; 27 | } 28 | -------------------------------------------------------------------------------- /tests/map/map_delete_inside_data.cpp: -------------------------------------------------------------------------------- 1 | int main() 2 | { 3 | int a[100]; 4 | #pragma omp target enter data map(alloc:a) 5 | #pragma omp target data map(alloc:a) 6 | { 7 | #pragma omp target exit data map(delete:a) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /tests/map/map_threads.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | const int N = 200000000; 7 | std::vector vec(N, 1.1); 8 | float* vec_ptr = vec.data(); 9 | for (int it = 0; it < 3; it++) 10 | { 11 | printf("\niteration %d\n", it); 12 | #pragma omp parallel for 13 | for (int i = 0; i < 4; i++) 14 | { 15 | // first hit does the transfer, others start executing the kernel. 
16 | #pragma omp target map(to:vec_ptr[:vec.size()]) 17 | { 18 | printf("tid %d value = %f\n", i, vec_ptr[N - 1 - i]); 19 | } 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tests/map/pointer_api.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | int a[1000]; 7 | std::cout << "before enter data mapped? " << omp_target_is_present(a, omp_get_default_device()) << std::endl; 8 | #pragma omp target enter data map(alloc:a[:1000]) 9 | std::cout << "after enter data mapped? " << omp_target_is_present(a, omp_get_default_device()) << std::endl; 10 | std::cout << "&a[0] mapped? " << omp_target_is_present(&a[0], omp_get_default_device()) << std::endl; 11 | std::cout << "&a[50] mapped? " << omp_target_is_present(&a[50], omp_get_default_device()) << std::endl; 12 | std::cout << "&a[999] mapped? " << omp_target_is_present(&a[999], omp_get_default_device()) << std::endl; 13 | std::cout << "&a[1000] mapped? 
" << omp_target_is_present(&a[1000], omp_get_default_device()) << std::endl; 14 | 15 | int* a_ptr = a; 16 | int* b_ptr = a + 200; 17 | 18 | std::cout << "host pointer " << std::endl 19 | << "a = " << a_ptr << std::endl 20 | << "b = " << b_ptr << std::endl; 21 | 22 | #pragma omp target data use_device_ptr(a_ptr, b_ptr) 23 | { 24 | std::cout << "device pointer " << std::endl 25 | << "a = " << a_ptr << std::endl 26 | << "b = " << b_ptr << std::endl; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/map/struct_with_const.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | struct with_const 6 | { 7 | with_const() 8 | { 9 | #pragma omp target enter data map(to:this[:1]) 10 | } 11 | 12 | constexpr static int size = 6; 13 | static const int b = 12; 14 | std::string name; 15 | }; 16 | 17 | int main() 18 | { 19 | with_const foo; 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /tests/map/this_with_virtual.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | class Base 5 | { 6 | public: 7 | virtual void foo() const = 0; 8 | }; 9 | 10 | class Derived: public Base 11 | { 12 | const int const_value = 8; 13 | 14 | public: 15 | Derived() 16 | { 17 | #pragma omp target enter data map(to: this[:1]) 18 | } 19 | 20 | ~Derived() 21 | { 22 | #pragma omp target exit data map(delete: this[:1]) 23 | } 24 | 25 | void foo() const override {} 26 | 27 | int boo() 28 | { 29 | int res = 0; 30 | #pragma omp target map(from:res) 31 | { 32 | res = const_value; 33 | } 34 | return res; 35 | } 36 | }; 37 | 38 | int main() 39 | { 40 | Derived a; 41 | const int res = a.boo(); 42 | std::cout << "return value " << res << " reference value " << 8 << std::endl; 43 | assert(res == 8); 44 | return 0; 45 | } 46 | 
-------------------------------------------------------------------------------- /tests/math/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME FP_ZERO header_only modf modf_team sqrt_simd sin_cos sin_simd sincos sincos_simd sincos_simd_template modf_in_branch) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.math_${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/math/FP_ZERO.cpp: -------------------------------------------------------------------------------- 1 | //===--- qmcpack_target_math.c --- math lib invocation inside target---------===// 2 | // 3 | // OpenMP API Version 4.5 Nov 2015 4 | // 5 | // 6 | ////===----------------------------------------------------------------------===// 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #define N 1000 13 | bool failed = false; 14 | 15 | void test_math_lib_inside_target() { 16 | 17 | double array[N]; 18 | int errors = 0; 19 | 20 | // Array initialization 21 | for (int i = 0; i < N; ++i) { 22 | array[i] = 0.99; 23 | } 24 | 25 | int c99_zero = FP_ZERO; 26 | 27 | #pragma omp target map(tofrom: array[0:N]) 28 | for (int i = 0; i < N; ++i) { 29 | array[i] = pow((double)i,2.0); 30 | } 31 | 32 | for (int i = 0; i < N; ++i) { 33 | if(fabs(array[i] - pow((double)i,2)) >= 0.000009) 34 | { 35 | std::cout << "failed array[" << i << "] " << array[i] << " ref " << pow((double)i,2) << std::endl; 36 | failed = true; 37 | } 38 | } 39 | } 40 | 41 | int main() { 42 | test_math_lib_inside_target(); 43 | return failed; 44 | } 45 | 
-------------------------------------------------------------------------------- /tests/math/README: -------------------------------------------------------------------------------- 1 | All three cases work if only calling C functions directly by adding -DC_ONLY flag. 2 | clang++ -fopenmp -fopenmp-targets=nvptx64 -DC_ONLY sincos.cpp 3 | 4 | When the code becomes more C++, 5 | 6 | $ clang++ -fopenmp -fopenmp-targets=nvptx64 sincos.cpp 7 | nvlink error : Undefined reference to '_ZL6sincosdPdS_' in '/tmp/sincos-b90610.cubin' 8 | clang-11: error: nvlink command failed with exit code 255 (use -v to see invocation) 9 | 10 | $ clang++ -fopenmp -fopenmp-targets=nvptx64 sin_cos.cpp 11 | fatal error: error in backend: Cannot select: t11: f32 = fsin t10 12 | t10: f32,ch = load<(dereferenceable load 4 from %ir.__x.addr)> t9, FrameIndex:i64<0>, undef:i64 13 | t8: i64 = FrameIndex<0> 14 | t3: i64 = undef 15 | In function: _ZSt3sinf 16 | clang-11: error: clang frontend command failed with exit code 70 (use -v to see invocation) 17 | 18 | $ clang++ -fopenmp -fopenmp-targets=nvptx64 modf.cpp 19 | nvlink error : Undefined reference to '_ZL4modfdPd' in '/tmp/modf-796c89.cubin' 20 | nvlink error : Undefined reference to 'modff' in '/tmp/modf-796c89.cubin' 21 | clang-11: error: nvlink command failed with exit code 255 (use -v to see invocation) 22 | -------------------------------------------------------------------------------- /tests/math/header_only.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | int main() 4 | { 5 | printf("SUCCESS\n"); 6 | return 0; 7 | } 8 | -------------------------------------------------------------------------------- /tests/math/modf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template 4 | void test_modf(T x) 5 | { 6 | T dx; 7 | int intx; 8 | 9 | #pragma omp target map(from: intx, dx) 10 | { 11 | T ipart; 12 | dx = std::modf(x, 
&ipart); 13 | intx = static_cast(ipart); 14 | } 15 | } 16 | 17 | int main() 18 | { 19 | 20 | #if !defined(C_ONLY) 21 | test_modf(1.0); 22 | test_modf(1.0); 23 | #endif 24 | 25 | #pragma omp target 26 | { 27 | double intpart, res; 28 | res = modf(1.1, &intpart); 29 | } 30 | 31 | #pragma omp target 32 | { 33 | float intpart, res; 34 | res = modff(1.1f, &intpart); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /tests/math/modf_in_branch.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | template 6 | int foo(T r, T DeltaRInv) 7 | { 8 | r *= DeltaRInv; 9 | T ipart; 10 | //printf("modf input %lf ptr %p \n", r, &ipart); 11 | const T t = std::modf(r, &ipart); 12 | const int i = (int)ipart; 13 | return i; 14 | } 15 | 16 | int main() 17 | { 18 | int arr[20]; 19 | #pragma omp target teams distribute map(arr) 20 | for(int i = 0; i < 2; i++) 21 | { 22 | double r = 1.3; 23 | double DeltaRInv = 0.3; 24 | #pragma omp parallel for 25 | for(int j = 0; j < 10; j++) 26 | { 27 | if (r * j > 5) 28 | arr[i*10 + j] = foo(r * j, DeltaRInv); 29 | else 30 | arr[i*10 + j] = 0; 31 | } 32 | } 33 | 34 | for(int i = 0; i < 2; i++) 35 | { 36 | double r = 1.3; 37 | double DeltaRInv = 0.3; 38 | for(int j = 0; j < 10; j++) 39 | if (r * j > 5) 40 | { 41 | if (arr[i*10 + j] != foo(r * j, DeltaRInv)) throw std::runtime_error("Wrong foo return value!"); 42 | } 43 | else 44 | { 45 | if (arr[i*10 + j] != 0) throw std::runtime_error("should be zero!"); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/math/modf_team.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | template 6 | inline void getSplineBound(T x, TRESIDUAL& dx, int& ind, int nmax) 7 | { 8 | // lower bound 9 | if (x < 0) 10 | { 11 | ind = 0; 12 | dx = T(0); 13 | } 
14 | else 15 | { 16 | #if defined(USE_FLOOR) 17 | T ipart = std::floor(x); 18 | dx = x - ipart; 19 | #else 20 | T ipart; 21 | dx = std::modf(x, &ipart); 22 | #endif 23 | ind = static_cast(ipart); 24 | // upper bound 25 | if (ind > nmax) 26 | { 27 | ind = nmax; 28 | dx = T(1) - std::numeric_limits::epsilon(); 29 | } 30 | } 31 | } 32 | 33 | int main() 34 | { 35 | using T = float; 36 | T x = 1.25; 37 | T dx = 0; 38 | int ind = 0; 39 | #pragma omp target map(from : dx, ind) 40 | { 41 | getSplineBound(x, dx, ind, 10); 42 | } 43 | 44 | if (x != T(dx + ind)) 45 | { 46 | std::cout << "Error x = " << x << " dx = " << dx << " ind " << ind << std::endl; 47 | return 1; 48 | } 49 | std::cout << "omp target passed!" << std::endl; 50 | 51 | constexpr int N = 100; 52 | T x_arr[N]; 53 | T dx_arr[N]; 54 | int ind_arr[N]; 55 | for (int i = 0; i < N; i++) 56 | x_arr[i] = (i + 1) * 0.25; 57 | 58 | #pragma omp target teams distribute map(to : x_arr[:N]) map(from : dx_arr[:N], ind_arr[:N]) 59 | for (int i = 0; i < N; i++) 60 | getSplineBound(x_arr[i], dx_arr[i], ind_arr[i], 24); 61 | 62 | for (int i = 0; i < N - 1; i++) 63 | if (x_arr[i] != T(dx_arr[i] + ind_arr[i])) 64 | { 65 | std::cout << "Error team = " << i << " x = " << x_arr[i] << " dx = " << dx_arr[i] << " ind " << ind_arr[i] 66 | << std::endl; 67 | return 1; 68 | } 69 | 70 | //special case 71 | { 72 | const int i = N - 1; 73 | if (ind_arr[i] != 24) 74 | { 75 | std::cout << "Error team = " << i << " x = " << x_arr[i] << " dx = " << dx_arr[i] << " ind " << ind_arr[i] 76 | << std::endl; 77 | return 1; 78 | } 79 | } 80 | std::cout << "omp target teams distribute passed!" 
<< std::endl; 81 | } 82 | -------------------------------------------------------------------------------- /tests/math/sin_cos.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | template 7 | void test_sin_cos(T x) 8 | { 9 | T res_sin, res_cos; 10 | 11 | #pragma omp target map(from: res_sin, res_cos) 12 | { 13 | res_sin = std::sin(x); 14 | res_cos = std::cos(x); 15 | } 16 | 17 | if (res_sin != std::sin(x)) 18 | { 19 | std::cout << "sincos sin part " << res_sin << " std::sin " << std::sin(x) << std::endl; 20 | failed = true; 21 | } 22 | if (res_cos != std::cos(x)) 23 | { 24 | std::cout << "sincos cos part " << res_cos << " std::cos " << std::cos(x) << std::endl; 25 | failed = true; 26 | } 27 | } 28 | 29 | int main() 30 | { 31 | 32 | #if !defined(C_ONLY) 33 | test_sin_cos(0.0); 34 | test_sin_cos(0.0); 35 | #endif 36 | 37 | #pragma omp target 38 | { 39 | double res; 40 | res = sin(1.0); 41 | } 42 | 43 | #pragma omp target 44 | { 45 | float res; 46 | res = sinf(1.0f); 47 | } 48 | 49 | return failed; 50 | } 51 | -------------------------------------------------------------------------------- /tests/math/sin_simd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | constexpr size_t N = 128; 8 | 9 | template 10 | void test_sin_simd() 11 | { 12 | T phase[N], sinval[N]; 13 | phase[0] = 0.0; 14 | phase[1] = 0.1; 15 | phase[2] = 0.2; 16 | phase[3] = 0.3; 17 | 18 | #pragma omp simd 19 | for(int i = 0; i < N; i++) 20 | { 21 | sinval[i] = std::sin(phase[i]); 22 | //std::cout << std::setprecision(14) << sinval[i] << std::endl; 23 | } 24 | 25 | std::cout << std::setprecision(14); 26 | std::cout << "sinval[0] " << sinval[0] << " ref " << 0 << std::endl; 27 | std::cout << "sinval[1] " << sinval[1] << " ref " << 0.099833416646828 << std::endl; 28 | std::cout << "sinval[2] " << 
sinval[2] << " ref " << 0.19866933079506 << std::endl; 29 | std::cout << "sinval[3] " << sinval[3] << " ref " << 0.29552020666134 << std::endl; 30 | assert( std::fabs(sinval[0]) < 1e-6); 31 | assert( std::fabs(sinval[1] - 0.099833416646828) < 1e-6); 32 | assert( std::fabs(sinval[2] - 0.19866933079506) < 1e-6); 33 | assert( std::fabs(sinval[3] - 0.29552020666134) < 1e-6); 34 | } 35 | 36 | int main() 37 | { 38 | test_sin_simd(); 39 | test_sin_simd(); 40 | } 41 | -------------------------------------------------------------------------------- /tests/math/sincos.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | // single precision wrapper 7 | inline void sincos(float x, float* __restrict__ sin, float* __restrict__ cos) 8 | { 9 | sincosf(x, sin, cos); 10 | } 11 | 12 | template 13 | void test_sincos(T x) 14 | { 15 | T res_sin, res_cos; 16 | 17 | #pragma omp target map(from: res_sin, res_cos) 18 | { 19 | sincos(x, &res_sin, &res_cos); 20 | } 21 | 22 | if (res_sin != std::sin(x)) 23 | { 24 | std::cout << "sincos sin part " << res_sin << " std::sin " << std::sin(x) << std::endl; 25 | failed = true; 26 | } 27 | if (res_cos != std::cos(x)) 28 | { 29 | std::cout << "sincos cos part " << res_cos << " std::cos " << std::cos(x) << std::endl; 30 | failed = true; 31 | } 32 | } 33 | 34 | int main(int argc, char **argv) 35 | { 36 | 37 | #if !defined(C_ONLY) 38 | test_sincos(0.0); 39 | test_sincos(0.0); 40 | #endif 41 | 42 | #pragma omp target 43 | { 44 | double s, c; 45 | sincos(0, &s, &c); 46 | } 47 | 48 | #pragma omp target 49 | { 50 | float s, c; 51 | sincosf(0.f, &s, &c); 52 | } 53 | 54 | return failed; 55 | } 56 | -------------------------------------------------------------------------------- /tests/math/sincos_simd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | 
constexpr size_t N = 128; 8 | 9 | int main() 10 | { 11 | double phase[N], sinval[N], cosval[N]; 12 | phase[0] = 0.0; 13 | phase[1] = 0.1; 14 | phase[2] = 0.2; 15 | phase[3] = 0.3; 16 | 17 | #pragma omp simd 18 | for(int i = 0; i < N; i++) 19 | { 20 | sincos(phase[i], &sinval[i], &cosval[i]); 21 | //std::cout << std::setprecision(14) << sinval[i] << " " << cosval[i] << std::endl; 22 | } 23 | 24 | std::cout << std::setprecision(14); 25 | std::cout << "sinval[0] " << sinval[0] << " ref " << 0 << std::endl; 26 | std::cout << "sinval[1] " << sinval[1] << " ref " << 0.099833416646828 << std::endl; 27 | std::cout << "sinval[2] " << sinval[2] << " ref " << 0.19866933079506 << std::endl; 28 | std::cout << "sinval[3] " << sinval[3] << " ref " << 0.29552020666134 << std::endl; 29 | std::cout << "cosval[0] " << cosval[0] << " ref " << 1 << std::endl; 30 | std::cout << "cosval[1] " << cosval[1] << " ref " << 0.99500416527803 << std::endl; 31 | std::cout << "cosval[2] " << cosval[2] << " ref " << 0.98006657784124 << std::endl; 32 | std::cout << "cosval[3] " << cosval[3] << " ref " << 0.95533648912561 << std::endl; 33 | assert( std::fabs(sinval[0]) < 1e-6); 34 | assert( std::fabs(sinval[1] - 0.099833416646828) < 1e-6); 35 | assert( std::fabs(sinval[2] - 0.19866933079506) < 1e-6); 36 | assert( std::fabs(sinval[3] - 0.29552020666134) < 1e-6); 37 | assert( std::fabs(cosval[0] - 1) < 1e-6); 38 | assert( std::fabs(cosval[1] - 0.99500416527803) < 1e-6); 39 | assert( std::fabs(cosval[2] - 0.98006657784124) < 1e-6); 40 | assert( std::fabs(cosval[3] - 0.95533648912561) < 1e-6); 41 | } 42 | -------------------------------------------------------------------------------- /tests/math/sincos_simd_template.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | constexpr size_t N = 128; 9 | 10 | inline void sincos(float phi, float* __restrict__ s, float* __restrict__ c) 11 | { 12 
| sincosf(phi, s, c); 13 | } 14 | 15 | template 16 | void test_sincos() 17 | { 18 | T phase[N]; 19 | std::complex scval[N]; 20 | for(int i = 0; i < N; i++) 21 | phase[i] = 0.1 * i; 22 | 23 | T sum_r(0), sum_i(0); 24 | for(int i = 0; i < N; i++) 25 | { 26 | T s, c; 27 | sincos(phase[i], &s, &c); 28 | scval[i] = {s, c}; 29 | sum_r += s; 30 | sum_i += c; 31 | } 32 | 33 | std::cout << std::setprecision(14) << "--------------------------" << std::endl; 34 | std::cout << "sinval[0] " << scval[0].real() << " ref " << 0 << std::endl; 35 | std::cout << "sinval[1] " << scval[1].real() << " ref " << 0.099833416646828 << std::endl; 36 | std::cout << "sinval[2] " << scval[2].real() << " ref " << 0.19866933079506 << std::endl; 37 | std::cout << "sinval[3] " << scval[3].real() << " ref " << 0.29552020666134 << std::endl; 38 | std::cout << "cosval[0] " << scval[0].imag() << " ref " << 1 << std::endl; 39 | std::cout << "cosval[1] " << scval[1].imag() << " ref " << 0.99500416527803 << std::endl; 40 | std::cout << "cosval[2] " << scval[2].imag() << " ref " << 0.98006657784124 << std::endl; 41 | std::cout << "cosval[3] " << scval[3].imag() << " ref " << 0.95533648912561 << std::endl; 42 | assert( std::fabs(scval[0].real()) < 1e-6); 43 | assert( std::fabs(scval[1].real() - 0.099833416646828) < 1e-6); 44 | assert( std::fabs(scval[2].real() - 0.19866933079506) < 1e-6); 45 | assert( std::fabs(scval[3].real() - 0.29552020666134) < 1e-6); 46 | assert( std::fabs(scval[0].imag() - 1) < 1e-6); 47 | assert( std::fabs(scval[1].imag() - 0.99500416527803) < 1e-6); 48 | assert( std::fabs(scval[2].imag() - 0.98006657784124) < 1e-6); 49 | assert( std::fabs(scval[3].imag() - 0.95533648912561) < 1e-6); 50 | 51 | std::cout << "sum_r " << sum_r << " sum_i " << sum_i << std::endl; 52 | assert( std::fabs(sum_r - 0.1556929974475) < 1e-4); 53 | assert( std::fabs(sum_i - 2.3267523980062) < 1e-4); 54 | } 55 | 56 | int main() 57 | { 58 | test_sincos(); 59 | test_sincos(); 60 | } 61 | 
-------------------------------------------------------------------------------- /tests/math/sqrt_simd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | constexpr size_t N = 128; 8 | 9 | template 10 | void test_sqrt_simd() 11 | { 12 | T phase[N], sqrtval[N]; 13 | phase[0] = 0.0; 14 | phase[1] = 0.1; 15 | phase[2] = 0.2; 16 | phase[3] = 0.3; 17 | 18 | #pragma omp simd 19 | for(int i = 0; i < N; i++) 20 | { 21 | sqrtval[i] = std::sqrt(phase[i]); 22 | //std::cout << std::setprecision(14) << sqrtval[i] << std::endl; 23 | } 24 | 25 | std::cout << std::setprecision(14); 26 | std::cout << "sqrtval[0] " << sqrtval[0] << " ref " << 0 << std::endl; 27 | std::cout << "sqrtval[1] " << sqrtval[1] << " ref " << 0.31622776601684 << std::endl; 28 | std::cout << "sqrtval[2] " << sqrtval[2] << " ref " << 0.44721359549996 << std::endl; 29 | std::cout << "sqrtval[3] " << sqrtval[3] << " ref " << 0.54772255750517 << std::endl; 30 | assert( std::fabs(sqrtval[0]) < 1e-6); 31 | assert( std::fabs(sqrtval[1] - 0.31622776601684) < 1e-6); 32 | assert( std::fabs(sqrtval[2] - 0.44721359549996) < 1e-6); 33 | assert( std::fabs(sqrtval[3] - 0.54772255750517) < 1e-6); 34 | } 35 | 36 | int main() 37 | { 38 | test_sqrt_simd(); 39 | test_sqrt_simd(); 40 | } 41 | -------------------------------------------------------------------------------- /tests/omphost/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME host_bug_libomp) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | 
-------------------------------------------------------------------------------- /tests/omphost/README.md: -------------------------------------------------------------------------------- 1 | Bug report 2 | https://bugs.llvm.org/show_bug.cgi?id=42393 3 | 4 | ``` 5 | $ icpx -fiopenmp -fopenmp-targets=spir64 debug.cpp 6 | 7 | $ OMP_NUM_THREADS=2 OMP_TARGET_OFFLOAD=MANDATORY ./a.out 8 | tid = 1 9 | 0 1 2 3 10 | tid = 0 11 | 0 1 2 3 12 | 13 | $ OMP_NUM_THREADS=2 OMP_TARGET_OFFLOAD=DISABLED ./a.out 14 | tid = 0 15 | 0 1 0 0 16 | tid = 1 17 | 0 0 2 3 18 | 19 | $ icpx -fiopenmp debug.cpp 20 | tid = 0 21 | 0 1 0 0 22 | tid = 1 23 | 0 0 2 3 24 | ``` 25 | -------------------------------------------------------------------------------- /tests/omphost/host_bug_libomp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_thread_num() { return 0; } 6 | int omp_get_num_threads() { return 1; } 7 | #endif 8 | 9 | int main() 10 | { 11 | const int size = 4; 12 | int wrong_counts = 0; 13 | #pragma omp parallel reduction(+:wrong_counts) 14 | { 15 | int A[size]; 16 | for(int i = 0; i < size; i++) 17 | A[i] = 0; 18 | 19 | #pragma omp target teams distribute map(tofrom: A[:size]) 20 | for(int i = 0; i < size; i++) 21 | { 22 | A[i] = i; 23 | } 24 | 25 | #pragma omp critical 26 | { 27 | std::cout << "tid = " << omp_get_thread_num() << std::endl; 28 | for(int i = 0; i < size; i++) 29 | { 30 | if (A[i] != i) wrong_counts++; 31 | std::cout << " " << A[i]; 32 | } 33 | std::cout << std::endl; 34 | } 35 | } 36 | 37 | if (wrong_counts) 38 | std::cout << "Wrong!" << std::endl; 39 | else 40 | std::cout << "Right!" 
<< std::endl; 41 | return wrong_counts; 42 | } 43 | -------------------------------------------------------------------------------- /tests/private/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME teams_private__distribute _teams_private__distribute teams_distribute_private teams__distribute_private _teams_distribute_private _teams__distribute_private teams_distribute_parallel_for_private) 3 | set(FULLNAME target_${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | 15 | if (ENABLE_Fortran) 16 | foreach(NAME teams_distribute_parallel_for_private teams_distribute_private local_block) 17 | set(FULLNAME target_${NAME}) 18 | set(EXE_NAME f.${FULLNAME}) 19 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 20 | target_link_libraries(${EXE_NAME} dummy_openmp_runtime) 21 | set_target_properties(${EXE_NAME} PROPERTIES 22 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 23 | add_test(NAME ${EXE_NAME} 24 | COMMAND $ 25 | WORKING_DIRECTORY ${TESTS_BINDIR}) 26 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS fortran) 27 | endforeach() 28 | endif() 29 | -------------------------------------------------------------------------------- /tests/private/run_all.sh: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CC_FLAGS="-O3 -fopenmp -foffload=nvptx-none" 3 | 4 | #CC=clang 5 | #CC_FLAGS="-O3 -fopenmp -fopenmp-targets=nvptx64" 6 | 7 | for name in *.c 8 | do 9 | echo Testing $name 10 | $CC $CC_FLAGS $name 11 | ./a.out 12 | echo 13 | done 14 | -------------------------------------------------------------------------------- 
/tests/private/target__teams__distribute_private.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 10 | const int Nteams = 2; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | float* a_p; 15 | #pragma omp target map(from:pointer[:Nteams], a_p, team_ID[:Nteams]) 16 | #pragma omp teams num_teams(Nteams) 17 | { 18 | a_p = &a; 19 | #pragma omp distribute private(a) 20 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 10 | const int Nteams = 2; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | #pragma omp target map(from:pointer[:Nteams], team_ID[:Nteams]) 15 | { 16 | #pragma omp teams distribute num_teams(Nteams) private(a) 17 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 10 | const int Nteams = 2; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | #pragma omp target map(from:pointer[:Nteams], team_ID[:Nteams]) 15 | #pragma omp teams num_teams(Nteams) private(a) 16 | { 17 | #pragma omp distribute 18 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | int omp_get_thread_num() { return 1; } 7 | #endif 8 | 9 | int main() 10 | { 11 | const int Nteams = 3; 12 | const int Nthreads = 3; 13 | const int Ntot = Nteams*Nthreads; 14 | void* pointer[Ntot]; 15 | int team_ID[Ntot]; 16 | int thread_ID[Ntot]; 17 | float a; 18 | #pragma omp target teams distribute parallel for num_teams(Nteams) thread_limit(Nthreads) private(a) map(from:pointer[:Ntot], team_ID[:Ntot], thread_ID[:Ntot]) 19 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 
10 | const int Nteams = 3; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | #pragma omp target teams distribute num_teams(Nteams) private(a) map(from:pointer[:Nteams], team_ID[:Nteams]) 15 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 10 | const int Nteams = 2; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | #pragma omp target teams num_teams(Nteams) private(a) map(from:pointer[:Nteams], team_ID[:Nteams]) 15 | { 16 | #pragma omp distribute 17 | for(int i = 0; i 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/reduction/README.md: -------------------------------------------------------------------------------- 1 | # GCC requres UDR 2 | ``` 3 | g++ -O3 -fopenmp -foffload=disable array_reduction.cpp && ./a.out 4 | g++ -O3 -fopenmp -foffload=nvptx-none -foffload-options="-lm -latomic" array_reduction.cpp && ./a.out 5 | ``` 6 | 7 | # Clang allows both with and without UDR 8 | ``` 9 | clang++ -O3 -fopenmp array_reduction.cpp && ./a.out 10 | clang++ -O3 -fopenmp -fopenmp-targets=nvptx64 array_reduction.cpp && ./a.out 11 | clang++ -O3 -fopenmp -D__NO_UDR array_reduction.cpp && ./a.out 12 | clang++ -O3 -fopenmp -fopenmp-targets=nvptx64 -D__NO_UDR array_reduction.cpp && ./a.out 13 | ``` 14 | 15 | # NVHPC disallow UDR 16 | ``` 17 | nvc++ -O3 -mp -D__NO_UDR array_reduction.cpp # doesn't work 18 | nvc++ -O3 -mp=gpu -D__NO_UDR array_reduction.cpp # doesn't work 19 | ``` 20 | -------------------------------------------------------------------------------- /tests/reduction/array_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | struct GradType 5 | { 6 | float X[3]{0, 0, 0}; 7 | float 
operator[](size_t i) const { return X[i]; } 8 | float& operator[](size_t i) { return X[i]; } 9 | }; 10 | 11 | GradType& operator+=(GradType& a, const GradType& b) 12 | { 13 | for (int i = 0; i < 3; i++) 14 | a[i]+=b[i]; 15 | return a; 16 | } 17 | 18 | #if !defined(__NO_UDR) 19 | #pragma omp declare reduction(+ : GradType : omp_out += omp_in) 20 | #endif 21 | 22 | void test_size(size_t N) 23 | { 24 | std::cout << std::endl << "Testing size " << N << std::endl; 25 | GradType grads{0, 0, 0}; 26 | #pragma omp parallel for reduction(+: grads) 27 | for (int i = 0; i 1e-6 || std::abs(grads_saved[1] * 2 - grads[1]) > 1e-6 || std::abs(grads_saved[2] * 2 - grads[2]) > 1e-6) 41 | { 42 | std::cout << "Failed!" << std::endl; 43 | exit(1); 44 | } 45 | else 46 | std::cout << "Passed!" << std::endl; 47 | } 48 | 49 | int main() 50 | { 51 | std::cout << "Start testing!" << std::endl; 52 | test_size(9); 53 | test_size(3); 54 | test_size(5); 55 | test_size(7); 56 | test_size(13); 57 | test_size(15); 58 | test_size(17); 59 | test_size(25); 60 | test_size(31); 61 | test_size(35); 62 | test_size(65); 63 | test_size(8); 64 | test_size(16); 65 | test_size(32); 66 | test_size(64); 67 | test_size(128); 68 | test_size(256); 69 | std::cout << "End testing!" 
<< std::endl; 70 | } 71 | -------------------------------------------------------------------------------- /tests/sollve_vv/sollve_vv_aomp.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/SOLLVE/sollve_vv.git 2 | # this is so we can avoid hanging 3 | sed -i s/timeout/"timeout -s 9"/ sollve_vv/sys/scripts/run_test.sh 4 | cd sollve_vv 5 | OFFLOAD_FLAG="-fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906" 6 | make OMP_VERSION=4.5 CC="clang -std=c99 $OFFLOAD_FLAG" \ 7 | CXX="clang++ -std=c++11 $OFFLOAD_FLAG" \ 8 | FC="flang $OFFLOAD_FLAG" \ 9 | VERBOSE_TESTS=1 VERBOSE=1 LOG=1 LOG_ALL=1 all 10 | make report_summary &> 45.out 11 | make tidy 12 | make OMP_VERSION=5.0 CC="clang -std=c99 -fopenmp-version=50 $OFFLOAD_FLAG" \ 13 | CXX="clang++ -std=c++11 -fopenmp-version=50 $OFFLOAD_FLAG" \ 14 | FC="flang $OFFLOAD_FLAG" \ 15 | VERBOSE_TESTS=1 VERBOSE=1 LOG=1 LOG_ALL=1 all 16 | make report_summary &> 50.out 17 | cd .. 
18 | -------------------------------------------------------------------------------- /tests/target_task/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME target_nowait_task target_taskwait taskloop_offload_nowait taskloop omp-task-bug target_nowait_taskwait target_update_nowait_taskwait) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/target_task/omp-task-bug.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef _OPENMP 6 | #include 7 | #else 8 | int omp_get_thread_num() { return 0; } 9 | int omp_get_num_threads() { return 1; } 10 | int omp_get_max_threads() { return 1; } 11 | #endif 12 | 13 | template 14 | struct MyProblem 15 | { 16 | int M = 16; 17 | int N = 16; 18 | int K = 32; 19 | int Size = 0; 20 | int IP = 0; 21 | T* V = nullptr; 22 | T* W = nullptr; 23 | 24 | explicit MyProblem(int np) : Size(M * N * K / np) 25 | { 26 | M = M / np; 27 | size_t bytes = Size * sizeof(T); 28 | auto* v_ptr = (T*)aligned_alloc(64, bytes); 29 | auto* w_ptr = (T*)aligned_alloc(64, bytes); 30 | 31 | #pragma omp target enter data map(alloc : v_ptr [0:Size], w_ptr [0:Size]) 32 | 33 | V = v_ptr; 34 | W = w_ptr; 35 | 36 | #pragma omp target enter data map(to : this [0:1]) 37 | } 38 | 39 | ~MyProblem() 40 | { 41 | auto* v_ptr = V; 42 | auto* w_ptr = W; 43 | 44 | #pragma omp target exit data map(delete : this [0:1]) 45 | 46 | #pragma omp target exit data map(delete : v_ptr[:Size], w_ptr[:Size]) 
47 | 48 | free(W); 49 | free(V); 50 | } 51 | 52 | void setV(int ip) 53 | { 54 | IP = ip; 55 | std::iota(V, V + Size, T(ip * Size)); 56 | } 57 | 58 | void update() 59 | { 60 | // v_ptr and w_ptr are shared as a task is created 61 | auto* v_ptr = V; 62 | auto* w_ptr = W; 63 | #pragma omp target teams distribute collapse(2) map(always, to : v_ptr[:Size]) nowait depend(out : w_ptr[:Size]) 64 | for (int i = 0; i < M; ++i) 65 | for (int j = 0; j < N; ++j) 66 | { 67 | #pragma omp parallel for 68 | for (int k = 0; k < K; ++k) 69 | { 70 | int ijk = i * N * K + j * K + k; 71 | w_ptr[ijk] = 0.1f + v_ptr[ijk]; 72 | } 73 | } 74 | 75 | #pragma omp target update nowait depend(inout : w_ptr[:Size]) from(w_ptr[:Size]) 76 | 77 | #if defined(INPLACE_TASKWAIT) 78 | #pragma omp taskwait 79 | #endif 80 | } 81 | 82 | void write() const 83 | { 84 | std::cout << "result: " << IP << std::endl; 85 | std::cout << "V[" << 0 << "] = " << V[0] << " " << W[0] << std::endl; 86 | std::cout << "V[" << Size / 2 << "] = " << V[Size / 2] << " " << W[Size / 2] << std::endl; 87 | std::cout << "V[" << Size - 1 << "] = " << V[Size - 1] << " " << W[Size - 1] << std::endl; 88 | } 89 | }; 90 | 91 | int main(int argc, char** argv) 92 | { 93 | const int np = omp_get_max_threads(); 94 | 95 | std::vector>> problems(np * 4); 96 | 97 | #pragma omp parallel 98 | { 99 | int ip = omp_get_thread_num(); 100 | 101 | for (int iw = 0; iw < 4; iw++) 102 | { 103 | int I = ip * 4 + iw; 104 | problems[I] = std::make_unique>(np * 4); 105 | problems[I]->setV(I); 106 | } 107 | 108 | for (int iw = 0; iw < 4; iw++) 109 | { 110 | int I = ip * 4 + iw; 111 | problems[I]->update(); 112 | } 113 | } 114 | 115 | /* 116 | for(int ip=0; ipwrite(); 119 | } 120 | */ 121 | } 122 | -------------------------------------------------------------------------------- /tests/target_task/target_nowait_task.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | int a = 
0; 7 | std::cout << "outside a = " << a << " addr " << &a << std::endl; 8 | #pragma omp target map(tofrom: a) depend(out: a) nowait 9 | { 10 | int sum = 0; 11 | for (int i = 0; i < 100000; i++) 12 | sum++; 13 | a = 1; 14 | } 15 | 16 | #pragma omp task depend(in: a) shared(a) 17 | { 18 | std::cout << "a = " << a << " addr " << &a << std::endl; 19 | if (a != 1) 20 | throw std::runtime_error("wrong result!"); 21 | } 22 | 23 | #pragma omp taskwait 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /tests/target_task/target_nowait_taskwait.cpp: -------------------------------------------------------------------------------- 1 | ////////////////////////////////////////////////////////////////////////////////////// 2 | // This file is distributed under the University of Illinois/NCSA Open Source License. 3 | // See LICENSE file in top directory for details. 4 | // 5 | // Copyright (c) 2019 QMCPACK developers. 6 | // 7 | // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory 8 | // 9 | // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory 10 | ////////////////////////////////////////////////////////////////////////////////////// 11 | 12 | 13 | #include 14 | #include 15 | 16 | const int num_sections = 1; 17 | const int section_size = 100; 18 | constexpr int array_size = num_sections * section_size; 19 | 20 | int main(int argc, char** argv) 21 | { 22 | //std::vector> array(array_size, 1); 23 | std::vector array(array_size, 1); 24 | int* array_ptr = array.data(); 25 | #pragma omp target enter data map(alloc:array_ptr[:array_size]) 26 | 27 | #pragma omp target update to(array_ptr[:array_size]) 28 | #pragma omp target teams distribute parallel for map(tofrom: array_ptr[:array_size]) 29 | for (int i = 0; i < array_size; i++) 30 | { 31 | array_ptr[i] += i; 32 | } 33 | 34 | for (int offset = 0; offset < array_size; offset += section_size) 35 | { 36 | #pragma omp target update 
from(array_ptr[offset:section_size]) nowait 37 | } 38 | #pragma omp taskwait 39 | #pragma omp target exit data map(delete:array_ptr[:array_size]) 40 | 41 | if(array_ptr[4] != 5) 42 | throw std::runtime_error("array_ptr[4] check failed after update!"); 43 | if(array_ptr[94] != 95) 44 | throw std::runtime_error("array_ptr[94] check failed after update!"); 45 | } 46 | -------------------------------------------------------------------------------- /tests/target_task/target_taskwait.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | int a = 0; 7 | std::cout << "outside a = " << a << " addr " << &a << std::endl; 8 | #pragma omp target nowait 9 | { 10 | int sum = 0; 11 | for (int i = 0; i < 100000; i++) 12 | sum++; 13 | a = 1; 14 | } 15 | 16 | #pragma omp taskwait 17 | return 0; 18 | } 19 | -------------------------------------------------------------------------------- /tests/target_task/target_update_nowait_taskwait.cpp: -------------------------------------------------------------------------------- 1 | ////////////////////////////////////////////////////////////////////////////////////// 2 | // This file is distributed under the University of Illinois/NCSA Open Source License. 3 | // See LICENSE file in top directory for details. 4 | // 5 | // Copyright (c) 2023 QMCPACK developers. 
6 | // 7 | // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory 8 | // 9 | // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory 10 | ////////////////////////////////////////////////////////////////////////////////////// 11 | 12 | 13 | #include 14 | #include 15 | 16 | const int num_sections = 1; 17 | const int section_size = 100; 18 | constexpr int array_size = num_sections * section_size; 19 | 20 | int main(int argc, char** argv) 21 | { 22 | //std::vector> array(array_size, 1); 23 | std::vector array(array_size, 1); 24 | int* array_ptr = array.data(); 25 | #pragma omp target enter data map(alloc:array_ptr[:array_size]) 26 | for (int offset = 0; offset < array_size; offset += section_size) 27 | { 28 | #pragma omp target update from(array_ptr[offset:section_size]) nowait 29 | } 30 | #pragma omp taskwait 31 | #pragma omp target exit data map(delete:array_ptr[:array_size]) 32 | } 33 | -------------------------------------------------------------------------------- /tests/target_task/taskloop.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | template 7 | class RefVectorWithLeader : public std::vector> 8 | { 9 | public: 10 | RefVectorWithLeader(T& leader) : leader_(leader) {} 11 | 12 | RefVectorWithLeader(T& leader, const std::vector>& vec) : leader_(leader) 13 | { 14 | for (T& element : vec) 15 | this->push_back(element); 16 | } 17 | 18 | T& getLeader() const { return leader_; } 19 | 20 | T& operator[](size_t i) const { return std::vector>::operator[](i).get(); } 21 | 22 | template 23 | CASTTYPE& getCastedLeader() const 24 | { 25 | static_assert(std::is_const::value == std::is_const::value, "Unmatched const type qualifier!"); 26 | #ifndef NDEBUG 27 | assert(dynamic_cast(&leader_.get()) != nullptr); 28 | #endif 29 | return static_cast(leader_.get()); 30 | } 31 | 32 | template 33 | CASTTYPE& getCastedElement(size_t i) const 34 | { 35 | 
static_assert(std::is_const::value == std::is_const::value, "Unmatched const type qualifier!"); 36 | #ifndef NDEBUG 37 | assert(dynamic_cast(&(*this)[i]) != nullptr); 38 | #endif 39 | return static_cast((*this)[i]); 40 | } 41 | 42 | private: 43 | std::reference_wrapper leader_; 44 | }; 45 | 46 | class TWF 47 | { 48 | public: 49 | static void mw_accept_rejectMove(const RefVectorWithLeader& wf_list) 50 | { 51 | auto& wf_leader = wf_list.getLeader(); 52 | const int vec_size = wf_list.size(); 53 | std::cout << "vec size outside " << vec_size << " addr " << &wf_list << std::endl; 54 | #pragma omp taskloop default(shared) if(wf_leader.use_tasking) 55 | for(int i=0; i<2; i++) 56 | { 57 | std::cout << "vec size inside " << wf_list.size() << " addr " << &wf_list << std::endl; 58 | if (vec_size != wf_list.size()) 59 | throw std::runtime_error("mismatched size!"); 60 | } 61 | } 62 | 63 | private: 64 | bool use_tasking = false; 65 | }; 66 | 67 | int main() 68 | { 69 | std::vector twf(2); 70 | std::vector> twf_ref; 71 | twf_ref.push_back(twf[0]); 72 | twf_ref.push_back(twf[1]); 73 | RefVectorWithLeader twf_crowd(twf[0], twf_ref); 74 | TWF::mw_accept_rejectMove(twf_crowd); 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /tests/target_task/taskloop_offload_nowait.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | bool almost_equal(float x, float gold, float tol) { 5 | if ( std::signbit(x) != std::signbit(gold) ) 6 | { 7 | x = std::abs(gold) - std::abs(x); 8 | } 9 | return std::abs(gold) * (1-tol) <= std::abs(x) && std::abs(x) <= std::abs(gold) * (1 + tol); 10 | } 11 | 12 | int main() 13 | { 14 | const int N0 { 2 }; 15 | const int N1 { 182 }; 16 | const float expected_value { N0*N1 }; 17 | float counter_N0{}; 18 | #pragma omp target data map(tofrom: counter_N0) 19 | { 20 | #pragma omp taskloop shared(counter_N0) 21 | for (int i0 = 0 ; i0 < N0 ; 
i0++ ) 22 | { 23 | #pragma omp target teams distribute parallel for map(tofrom: counter_N0) nowait 24 | for (int i1 = 0 ; i1 < N1 ; i1++ ) 25 | { 26 | #pragma omp atomic update 27 | counter_N0 = counter_N0 + 1. ; 28 | } 29 | } 30 | } 31 | 32 | if (!almost_equal(counter_N0, expected_value, 0.1)) { 33 | std::cerr << "Expected: " << expected_value << " Got: " << counter_N0 << std::endl; 34 | std::exit(112); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/tasks/implicit_shared.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void foo(int& x) 4 | { 5 | #pragma omp task if(0) 6 | { 7 | x++; 8 | std::cout << "inside " << x << std::endl; 9 | } 10 | } 11 | 12 | int main() 13 | { 14 | std::cout << "Test task in-place" << std::endl; 15 | int x = 0; 16 | #pragma omp parallel 17 | { 18 | #pragma omp single 19 | { 20 | #pragma omp task if(0) 21 | { 22 | x++; 23 | std::cout << "inside " << x << std::endl; 24 | } 25 | } 26 | } 27 | std::cout << "outside " << x << std::endl; 28 | 29 | std::cout << "Test task in functon" << std::endl; 30 | x = 0; 31 | #pragma omp parallel 32 | { 33 | #pragma omp single 34 | { 35 | foo(x); 36 | } 37 | } 38 | std::cout << "outside " << x << std::endl; 39 | } 40 | --------------------------------------------------------------------------------