├── .clang-format ├── .github └── workflows │ └── ci-github-actions.yaml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake ├── DummyOpenMPRuntime.f90 ├── FortranHelpers.cmake ├── TestCXXOpenMPRuntime.cmake └── TestFortranOpenMPRuntime.cmake ├── hands-on ├── CMakeLists.txt ├── README.md ├── cleanup.sh ├── common │ └── timer.h ├── gemm │ ├── 0-gemmNN-serial │ │ ├── 0-gemmNN-serial.cpp │ │ └── Makefile │ ├── 0-gemmNT-serial │ │ ├── Makefile │ │ └── gemmNT-serial.cpp │ ├── 1-gemmNN-omp-thread │ │ ├── 1-gemmNN-omp-thread.cpp │ │ └── Makefile │ ├── 1-gemmNT-omp │ │ ├── Makefile │ │ └── gemmNT-omp.cpp │ ├── 2-gemmNN-omp-target │ │ ├── 2-gemmNN-omp-target.cpp │ │ └── Makefile │ ├── 2-gemmNT-omp-target │ │ ├── Makefile │ │ └── gemmNT-omp-target.cpp │ └── gemmNN │ │ ├── Makefile │ │ └── gemmNN.cpp ├── gemv │ ├── 01-gemv-omp │ │ ├── Makefile │ │ ├── gemv-omp.cpp │ │ └── gemv-omp.f90 │ ├── 02-gemv-omp-target │ │ ├── Makefile │ │ ├── gemv-omp-target.cpp │ │ └── gemv-omp-target.f90 │ ├── 03-gemv-omp-target-teams │ │ ├── Makefile │ │ ├── gemv-omp-target-teams.cpp │ │ └── gemv-omp-target-teams.f90 │ ├── 04-gemv-omp-target-reduction │ │ ├── Makefile │ │ ├── gemv-omp-target-reduction.cpp │ │ └── gemv-omp-target-reduction.f90 │ ├── 05-gemv-omp-target-split-parallel-for-reduction │ │ ├── Makefile │ │ └── gemv-omp-target-split-parallel-for-reduction.cpp │ ├── 51-gemv-omp-many-matrices │ │ ├── Makefile │ │ ├── gemv-omp-many-matrices.cpp │ │ └── gemv-omp-many-matrices.f90 │ ├── 52-gemv-omp-target-many-matrices-no-hierachy │ │ ├── Makefile │ │ ├── gemv-omp-target-many-matrices-no-hierachy.cpp │ │ └── gemv-omp-target-many-matrices-no-hierachy.f90 │ ├── 53-gemv-omp-target-many-matrices │ │ ├── Makefile │ │ ├── gemv-omp-target-many-matrices.cpp │ │ └── gemv-omp-target-many-matrices.f90 │ ├── 54-gemv-omp-target-many-matrices-multi-devices │ │ ├── Makefile │ │ └── gemv-omp-target-many-matrices-multi-devices.cpp │ ├── 55-gemv-omp-target-many-matrices-taskloop │ │ ├── Makefile │ │ └── 
gemv-omp-target-many-matrices-taskloop.cpp │ ├── CMakeLists.txt │ ├── README │ └── build_and_run_all.sh ├── make.aomp.inc ├── make.gcc-nv.inc ├── make.icx.inc ├── make.llvm.inc └── make.xl.inc ├── integration └── crusher_recipe │ ├── README │ └── modules │ └── cray-mpich-afar.lua └── tests ├── CMakeLists.txt ├── allocator ├── CMakeLists.txt └── omp_pteam_mem_alloc.cpp ├── complex ├── CMakeLists.txt ├── complex.cpp ├── complex.f90 ├── complex_reduction.cpp └── complex_reduction_cpu.cpp ├── cudafor_omp ├── README.md ├── noomp.f90 ├── omp_above.f90 └── omp_below.f90 ├── fortran_allocator ├── CMakeLists.txt ├── dualspace.f90 ├── dualspace_array_device.f90 ├── dualspace_array_device_isptr.f90 └── dualspace_array_resize.f90 ├── fortran_use_device_ptr ├── CMakeLists.txt └── use_device_ptr_target.f90 ├── global_variable ├── CMakeLists.txt ├── constexpr │ ├── CMakeLists.txt │ └── constexpr.cpp ├── global_pointer │ ├── CMakeLists.txt │ ├── Makefile │ ├── global.cpp │ ├── global.h │ └── main.cpp └── global_static │ ├── CMakeLists.txt │ ├── Makefile │ ├── data.cpp │ ├── data.hpp │ └── main.cpp ├── implict_async ├── CMakeLists.txt ├── llvm_alloc_host.cpp └── llvm_alloc_host_data.cpp ├── linking ├── CMakeLists.txt ├── link_static_fat_bin │ ├── CMakeLists.txt │ ├── classA.cpp │ ├── classA.h │ ├── compile-amd.sh │ ├── compile-x86.sh │ ├── compile.sh │ └── main.cpp ├── linker_outlined_func │ ├── CMakeLists.txt │ ├── a.cpp │ ├── ab.h │ ├── b.cpp │ ├── compile.sh │ ├── compute.h │ └── main.cpp ├── missing_bundles │ ├── boo.cpp │ ├── compile.sh │ ├── foo.cpp │ └── main.cpp └── two_identical_templates │ ├── CMakeLists.txt │ ├── main.cpp │ ├── test_a.cpp │ └── test_b.cpp ├── map ├── CMakeLists.txt ├── check_transfer.cpp ├── declare_target_global.cpp ├── first_private_this_wrong.cpp ├── implicit_map_alloc.f90 ├── map_class_member.cpp ├── map_delete_inside_data.cpp ├── map_threads.cpp ├── pointer_api.cpp ├── struct_with_const.cpp └── this_with_virtual.cpp ├── math ├── CMakeLists.txt ├── 
FP_ZERO.cpp ├── README ├── header_only.cpp ├── modf.cpp ├── modf_in_branch.cpp ├── modf_team.cpp ├── sin_cos.cpp ├── sin_simd.cpp ├── sincos.cpp ├── sincos_simd.cpp ├── sincos_simd_template.cpp └── sqrt_simd.cpp ├── omphost ├── CMakeLists.txt ├── README.md └── host_bug_libomp.cpp ├── private ├── CMakeLists.txt ├── run_all.sh ├── target__teams__distribute_private.cpp ├── target__teams_distribute_private.cpp ├── target__teams_private__distribute.cpp ├── target_local_block.f90 ├── target_teams__distribute_private.cpp ├── target_teams_distribute_parallel_for_private.cpp ├── target_teams_distribute_parallel_for_private.f90 ├── target_teams_distribute_private.cpp ├── target_teams_distribute_private.f90 └── target_teams_private__distribute.cpp ├── reduction ├── CMakeLists.txt ├── README.md └── array_reduction.cpp ├── sollve_vv └── sollve_vv_aomp.sh ├── target_task ├── CMakeLists.txt ├── omp-task-bug.cpp ├── target_nowait_task.cpp ├── target_nowait_taskwait.cpp ├── target_taskwait.cpp ├── target_update_nowait_taskwait.cpp ├── taskloop.cpp └── taskloop_offload_nowait.cpp └── tasks └── implicit_shared.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -2 4 | AlignAfterOpenBracket: Align 5 | AlignConsecutiveAssignments: true 6 | AlignConsecutiveDeclarations: false 7 | AlignEscapedNewlines: Left 8 | AlignOperands: false 9 | AlignTrailingComments: true 10 | AllowAllParametersOfDeclarationOnNextLine: false 11 | AllowShortBlocksOnASingleLine: true 12 | AllowShortCaseLabelsOnASingleLine: false 13 | AllowShortFunctionsOnASingleLine: All 14 | AllowShortIfStatementsOnASingleLine: false 15 | AllowShortLoopsOnASingleLine: false 16 | AlwaysBreakAfterDefinitionReturnType: None 17 | AlwaysBreakAfterReturnType: None 18 | AlwaysBreakBeforeMultilineStrings: false 19 | AlwaysBreakTemplateDeclarations: true 20 | BinPackArguments: true 21 | BinPackParameters: false 22 | BraceWrapping: 23 | 
AfterClass: true 24 | AfterControlStatement: true 25 | AfterEnum: true 26 | AfterFunction: true 27 | AfterNamespace: true 28 | AfterObjCDeclaration: true 29 | AfterStruct: true 30 | AfterUnion: true 31 | AfterExternBlock: true 32 | BeforeCatch: true 33 | BeforeElse: true 34 | IndentBraces: false 35 | SplitEmptyFunction: false 36 | SplitEmptyRecord: false 37 | SplitEmptyNamespace: false 38 | BreakBeforeBinaryOperators: None 39 | BreakBeforeBraces: Custom 40 | BreakBeforeInheritanceComma: false 41 | BreakBeforeTernaryOperators: true 42 | BreakConstructorInitializersBeforeComma: false 43 | BreakConstructorInitializers: BeforeColon 44 | BreakAfterJavaFieldAnnotations: false 45 | BreakStringLiterals: true 46 | ColumnLimit: 120 47 | CommentPragmas: '^ IWYU pragma:' 48 | CompactNamespaces: false 49 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 50 | ConstructorInitializerIndentWidth: 4 51 | ContinuationIndentWidth: 4 52 | Cpp11BracedListStyle: true 53 | DerivePointerAlignment: false 54 | DisableFormat: false 55 | ExperimentalAutoDetectBinPacking: false 56 | FixNamespaceComments: true 57 | ForEachMacros: 58 | - foreach 59 | - Q_FOREACH 60 | - BOOST_FOREACH 61 | IncludeBlocks: Preserve 62 | IncludeCategories: 63 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 64 | Priority: 2 65 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 66 | Priority: 3 67 | - Regex: '.*' 68 | Priority: 1 69 | IncludeIsMainRegex: '(Test)?$' 70 | IndentCaseLabels: false 71 | IndentPPDirectives: None 72 | IndentWidth: 2 73 | IndentWrappedFunctionNames: true 74 | JavaScriptQuotes: Leave 75 | JavaScriptWrapImports: true 76 | KeepEmptyLinesAtTheStartOfBlocks: false 77 | MacroBlockBegin: '' 78 | MacroBlockEnd: '' 79 | MaxEmptyLinesToKeep: 2 80 | NamespaceIndentation: None 81 | ObjCBlockIndentWidth: 2 82 | ObjCSpaceAfterProperty: false 83 | ObjCSpaceBeforeProtocolList: true 84 | PenaltyBreakAssignment: 2 85 | PenaltyBreakBeforeFirstCallParameter: 30000 86 | PenaltyBreakComment: 300 87 | 
PenaltyBreakFirstLessLess: 120 88 | PenaltyBreakString: 1000 89 | PenaltyExcessCharacter: 1000000 90 | PenaltyReturnTypeOnItsOwnLine: 10000 91 | PointerAlignment: Left 92 | ReflowComments: false 93 | SortIncludes: false 94 | SortUsingDeclarations: true 95 | SpaceAfterCStyleCast: false 96 | SpaceAfterTemplateKeyword: false 97 | SpaceBeforeAssignmentOperators: true 98 | SpaceBeforeParens: ControlStatements 99 | SpaceInEmptyParentheses: false 100 | SpacesBeforeTrailingComments: 1 101 | SpacesInAngles: false 102 | SpacesInContainerLiterals: true 103 | SpaceBeforeCtorInitializerColon: true 104 | SpaceBeforeInheritanceColon: true 105 | SpaceBeforeRangeBasedForLoopColon: true 106 | SpaceInEmptyParentheses: false 107 | SpacesInCStyleCastParentheses: false 108 | SpacesInParentheses: false 109 | SpacesInSquareBrackets: false 110 | Standard: Cpp11 111 | TabWidth: 8 112 | UseTab: Never 113 | ... 114 | 115 | -------------------------------------------------------------------------------- /.github/workflows/ci-github-actions.yaml: -------------------------------------------------------------------------------- 1 | name: GitHub Actions CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | 13 | linux: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | 18 | steps: 19 | - name: Checkout Action 20 | uses: actions/checkout@v3 21 | 22 | - name: Setup Dependencies 23 | run: | 24 | sudo apt install g++-10 libgomp1 25 | 26 | - name: Configure 27 | run: | 28 | mkdir build_gcc_noomp; cd build_gcc_noomp 29 | cmake -DCMAKE_CXX_COMPILER=g++-10 -DCMAKE_Fortran_COMPILER=gfortran-10 .. 
30 | 31 | - name: Build 32 | run: cd build_gcc_noomp; make -j2 -k 33 | 34 | - name: Test 35 | run: cd build_gcc_noomp; ctest --output-on-failure 36 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | 3 | # set the project name 4 | project(openmp-target LANGUAGES NONE) 5 | 6 | OPTION(ENABLE_CXX "Enable/disable C++ tests" ON) 7 | OPTION(ENABLE_Fortran "Enable/disable Fortran tests" ON) 8 | OPTION(ENABLE_EXPERIMENTAL "Enable/disable experimental tests" OFF) 9 | 10 | if (NOT CMAKE_BUILD_TYPE) 11 | set(CMAKE_BUILD_TYPE Release) 12 | endif() 13 | 14 | if (ENABLE_CXX) 15 | enable_language(CXX) 16 | # requires C++14 standard 17 | set(CMAKE_CXX_STANDARD 14) 18 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 19 | set(CMAKE_CXX_EXTENSIONS OFF) 20 | include(cmake/TestCXXOpenMPRuntime.cmake) 21 | endif() 22 | 23 | if (ENABLE_Fortran) 24 | enable_language(Fortran) 25 | include(cmake/FortranHelpers.cmake) 26 | include(cmake/TestFortranOpenMPRuntime.cmake) 27 | endif() 28 | 29 | enable_testing() 30 | 31 | if (CMAKE_CXX_COMPILER_ID MATCHES "PGI" OR CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") 32 | add_definitions(-D__NO_UDR) 33 | endif() 34 | 35 | add_subdirectory(hands-on) 36 | add_subdirectory(tests) 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The 3-Clause BSD License 2 | Copyright (c) 2019-2021 Ye Luo. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
7 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A collection of OpenMP tests for C++ and Fortran compilers 2 | 3 | Recipe example 4 | ``` 5 | mkdir build_gcc_omp 6 | cd build_gcc_omp 7 | cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_Fortran_COMPILER=gfortran \ 8 | -DCMAKE_CXX_FLAGS=-fopenmp -DCMAKE_Fortran_FLAGS=-fopenmp \ 9 | .. 
10 | make -k -j 16 11 | ctest 12 | ``` 13 | 14 | ### 15 | OpenMP offload compiler options 16 | https://github.com/ye-luo/openmp-target/wiki/OpenMP-offload-compilers 17 | -------------------------------------------------------------------------------- /cmake/DummyOpenMPRuntime.f90: -------------------------------------------------------------------------------- 1 | module omp_lib 2 | contains 3 | function omp_get_num_threads() 4 | implicit none 5 | integer omp_get_num_threads 6 | omp_get_num_threads = 1 7 | end function omp_get_num_threads 8 | 9 | function omp_get_num_teams() 10 | implicit none 11 | integer omp_get_num_teams 12 | omp_get_num_teams = 1 13 | end function omp_get_num_teams 14 | 15 | function omp_get_thread_num() 16 | implicit none 17 | integer omp_get_thread_num 18 | omp_get_thread_num = 0 19 | end function omp_get_thread_num 20 | 21 | function omp_get_team_num() 22 | implicit none 23 | integer omp_get_team_num 24 | omp_get_team_num = 0 25 | end function omp_get_team_num 26 | end module 27 | -------------------------------------------------------------------------------- /cmake/FortranHelpers.cmake: -------------------------------------------------------------------------------- 1 | function(fix_fortran_modules TGT) 2 | set(targets ${TGT} ${ARGN}) 3 | foreach(tgt IN LISTS targets) 4 | get_target_property(tgt_type ${tgt} TYPE) 5 | # All of the following target modifications make 6 | # sense on non-interfaces only 7 | if(NOT ${tgt_type} STREQUAL "INTERFACE_LIBRARY") 8 | get_target_property(tgt_module_dir ${tgt} Fortran_MODULE_DIRECTORY) 9 | # set module path to tgt_binary_dir/mod 10 | get_target_property(tgt_binary_dir ${tgt} BINARY_DIR) 11 | set_target_properties(${tgt} 12 | PROPERTIES 13 | Fortran_MODULE_DIRECTORY ${tgt_binary_dir}/mod/${TGT}) 14 | # make module directory available for clients of TGT 15 | target_include_directories(${tgt} 16 | PUBLIC 17 | $ 18 | INTERFACE 19 | $) 20 | endif() 21 | endforeach() 22 | endfunction(fix_fortran_modules) 23 | 
-------------------------------------------------------------------------------- /cmake/TestCXXOpenMPRuntime.cmake: -------------------------------------------------------------------------------- 1 | include(CheckCXXSourceCompiles) 2 | 3 | list(PREPEND OPENMP_CXX_COMPILE_OPTIONS ${OPENMP_CXX_FLAGS} ${OPENMP_FLAGS}) 4 | list(PREPEND OPENMP_CXX_LINK_OPTIONS ${OPENMP_CXX_FLAGS} ${OPENMP_FLAGS}) 5 | 6 | set(CMAKE_REQUIRED_FLAGS ${OPENMP_CXX_COMPILE_OPTIONS}) 7 | #set(CMAKE_REQUIRED_LINK_OPTIONS ${OPENMP_CXX_LINK_OPTIONS}) 8 | 9 | check_cxx_source_compiles( 10 | "#include <omp.h> 11 | int main() 12 | { int a = omp_get_num_threads(); }" 13 | CXX_OPENMP_RUNTIME_OKAY 14 | ) 15 | 16 | add_library(qmc_openmp_cxx INTERFACE) 17 | 18 | # Attach the OpenMP options to the interface target only when the runtime check passes. 19 | if (CXX_OPENMP_RUNTIME_OKAY) 20 | message(STATUS "C++ OpenMP functionality check pass") 21 | target_compile_options(qmc_openmp_cxx INTERFACE "${OPENMP_CXX_COMPILE_OPTIONS}") 22 | target_link_options(qmc_openmp_cxx INTERFACE "${OPENMP_CXX_LINK_OPTIONS}") 23 | else() 24 | message(STATUS "C++ OpenMP functionality check failed!") 25 | endif() 26 | 27 | check_cxx_source_compiles( 28 | "#include <omp.h> 29 | int main() 30 | { int a = omp_target_is_present(nullptr, 0); }" 31 | CXX_OFFLOAD_RUNTIME_OKAY 32 | ) 33 | 34 | if (CXX_OFFLOAD_RUNTIME_OKAY) 35 | message(STATUS "CXX compiler has OpenMP offload runtime library.") 36 | else() 37 | message(STATUS "CXX compiler doesn't have OpenMP offload runtime library.") 38 | endif() 39 | -------------------------------------------------------------------------------- /cmake/TestFortranOpenMPRuntime.cmake: -------------------------------------------------------------------------------- 1 | set(TEST_OPENMP_RUNTIME_SOURCE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/try_openmp_runtime.f90) 2 | file(WRITE ${TEST_OPENMP_RUNTIME_SOURCE} 3 | "program test_open_runtime 4 | use omp_lib 5 | implicit none 6 | integer :: num 7 | num = omp_get_thread_num() 8 | end program 9 | ") 10 | 11 | 12 | try_compile(Fortran_OPENMP_RUNTIME_OKAY
${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp 13 | ${TEST_OPENMP_RUNTIME_SOURCE} 14 | OUTPUT_VARIABLE COMPILE_OUTPUT) 15 | 16 | if (NOT Fortran_OPENMP_RUNTIME_OKAY) 17 | set(COMPILE_FAIL_OUTPUT fortran_openmp_runtime_compile_fail.txt) 18 | file(WRITE "${CMAKE_BINARY_DIR}/${COMPILE_FAIL_OUTPUT}" "${COMPILE_OUTPUT}") 19 | message(STATUS "Fortran OpenMP functionality check failed!" 20 | "See compiler output at ${COMPILE_FAIL_OUTPUT}") 21 | add_library(dummy_openmp_runtime cmake/DummyOpenMPRuntime.f90) 22 | fix_fortran_modules(dummy_openmp_runtime) 23 | else() 24 | add_library(dummy_openmp_runtime INTERFACE) 25 | message(STATUS "Fortran OpenMP functionality check pass") 26 | endif() 27 | -------------------------------------------------------------------------------- /hands-on/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(gemv) 2 | -------------------------------------------------------------------------------- /hands-on/README.md: -------------------------------------------------------------------------------- 1 | Before building any executable, create make.inc under hands-on based on your machine. 2 | Use one of the provided templates (e.g. make.llvm.inc) as an example.
3 | -------------------------------------------------------------------------------- /hands-on/cleanup.sh: -------------------------------------------------------------------------------- 1 | find -name "*.x" -exec rm {} \; 2 | find -name "fetchnode.*" -exec rm {} \; 3 | -------------------------------------------------------------------------------- /hands-on/common/timer.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | class Timer 6 | { 7 | const std::chrono::time_point start; 8 | const std::string name; 9 | 10 | public: 11 | Timer(const std::string& name_in): start(std::chrono::system_clock::now()), name(name_in) {}; 12 | ~Timer() 13 | { 14 | auto end = std::chrono::system_clock::now(); 15 | std::cout << "Function " << name 16 | << " takes " << std::chrono::duration_cast>(end - start).count() 17 | << " us" << std::endl; 18 | } 19 | }; 20 | -------------------------------------------------------------------------------- /hands-on/gemm/0-gemmNN-serial/0-gemmNN-serial.cpp: -------------------------------------------------------------------------------- 1 | #define N 3000 2 | #include "timer.h" 3 | 4 | /* 5 | Multiplies two matrices of dimension n x n and passes back resulting matrix. 6 | */ 7 | template 8 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ B, T* __restrict__ result) 9 | { 10 | for (int row = 0; row < n; row++) 11 | for (int col = 0; col < n; col++) 12 | { 13 | const T* __restrict__ A_row = A + row * n; 14 | T sum(0); 15 | const T* __restrict__ B_col = B + col; 16 | for(int i = 0; i < n; i++) 17 | { 18 | sum += A_row[i] * B_col[i * n]; 19 | } 20 | const int index = (row * n) + col; 21 | result[index] = sum * alpha; 22 | } 23 | } 24 | 25 | /* 26 | Prints 1 dimensional matrix of dimension n x n. 
27 | */ 28 | template 29 | void printMatrix(int n, T* __restrict__ A) 30 | { 31 | for(int i = 0; i < n * n; i++) 32 | std::cout << A[i]; 33 | } 34 | 35 | /* 36 | Creates 1 dimensional matrix of size n and fills with T(1). 37 | */ 38 | template 39 | T* allocate(size_t n) 40 | { 41 | T* ptr = new T[n]; 42 | std::fill_n(ptr, n, T(1)); 43 | return ptr; 44 | } 45 | 46 | /* 47 | Frees up space from 1 dimensional matrix. 48 | */ 49 | template 50 | void deallocate(T* ptr, size_t n) 51 | { 52 | delete[] ptr; 53 | } 54 | 55 | void testtbt() 56 | { 57 | std::cout << "Testing 3x3 matrix multiplication.\n"; 58 | int dim = 3; 59 | auto* C = allocate(dim * dim); 60 | auto* D = allocate(dim * dim); 61 | auto* R = allocate(dim * dim); 62 | std::cout << "Result calculated by hand: 010202010\n"; 63 | for(int i = 0; i < dim * dim; i++) 64 | { 65 | if( i % 2 == 0) 66 | { 67 | C[i] = 0; 68 | D[i] = 1; 69 | } else 70 | { 71 | C[i] = 1; 72 | D[i] = 0; 73 | } 74 | } 75 | 76 | gemv(dim, 1.0f, C, D, R); 77 | 78 | std::cout << "Matrix C: "; printMatrix(dim, C); std::cout << "\n"; 79 | std::cout << "Matrix D: "; printMatrix(dim, D); std::cout << "\n"; 80 | std::cout << "Matrix R: "; printMatrix(dim, R); std::cout << "\n"; 81 | 82 | deallocate(C, dim * dim); 83 | deallocate(D, dim * dim); 84 | deallocate(R, dim * dim); 85 | } 86 | 87 | int main() 88 | { 89 | auto* A = allocate(N * N); 90 | auto* B = allocate(N * N); 91 | auto* result = allocate(N * N); 92 | 93 | // Debugging 94 | //std::cout << "Matrix A: "; printMatrix(N, A); std::cout << "\n"; 95 | //std::cout << "Matrix B: "; printMatrix(N, B); std::cout << "\n"; 96 | 97 | Timer local("GEMV"); 98 | gemv(N, 1.0f, A, B, result); 99 | 100 | // testtbt(); 101 | 102 | // Debugging 103 | //std::cout << "Matrix Result: "; printMatrix(N, result); std::cout << "\n"; 104 | 105 | deallocate(A, N * N); 106 | deallocate(B, N * N); 107 | deallocate(result, N * N); 108 | 109 | } 110 | 
-------------------------------------------------------------------------------- /hands-on/gemm/0-gemmNN-serial/Makefile: -------------------------------------------------------------------------------- 1 | name=0-gemmNN-serial 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvida-cuda -Xopenmp-target -march=sm_60 -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | -------------------------------------------------------------------------------- /hands-on/gemm/0-gemmNT-serial/Makefile: -------------------------------------------------------------------------------- 1 | name=gemmNT-serial 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -march=native -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | -------------------------------------------------------------------------------- /hands-on/gemm/0-gemmNT-serial/gemmNT-serial.cpp: -------------------------------------------------------------------------------- 1 | #define N 1024 2 | #include "timer.h" 3 | 4 | template 5 | void gemmT(int n, const T* restrict A, const T* restrict B, T* restrict C) 6 | { 7 | for(int rowA=0; rowA 20 | T* allocate(int n) 21 | { 22 | T* ptr = new T[n]; 23 | std::fill_n(ptr, n, T(1)); 24 | return ptr; 25 | } 26 | 27 | template 28 | void deallocate(T* ptr, int n) 29 | { 30 | delete[] ptr; 31 | } 32 | 33 | int main() 34 | { 35 | auto* A = allocate(N*N); 36 | auto* B = allocate(N*N); 37 | auto* C = allocate(N*N); 38 | 39 | { 40 | Timer local("GEMMT"); 41 | gemmT(N, A, B, C); 42 | } 43 | 44 | deallocate(A, N*N); 45 | deallocate(B, N*N); 46 | deallocate(C, N*N); 47 | } 48 | -------------------------------------------------------------------------------- /hands-on/gemm/1-gemmNN-omp-thread/1-gemmNN-omp-thread.cpp: -------------------------------------------------------------------------------- 1 | #define N 3000 2 | #include "timer.h" 3 | 4 | /* 5 | 
Multiplies two matrices of dimension n x n and passes back resulting matrix. 6 | */ 7 | template 8 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ B, T* __restrict__ result) 9 | { 10 | #pragma omp parallel for collapse(2) 11 | // target works with teams and map to offload data to GPU 12 | // teams distribute breaks execution of loops into teams of threads 13 | // map:to offloads data to GPU 14 | // map:from writes date from GPU to devide 15 | for (int row = 0; row < n; row++) 16 | for (int col = 0; col < n; col++) 17 | { 18 | const T* __restrict__ A_row = A + row * n; 19 | T sum(0); 20 | const T* __restrict__ B_col = B + col; 21 | // can move pragma for here 22 | for(int i = 0; i < n; i++) 23 | { 24 | sum += A_row[i] * B_col[i * n]; 25 | } 26 | const int index = (row * n) + col; 27 | result[index] = sum * alpha; 28 | } 29 | } 30 | 31 | /* 32 | Prints 1 dimensional matrix of dimension n x n. 33 | */ 34 | template 35 | void printMatrix(int n, T* __restrict__ A) 36 | { 37 | for(int i = 0; i < n * n; i++) 38 | std::cout << A[i]; 39 | } 40 | 41 | /* 42 | Creates 1 dimensional matrix of size n and fills with T(1). 43 | */ 44 | template 45 | T* allocate(size_t n) 46 | { 47 | T* ptr = new T[n]; 48 | std::fill_n(ptr, n, T(1)); 49 | return ptr; 50 | } 51 | 52 | /* 53 | Frees up space from 1 dimensional matrix. 
54 | */ 55 | template 56 | void deallocate(T* ptr, size_t n) 57 | { 58 | delete[] ptr; 59 | } 60 | 61 | void testtbt() 62 | { 63 | std::cout << "Testing 3x3 matrix multiplication.\n"; 64 | int dim = 3; 65 | auto* C = allocate(dim * dim); 66 | auto* D = allocate(dim * dim); 67 | auto* R = allocate(dim * dim); 68 | std::cout << "Result calculated by hand: 010202010\n"; 69 | for(int i = 0; i < dim * dim; i++) 70 | { 71 | if( i % 2 == 0) 72 | { 73 | C[i] = 0; 74 | D[i] = 1; 75 | } else 76 | { 77 | C[i] = 1; 78 | D[i] = 0; 79 | } 80 | } 81 | 82 | gemv(dim, 1.0f, C, D, R); 83 | 84 | std::cout << "Matrix C: "; printMatrix(dim, C); std::cout << "\n"; 85 | std::cout << "Matrix D: "; printMatrix(dim, D); std::cout << "\n"; 86 | std::cout << "Matrix R: "; printMatrix(dim, R); std::cout << "\n"; 87 | 88 | deallocate(C, dim * dim); 89 | deallocate(D, dim * dim); 90 | deallocate(R, dim * dim); 91 | } 92 | 93 | int main() 94 | { 95 | auto* A = allocate(N * N); 96 | auto* B = allocate(N * N); 97 | auto* result = allocate(N * N); 98 | 99 | // Debugging 100 | //std::cout << "Matrix A: "; printMatrix(N, A); std::cout << "\n"; 101 | //std::cout << "Matrix B: "; printMatrix(N, B); std::cout << "\n"; 102 | 103 | Timer local("GEMV"); 104 | gemv(N, 1.0f, A, B, result); 105 | 106 | // testtbt(); 107 | 108 | // Debugging 109 | //std::cout << "Matrix Result: "; printMatrix(N, result); std::cout << "\n"; 110 | 111 | deallocate(A, N * N); 112 | deallocate(B, N * N); 113 | deallocate(result, N * N); 114 | 115 | } 116 | -------------------------------------------------------------------------------- /hands-on/gemm/1-gemmNN-omp-thread/Makefile: -------------------------------------------------------------------------------- 1 | name=1-gemmNN-omp-thread 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvida-cuda -Xopenmp-target -march=sm_60 -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | 
-------------------------------------------------------------------------------- /hands-on/gemm/1-gemmNT-omp/Makefile: -------------------------------------------------------------------------------- 1 | name=gemmNT-omp 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -march=native -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | -------------------------------------------------------------------------------- /hands-on/gemm/1-gemmNT-omp/gemmNT-omp.cpp: -------------------------------------------------------------------------------- 1 | #define N 1024 2 | #include "timer.h" 3 | 4 | template 5 | void gemmT(int n, const T* restrict A, const T* restrict B, T* restrict C) 6 | { 7 | #pragma omp parallel for 8 | for(int rowA=0; rowA 21 | T* allocate(int n) 22 | { 23 | T* ptr = new T[n]; 24 | std::fill_n(ptr, n, T(1)); 25 | return ptr; 26 | } 27 | 28 | template 29 | void deallocate(T* ptr, int n) 30 | { 31 | delete[] ptr; 32 | } 33 | 34 | int main() 35 | { 36 | auto* A = allocate(N*N); 37 | auto* B = allocate(N*N); 38 | auto* C = allocate(N*N); 39 | 40 | { 41 | Timer local("GEMMT"); 42 | gemmT(N, A, B, C); 43 | } 44 | 45 | deallocate(A, N*N); 46 | deallocate(B, N*N); 47 | deallocate(C, N*N); 48 | } 49 | -------------------------------------------------------------------------------- /hands-on/gemm/2-gemmNN-omp-target/2-gemmNN-omp-target.cpp: -------------------------------------------------------------------------------- 1 | #define N 3000 2 | #include "timer.h" 3 | 4 | /* 5 | Multiplies two matrices of dimension n x n and passes back resulting matrix. 
6 | */ 7 | template 8 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ B, T* __restrict__ result) 9 | { 10 | #pragma omp target teams distribute collapse(2) map(to:A[:n*n], B[:n*n]) map(from:result[:n*n]) 11 | // target works with teams and map to offload data to GPU 12 | // teams distribute breaks execution of loops into teams of threads 13 | // map:to offloads data to GPU 14 | // map:from writes date from GPU to devide 15 | for (int row = 0; row < n; row++) 16 | for (int col = 0; col < n; col++) 17 | { 18 | const T* __restrict__ A_row = A + row * n; 19 | T sum(0); 20 | const T* __restrict__ B_col = B + col; 21 | // can move pragma for here 22 | #pragma omp parallel for reduction(+:sum) 23 | for(int i = 0; i < n; i++) 24 | { 25 | sum += A_row[i] * B_col[i * n]; 26 | } 27 | const int index = (row * n) + col; 28 | result[index] = sum * alpha; 29 | } 30 | } 31 | 32 | /* 33 | Prints 1 dimensional matrix of dimension n x n. 34 | */ 35 | template 36 | void printMatrix(int n, T* __restrict__ A) 37 | { 38 | for(int i = 0; i < n * n; i++) 39 | std::cout << A[i]; 40 | } 41 | 42 | /* 43 | Creates 1 dimensional matrix of size n and fills with T(1). 44 | */ 45 | template 46 | T* allocate(size_t n) 47 | { 48 | T* ptr = new T[n]; 49 | std::fill_n(ptr, n, T(1)); 50 | return ptr; 51 | } 52 | 53 | /* 54 | Frees up space from 1 dimensional matrix. 
55 | */ 56 | template 57 | void deallocate(T* ptr, size_t n) 58 | { 59 | delete[] ptr; 60 | } 61 | 62 | void testtbt() 63 | { 64 | std::cout << "Testing 3x3 matrix multiplication.\n"; 65 | int dim = 3; 66 | auto* C = allocate(dim * dim); 67 | auto* D = allocate(dim * dim); 68 | auto* R = allocate(dim * dim); 69 | std::cout << "Result calculated by hand: 010202010\n"; 70 | for(int i = 0; i < dim * dim; i++) 71 | { 72 | if( i % 2 == 0) 73 | { 74 | C[i] = 0; 75 | D[i] = 1; 76 | } else 77 | { 78 | C[i] = 1; 79 | D[i] = 0; 80 | } 81 | } 82 | 83 | gemv(dim, 1.0f, C, D, R); 84 | 85 | std::cout << "Matrix C: "; printMatrix(dim, C); std::cout << "\n"; 86 | std::cout << "Matrix D: "; printMatrix(dim, D); std::cout << "\n"; 87 | std::cout << "Matrix R: "; printMatrix(dim, R); std::cout << "\n"; 88 | 89 | deallocate(C, dim * dim); 90 | deallocate(D, dim * dim); 91 | deallocate(R, dim * dim); 92 | } 93 | 94 | int main() 95 | { 96 | auto* A = allocate(N * N); 97 | auto* B = allocate(N * N); 98 | auto* result = allocate(N * N); 99 | 100 | // Debugging 101 | //std::cout << "Matrix A: "; printMatrix(N, A); std::cout << "\n"; 102 | //std::cout << "Matrix B: "; printMatrix(N, B); std::cout << "\n"; 103 | 104 | Timer local("GEMV"); 105 | gemv(N, 1.0f, A, B, result); 106 | 107 | // testtbt(); 108 | 109 | // Debugging 110 | //std::cout << "Matrix Result: "; printMatrix(N, result); std::cout << "\n"; 111 | 112 | deallocate(A, N * N); 113 | deallocate(B, N * N); 114 | deallocate(result, N * N); 115 | 116 | } 117 | -------------------------------------------------------------------------------- /hands-on/gemm/2-gemmNN-omp-target/Makefile: -------------------------------------------------------------------------------- 1 | name=2-gemmNN-omp-target 2 | ${name}.x: ${name}.cpp 3 | clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvida-cuda -Xopenmp-target -march=sm_60 -O3 -g -o $@ -I ../../common $< 4 | 5 | .PHONY : clean 6 | clean : 7 | rm -f *.x 8 | 
--------------------------------------------------------------------------------
/hands-on/gemm/2-gemmNT-omp-target/Makefile:
--------------------------------------------------------------------------------
1 | name=gemmNT-omp-target
2 | ${name}.x: ${name}.cpp
3 | 	clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -O3 -g -o $@ -I ../../common $<
4 | 
5 | .PHONY : clean
6 | clean :
7 | 	rm -f *.x
8 | 
--------------------------------------------------------------------------------
/hands-on/gemm/2-gemmNT-omp-target/gemmNT-omp-target.cpp:
--------------------------------------------------------------------------------
1 | #define N 1024
2 | #include "timer.h"
3 | 
4 | template<typename T>
5 | void gemmT(int n, const T* restrict A, const T* restrict B, T* restrict C)
6 | {
7 | #pragma omp target teams distribute parallel for collapse(2) map(to:A[:n*n], B[:n*n]) map(tofrom:C[:n*n])
8 | for(int rowA=0; rowA<n; rowA++)
9 | for(int rowB=0; rowB<n; rowB++)
10 | {
11 | T sum(0);
12 | for(int i=0; i<n; i++)
13 | sum += A[rowA*n+i]*B[rowB*n+i];
14 | C[rowA*n+rowB] = sum;
15 | }
16 | }
17 | 
18 | // NOTE(review): lines 8-19 of this file were lost to angle-bracket stripping in the
19 | // dump; the NT (B accessed by row, i.e. transposed) kernel above is a reconstruction — verify upstream.
20 | template<typename T>
21 | T* allocate(int n)
22 | {
23 | T* ptr = new T[n];
24 | std::fill_n(ptr, n, T(1));
25 | #pragma omp target enter data map(to:ptr[:n])
26 | return ptr;
27 | }
28 | 
29 | template<typename T>
30 | void deallocate(T* ptr, int n)
31 | {
32 | #pragma omp target exit data map(delete:ptr[:n])
33 | delete[] ptr;
34 | }
35 | 
36 | int main()
37 | {
38 | auto* A = allocate<float>(N*N);
39 | auto* B = allocate<float>(N*N);
40 | auto* C = allocate<float>(N*N);
41 | 
42 | {
43 | Timer local("GEMMT");
44 | gemmT(N, A, B, C);
45 | }
46 | 
47 | deallocate(A, N*N);
48 | deallocate(B, N*N);
49 | deallocate(C, N*N);
50 | }
51 | 
--------------------------------------------------------------------------------
/hands-on/gemm/gemmNN/Makefile:
--------------------------------------------------------------------------------
1 | name=gemmNN
2 | ${name}.x: ${name}.cpp
3 | 	clang++ -std=c++11 -Drestrict=__restrict__ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 -O3 -g -o $@ -I ../../common $<
4 | 
5 | .PHONY : clean
6 | clean :
7 | 	rm -f *.x
8 | 
--------------------------------------------------------------------------------
/hands-on/gemm/gemmNN/gemmNN.cpp:
--------------------------------------------------------------------------------
1 | #define N 1000
2 | #include "timer.h"
3 | 
4 | /*
5 | Multiplies two matrices of dimension n x n and passes back resulting matrix.
6 | */
7 | template<typename T>
8 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ B, T* __restrict__ result)
9 | {
10 | #pragma omp target teams distribute collapse(2) map(to:A[:n*n], B[:n*n]) map(from:result[:n*n])
11 | // target works with teams and map to offload data to GPU
12 | // teams distribute breaks execution of loops into teams of threads
13 | // map:to offloads data to GPU
14 | // map:from writes data from the GPU back to the host
15 | for (int row = 0; row < n; row++)
16 | for (int col = 0; col < n; col++)
17 | {
18 | const T* __restrict__ A_row = A + row * n;
19 | T sum(0);
20 | const T* __restrict__ B_col = B + col;
21 | // can move pragma for here
22 | #pragma omp parallel for reduction(+:sum)
23 | for(int i = 0; i < n; i++)
24 | {
25 | sum += A_row[i] * B_col[i * n];
26 | }
27 | const int index = (row * n) + col;
28 | result[index] = sum * alpha;
29 | }
30 | }
31 | 
32 | /*
33 | Prints 1 dimensional matrix of dimension n x n.
34 | */
35 | template<typename T>
36 | void printMatrix(int n, T* __restrict__ A)
37 | {
38 | for(int i = 0; i < n * n; i++)
39 | std::cout << A[i];
40 | }
41 | 
42 | /*
43 | Creates 1 dimensional matrix of size n and fills with T(1).
44 | */
45 | template<typename T>
46 | T* allocate(size_t n)
47 | {
48 | T* ptr = new T[n];
49 | std::fill_n(ptr, n, T(1));
50 | return ptr;
51 | }
52 | 
53 | /*
54 | Frees up space from 1 dimensional matrix.
55 | */
56 | template<typename T>
57 | void deallocate(T* ptr, size_t n)
58 | {
59 | delete[] ptr;
60 | }
61 | 
62 | void testtbt()
63 | {
64 | std::cout << "Testing 3x3 matrix multiplication.\n";
65 | int dim = 3;
66 | auto* C = allocate<float>(dim * dim);
67 | auto* D = allocate<float>(dim * dim);
68 | auto* R = allocate<float>(dim * dim);
69 | std::cout << "Result calculated by hand: 010202010\n";
70 | for(int i = 0; i < dim * dim; i++)
71 | {
72 | if( i % 2 == 0)
73 | {
74 | C[i] = 0;
75 | D[i] = 1;
76 | } else
77 | {
78 | C[i] = 1;
79 | D[i] = 0;
80 | }
81 | }
82 | 
83 | gemv(dim, 1.0f, C, D, R);
84 | 
85 | std::cout << "Matrix C: "; printMatrix(dim, C); std::cout << "\n";
86 | std::cout << "Matrix D: "; printMatrix(dim, D); std::cout << "\n";
87 | std::cout << "Matrix R: "; printMatrix(dim, R); std::cout << "\n";
88 | 
89 | deallocate(C, dim * dim);
90 | deallocate(D, dim * dim);
91 | deallocate(R, dim * dim);
92 | }
93 | 
94 | int main()
95 | {
96 | auto* A = allocate<float>(N * N);
97 | auto* B = allocate<float>(N * N);
98 | auto* result = allocate<float>(N * N);
99 | 
100 | // Debugging
101 | //std::cout << "Matrix A: "; printMatrix(N, A); std::cout << "\n";
102 | //std::cout << "Matrix B: "; printMatrix(N, B); std::cout << "\n";
103 | 
104 | Timer local("GEMV");
105 | gemv(N, 1.0f, A, B, result);
106 | 
107 | // testtbt();
108 | 
109 | // Debugging
110 | //std::cout << "Matrix Result: "; printMatrix(N, result); std::cout << "\n";
111 | 
112 | deallocate(A, N * N);
113 | deallocate(B, N * N);
114 | deallocate(result, N * N);
115 | 
116 | }
117 | 
--------------------------------------------------------------------------------
/hands-on/gemv/01-gemv-omp/Makefile:
--------------------------------------------------------------------------------
1 | include ../../make.inc
2 | 
3 | name=gemv-omp
4 | 
5 | all_targets=
6 | ifdef ENABLE_C
7 | all_targets += ${name}.c.x
8 | endif
9 | ifdef ENABLE_CXX
10 | all_targets += ${name}.cpp.x
11 | endif
12 | ifdef ENABLE_FC
13 | all_targets += ${name}.f.x
14 | endif
15 | 
16 | all:
${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/01-gemv-omp/gemv-omp.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp parallel for 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | for (int col = 0; col < n; col++) 13 | sum += A_row[col] * V[col]; 14 | Vout[row] = sum * alpha; 15 | } 16 | } 17 | 18 | template 19 | T* allocate(size_t n) 20 | { 21 | T* ptr = new T[n]; 22 | std::fill_n(ptr, n, T(1)); 23 | return ptr; 24 | } 25 | 26 | template 27 | void deallocate(T* ptr, size_t n) 28 | { 29 | delete[] ptr; 30 | } 31 | 32 | int main() 33 | { 34 | auto* A = allocate(N * N); 35 | auto* V = allocate(N); 36 | auto* Vout = allocate(N); 37 | 38 | { 39 | Timer local("GEMV"); 40 | gemv(N, 1.0f, A, V, Vout); 41 | } 42 | 43 | for (int i = 0; i < N; i++) 44 | if (Vout[i] != N) 45 | { 46 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 47 | #if defined(THROW_FAIL) 48 | throw; 49 | #else 50 | break; 51 | #endif 52 | } 53 | 54 | deallocate(A, N * N); 55 | deallocate(V, N); 56 | deallocate(Vout, N); 57 | } 58 | -------------------------------------------------------------------------------- /hands-on/gemv/01-gemv-omp/gemv-omp.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:),V(:),Vout(:) 8 | real(8) :: 
alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: val 12 | 13 | !!starts here 14 | call system_clock(ti,tk) 15 | allocate(A(1:N*N),stat=err) 16 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 17 | allocate(V(1:N),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 19 | allocate(Vout(1:N),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 21 | 22 | A(:) = 1.0 23 | V(:) = 1.0 24 | call gemv(N,alpha,A,V,Vout) 25 | do val=1,N 26 | if (int(Vout(val)) .NE. N) then 27 | write(*,*) "Value does not match at",val,int(Vout(val)) 28 | end if 29 | end do 30 | 31 | 32 | deallocate(A) 33 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 34 | deallocate(V) 35 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 36 | deallocate(Vout) 37 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 38 | call system_clock(tj,tk) 39 | 40 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 41 | 42 | stop 43 | end 44 | 45 | !------------------------------------------------------- 46 | subroutine gemv(nval,alpha,A,V,Vout) 47 | 48 | USE OMP_LIB 49 | implicit none 50 | 51 | integer:: row,col,A_row 52 | integer:: nval,tid 53 | real(8) :: alpha,sum_val 54 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 55 | real(8),intent(out):: Vout(1:nval) 56 | 57 | !$omp parallel do default(shared) private(tid,row,col,A_row,sum_val) 58 | do row=1,nval 59 | !tid=OMP_GET_THREAD_NUM() 60 | sum_val = 0.0 61 | A_row =(row-1)*nval 62 | !write(*,*) "total number of threads: ",tid,A_row,A_row+nval,nval*nval 63 | do col=1,nval 64 | sum_val = sum_val + A(A_row+col)*V(col) 65 | end do 66 | Vout(row) = sum_val * alpha 67 | end do 68 | !$omp end parallel do 69 | 70 | end subroutine 71 | -------------------------------------------------------------------------------- /hands-on/gemv/02-gemv-omp-target/Makefile: 
-------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/02-gemv-omp-target/gemv-omp-target.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp target parallel for map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | for (int col = 0; col < n; col++) 13 | sum += A_row[col] * V[col]; 14 | Vout[row] = sum * alpha; 15 | } 16 | } 17 | 18 | template 19 | T* allocate(size_t n) 20 | { 21 | T* ptr = new T[n]; 22 | std::fill_n(ptr, n, T(1)); 23 | return ptr; 24 | } 25 | 26 | template 27 | void deallocate(T* ptr, size_t n) 28 | { 29 | delete[] ptr; 30 | } 31 | 32 | int main() 33 | { 34 | auto* A = allocate(N * N); 35 | auto* V = allocate(N); 36 | auto* Vout = allocate(N); 37 | 38 | { 39 | Timer local("GEMV"); 40 | gemv(N, 1.0f, A, V, Vout); 41 | } 42 | 43 | for (int i = 0; i < N; i++) 44 | if (Vout[i] != N) 45 | { 46 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 47 | #if defined(THROW_FAIL) 48 | throw; 49 | #else 50 | break; 
51 | #endif 52 | } 53 | 54 | deallocate(A, N * N); 55 | deallocate(V, N); 56 | deallocate(Vout, N); 57 | } 58 | -------------------------------------------------------------------------------- /hands-on/gemv/02-gemv-omp-target/gemv-omp-target.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:),V(:),Vout(:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: val 12 | 13 | !!starts here 14 | call system_clock(ti,tk) 15 | allocate(A(1:N*N),stat=err) 16 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 17 | allocate(V(1:N),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 19 | allocate(Vout(1:N),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 21 | 22 | A(:) = 1.0 23 | V(:) = 1.0 24 | call gemv(N,alpha,A,V,Vout) 25 | do val=1,N 26 | if (int(Vout(val)) .NE. 
N) then 27 | write(*,*) "Value does not match at",val,int(Vout(val)) 28 | end if 29 | end do 30 | 31 | 32 | deallocate(A) 33 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 34 | deallocate(V) 35 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 36 | deallocate(Vout) 37 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 38 | call system_clock(tj,tk) 39 | 40 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 41 | 42 | stop 43 | end 44 | 45 | !------------------------------------------------------- 46 | subroutine gemv(nval,alpha,A,V,Vout) 47 | 48 | USE OMP_LIB 49 | implicit none 50 | 51 | integer:: row,col,A_row 52 | integer:: nval,tid 53 | real(8) :: alpha,sum_val 54 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 55 | real(8),intent(out):: Vout(1:nval) 56 | 57 | !$omp target map(to:A,V) map(from:Vout) 58 | !$omp parallel do default(shared) private(tid,row,col,A_row,sum_val) 59 | do row=1,nval 60 | !tid=OMP_GET_THREAD_NUM() 61 | sum_val = 0.0 62 | A_row =(row-1)*nval 63 | !write(*,*) "total number of threads: ",tid,A_row,A_row+nval,nval*nval 64 | do col=1,nval 65 | sum_val = sum_val + A(A_row+col)*V(col) 66 | end do 67 | Vout(row) = sum_val * alpha 68 | end do 69 | !$omp end parallel do 70 | !$omp end target 71 | end subroutine 72 | -------------------------------------------------------------------------------- /hands-on/gemv/03-gemv-omp-target-teams/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-teams 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | 
${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/03-gemv-omp-target-teams/gemv-omp-target-teams.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp target teams distribute parallel for map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | for (int col = 0; col < n; col++) 13 | sum += A_row[col] * V[col]; 14 | Vout[row] = sum * alpha; 15 | } 16 | } 17 | 18 | template 19 | T* allocate(size_t n) 20 | { 21 | T* ptr = new T[n]; 22 | std::fill_n(ptr, n, T(1)); 23 | //#pragma omp target enter data map(to:ptr[:n]) 24 | return ptr; 25 | } 26 | 27 | template 28 | void deallocate(T* ptr, size_t n) 29 | { 30 | //#pragma omp target exit data map(delete:ptr[:n]) 31 | delete[] ptr; 32 | } 33 | 34 | int main() 35 | { 36 | auto* A = allocate(N * N); 37 | auto* V = allocate(N); 38 | auto* Vout = allocate(N); 39 | 40 | { 41 | Timer local("GEMV"); 42 | gemv(N, 1.0f, A, V, Vout); 43 | } 44 | 45 | for (int i = 0; i < N; i++) 46 | if (Vout[i] != N) 47 | { 48 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 49 | #if defined(THROW_FAIL) 50 | throw; 51 | #else 52 | break; 53 | #endif 54 | } 55 | 56 | deallocate(A, N * N); 57 | deallocate(V, N); 58 | deallocate(Vout, N); 59 | } 60 | -------------------------------------------------------------------------------- /hands-on/gemv/03-gemv-omp-target-teams/gemv-omp-target-teams.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 
| integer,parameter:: N=64 7 | real(8),allocatable :: A(:),V(:),Vout(:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: val 12 | 13 | !!starts here 14 | call system_clock(ti,tk) 15 | allocate(A(1:N*N),stat=err) 16 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 17 | allocate(V(1:N),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 19 | allocate(Vout(1:N),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 21 | 22 | A(:) = 1.0 23 | V(:) = 1.0 24 | call gemv(N,alpha,A,V,Vout) 25 | do val=1,N 26 | if (int(Vout(val)) .NE. N) then 27 | write(*,*) "Value does not match at",val,int(Vout(val)) 28 | end if 29 | end do 30 | 31 | 32 | deallocate(A) 33 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 34 | deallocate(V) 35 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 36 | deallocate(Vout) 37 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 38 | call system_clock(tj,tk) 39 | 40 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 41 | 42 | stop 43 | end 44 | 45 | !------------------------------------------------------- 46 | subroutine gemv(nval,alpha,A,V,Vout) 47 | 48 | USE OMP_LIB 49 | implicit none 50 | 51 | integer:: row,col,A_row 52 | integer:: nval,tid 53 | real(8) :: alpha,sum_val 54 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 55 | real(8),intent(out):: Vout(1:nval) 56 | 57 | !$omp target teams distribute parallel do & 58 | !$omp map(to:A,V) map(from:Vout) & 59 | !$omp default(shared) private(tid,row,col,A_row,sum_val) 60 | do row=1,nval 61 | !tid=OMP_GET_THREAD_NUM() 62 | sum_val = 0.0 63 | A_row =(row-1)*nval 64 | !write(*,*) "total number of threads: ",tid,A_row,A_row+nval,nval*nval 65 | do col=1,nval 66 | sum_val = sum_val + A(A_row+col)*V(col) 67 | end do 68 | Vout(row) = sum_val * alpha 69 | end do 70 | !$omp end target teams distribute parallel do 71 | end subroutine 72 | 
-------------------------------------------------------------------------------- /hands-on/gemv/04-gemv-omp-target-reduction/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-reduction 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/04-gemv-omp-target-reduction/gemv-omp-target-reduction.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | #pragma omp parallel for reduction(+ : sum) 13 | for (int col = 0; col < n; col++) 14 | sum += A_row[col] * V[col]; 15 | Vout[row] = sum * alpha; 16 | } 17 | } 18 | 19 | template 20 | T* allocate(size_t n) 21 | { 22 | T* ptr = new T[n]; 23 | std::fill_n(ptr, n, T(1)); 24 | #pragma omp target enter data map(to : ptr[:n]) 25 | return ptr; 26 | } 27 | 28 | template 29 | void deallocate(T* ptr, size_t n) 30 | { 31 | #pragma omp target exit data map(delete : ptr[:n]) 32 | delete[] ptr; 33 | } 34 | 35 | int main() 36 | { 37 | auto* A = allocate(N * N); 38 | auto* V = allocate(N); 39 | 
auto* Vout = allocate(N); 40 | 41 | { 42 | Timer local("GEMV"); 43 | gemv(N, 1.0f, A, V, Vout); 44 | } 45 | 46 | #pragma omp target update from(Vout[:N]) 47 | for (int i = 0; i < N; i++) 48 | if (Vout[i] != N) 49 | { 50 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 51 | #if defined(THROW_FAIL) 52 | throw; 53 | #else 54 | break; 55 | #endif 56 | } 57 | 58 | deallocate(A, N * N); 59 | deallocate(V, N); 60 | deallocate(Vout, N); 61 | } 62 | -------------------------------------------------------------------------------- /hands-on/gemv/04-gemv-omp-target-reduction/gemv-omp-target-reduction.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:),V(:),Vout(:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: val 12 | 13 | !!starts here 14 | call system_clock(ti,tk) 15 | allocate(A(1:N*N),stat=err) 16 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 17 | allocate(V(1:N),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 19 | allocate(Vout(1:N),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 21 | 22 | A(:) = 1.0 23 | V(:) = 1.0 24 | call gemv(N,alpha,A,V,Vout) 25 | do val=1,N 26 | if (int(Vout(val)) .NE. 
N) then 27 | write(*,*) "Value does not match at",val,int(Vout(val)) 28 | end if 29 | end do 30 | 31 | 32 | deallocate(A) 33 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 34 | deallocate(V) 35 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 36 | deallocate(Vout) 37 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 38 | call system_clock(tj,tk) 39 | 40 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 41 | 42 | stop 43 | end 44 | 45 | !------------------------------------------------------- 46 | subroutine gemv(nval,alpha,A,V,Vout) 47 | 48 | USE OMP_LIB 49 | implicit none 50 | 51 | integer:: row,col,A_row 52 | integer:: nval,tid 53 | real(8) :: alpha,sum_val 54 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 55 | real(8),intent(out):: Vout(1:nval) 56 | 57 | !$omp target teams distribute map(to:A,V) map(from:Vout) private(sum_val) 58 | do row=1,nval 59 | sum_val = 0.0 60 | A_row =(row-1)*nval 61 | !$omp parallel do reduction(+:sum_val) 62 | do col=1,nval 63 | sum_val = sum_val + A(A_row+col)*V(col) 64 | end do 65 | Vout(row) = sum_val * alpha 66 | end do 67 | !$omp end target teams distribute 68 | end subroutine 69 | -------------------------------------------------------------------------------- /hands-on/gemv/05-gemv-omp-target-split-parallel-for-reduction/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-split-parallel-for-reduction 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | #all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | 
.PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/05-gemv-omp-target-split-parallel-for-reduction/gemv-omp-target-split-parallel-for-reduction.cpp: -------------------------------------------------------------------------------- 1 | #define N 8192 2 | #include "timer.h" 3 | 4 | template 5 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 6 | { 7 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | #pragma omp parallel 13 | { 14 | #pragma omp for reduction(+ : sum) 15 | for (int col = 0; col < n; col++) 16 | sum += A_row[col] * V[col]; 17 | } 18 | Vout[row] = sum * alpha; 19 | } 20 | } 21 | 22 | template 23 | T* allocate(size_t n) 24 | { 25 | T* ptr = new T[n]; 26 | std::fill_n(ptr, n, T(1)); 27 | #pragma omp target enter data map(to : ptr[:n]) 28 | return ptr; 29 | } 30 | 31 | template 32 | void deallocate(T* ptr, size_t n) 33 | { 34 | #pragma omp target exit data map(delete : ptr[:n]) 35 | delete[] ptr; 36 | } 37 | 38 | int main() 39 | { 40 | auto* A = allocate(N * N); 41 | auto* V = allocate(N); 42 | auto* Vout = allocate(N); 43 | 44 | { 45 | Timer local("GEMV"); 46 | gemv(N, 1.0f, A, V, Vout); 47 | } 48 | 49 | #pragma omp target update from(Vout[:N]) 50 | for (int i = 0; i < N; i++) 51 | if (Vout[i] != N) 52 | { 53 | std::cerr << "Vout[" << i << "] != " << N << ", wrong value is " << Vout[i] << std::endl; 54 | #if defined(THROW_FAIL) 55 | throw; 56 | #else 57 | break; 58 | #endif 59 | } 60 | 61 | deallocate(A, N * N); 62 | deallocate(V, N); 63 | deallocate(Vout, N); 64 | } 65 | -------------------------------------------------------------------------------- /hands-on/gemv/51-gemv-omp-many-matrices/Makefile: 
-------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-many-matrices 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/51-gemv-omp-many-matrices/gemv-omp-many-matrices.cpp: -------------------------------------------------------------------------------- 1 | #define N 4096 2 | #include 3 | #include "timer.h" 4 | 5 | template 6 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 7 | { 8 | for (int row = 0; row < n; row++) 9 | { 10 | T sum = T(0); 11 | const T* __restrict__ A_row = A + row * n; 12 | for (int col = 0; col < n; col++) 13 | sum += A_row[col] * V[col]; 14 | Vout[row] = sum * alpha; 15 | } 16 | } 17 | 18 | template 19 | T* allocate(size_t n) 20 | { 21 | T* ptr = new T[n]; 22 | std::fill_n(ptr, n, T(1)); 23 | return ptr; 24 | } 25 | 26 | template 27 | void deallocate(T* ptr, size_t n) 28 | { 29 | delete[] ptr; 30 | } 31 | 32 | int main() 33 | { 34 | std::vector manyA; 35 | std::vector manyV; 36 | std::vector manyVout; 37 | 38 | const int Num_calc = 8; 39 | for (int i = 0; i < Num_calc; i++) 40 | { 41 | manyA.push_back(allocate(N * N)); 42 | manyV.push_back(allocate(N)); 43 | manyVout.push_back(allocate(N)); 44 | } 45 | 46 | { 47 | Timer local("multiGEMV"); 48 | #pragma omp parallel for 49 | for (int i = 0; i < Num_calc; i++) 50 | gemv(N, 1.0f, manyA[i], manyV[i], manyVout[i]); 51 | } 52 | 53 | for (int i = 0; i < 
Num_calc; i++) 54 | { 55 | for (int j = 0; j < N; j++) 56 | if (manyVout[i][j] != N) 57 | { 58 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << manyVout[i][j] 59 | << std::endl; 60 | #if defined(THROW_FAIL) 61 | throw; 62 | #else 63 | break; 64 | #endif 65 | } 66 | 67 | deallocate(manyA[i], N * N); 68 | deallocate(manyV[i], N); 69 | deallocate(manyVout[i], N); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /hands-on/gemv/51-gemv-omp-many-matrices/gemv-omp-many-matrices.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:,:),V(:,:),Vout(:,:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: i,val 12 | integer,parameter :: Num_calc=8 13 | 14 | !!starts here 15 | call system_clock(ti,tk) 16 | 17 | allocate(A(1:N*N,1:Num_calc),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 19 | allocate(V(1:N,1:Num_calc),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 21 | allocate(Vout(1:N,1:Num_calc),stat=err) 22 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 23 | 24 | 25 | !$omp parallel do 26 | do i=1,Num_calc 27 | A(:,i) = 1.0 28 | V(:,i) = 1.0 29 | call gemv(N,alpha,A(:,i),V(:,i),Vout(:,i)) 30 | end do 31 | !$omp end parallel do 32 | 33 | do i=1,Num_calc 34 | do val=1,N 35 | if (int(Vout(val,i)) .NE. 
N) then 36 | write(*,*) "Value does not match at",val,i,int(Vout(val,i)) 37 | end if 38 | end do 39 | end do 40 | 41 | 42 | deallocate(A) 43 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 44 | deallocate(V) 45 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 46 | deallocate(Vout) 47 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 48 | call system_clock(tj,tk) 49 | 50 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 51 | 52 | stop 53 | end 54 | 55 | !------------------------------------------------------- 56 | subroutine gemv(nval,alpha,A,V,Vout) 57 | 58 | USE OMP_LIB 59 | implicit none 60 | 61 | integer:: row,col,A_row 62 | integer:: nval,tid 63 | real(8) :: alpha,sum_val 64 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 65 | real(8),intent(out):: Vout(1:nval) 66 | 67 | do row=1,nval 68 | !tid=OMP_GET_THREAD_NUM() 69 | sum_val = 0.0 70 | A_row =(row-1)*nval 71 | !write(*,*) "total number of threads: ",tid,A_row,A_row+nval,nval*nval 72 | do col=1,nval 73 | sum_val = sum_val + A(A_row+col)*V(col) 74 | end do 75 | Vout(row) = sum_val * alpha 76 | end do 77 | end subroutine 78 | -------------------------------------------------------------------------------- /hands-on/gemv/52-gemv-omp-target-many-matrices-no-hierachy/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-many-matrices-no-hierachy 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 
| -------------------------------------------------------------------------------- /hands-on/gemv/52-gemv-omp-target-many-matrices-no-hierachy/gemv-omp-target-many-matrices-no-hierachy.cpp: -------------------------------------------------------------------------------- 1 | #define N 4096 2 | #include 3 | #include "timer.h" 4 | 5 | template 6 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 7 | { 8 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 9 | for (int row = 0; row < n; row++) 10 | { 11 | T sum = T(0); 12 | const T* __restrict__ A_row = A + row * n; 13 | for (int col = 0; col < n; col++) 14 | sum += A_row[col] * V[col]; 15 | Vout[row] = sum * alpha; 16 | } 17 | } 18 | 19 | template 20 | T* allocate(size_t n) 21 | { 22 | T* ptr = new T[n]; 23 | std::fill_n(ptr, n, T(1)); 24 | #pragma omp target enter data map(to : ptr[:n]) 25 | return ptr; 26 | } 27 | 28 | template 29 | void deallocate(T* ptr, size_t n) 30 | { 31 | #pragma omp target exit data map(delete : ptr[:n]) 32 | delete[] ptr; 33 | } 34 | 35 | int main() 36 | { 37 | std::vector manyA; 38 | std::vector manyV; 39 | std::vector manyVout; 40 | 41 | const int Num_calc = 8; 42 | for (int i = 0; i < Num_calc; i++) 43 | { 44 | manyA.push_back(allocate(N * N)); 45 | manyV.push_back(allocate(N)); 46 | manyVout.push_back(allocate(N)); 47 | } 48 | 49 | { 50 | Timer local("multiGEMV"); 51 | #pragma omp parallel for 52 | for (int i = 0; i < Num_calc; i++) 53 | gemv(N, 1.0f, manyA[i], manyV[i], manyVout[i]); 54 | } 55 | 56 | for (int i = 0; i < Num_calc; i++) 57 | { 58 | auto* __restrict__ Vout = manyVout[i]; 59 | #pragma omp target update from(Vout[:N]) 60 | for (int j = 0; j < N; j++) 61 | if (Vout[j] != N) 62 | { 63 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << Vout[j] 64 | << std::endl; 65 | #if defined(THROW_FAIL) 66 | throw; 67 | #else 68 | break; 69 | #endif 70 | } 71 | 72 | 
deallocate(manyA[i], N * N); 73 | deallocate(manyV[i], N); 74 | deallocate(manyVout[i], N); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /hands-on/gemv/52-gemv-omp-target-many-matrices-no-hierachy/gemv-omp-target-many-matrices-no-hierachy.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:,:),V(:,:),Vout(:,:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: i,val 12 | integer,parameter :: Num_calc=8 13 | 14 | !!starts here 15 | call system_clock(ti,tk) 16 | 17 | allocate(A(1:N*N,1:Num_calc),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 19 | allocate(V(1:N,1:Num_calc),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 21 | allocate(Vout(1:N,1:Num_calc),stat=err) 22 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 23 | 24 | 25 | !$omp parallel do 26 | do i=1,Num_calc 27 | A(:,i) = 1.0 28 | V(:,i) = 1.0 29 | call gemv(N,alpha,A(:,i),V(:,i),Vout(:,i)) 30 | end do 31 | !$omp end parallel do 32 | 33 | do i=1,Num_calc 34 | !$omp target update from(Vout(:,i)) 35 | do val=1,N 36 | if (int(Vout(val,i)) .NE. 
N) then 37 | write(*,*) "Value does not match at",val,i,int(Vout(val,i)) 38 | end if 39 | end do 40 | end do 41 | 42 | 43 | deallocate(A) 44 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 45 | deallocate(V) 46 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 47 | deallocate(Vout) 48 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 49 | call system_clock(tj,tk) 50 | 51 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 52 | 53 | stop 54 | end 55 | 56 | !------------------------------------------------------- 57 | subroutine gemv(nval,alpha,A,V,Vout) 58 | 59 | USE OMP_LIB 60 | implicit none 61 | 62 | integer:: row,col,A_row 63 | integer:: nval,tid 64 | real(8) :: alpha,sum_val 65 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 66 | real(8),intent(out):: Vout(1:nval) 67 | 68 | !$omp target teams distribute map(to:A,V) map(from:Vout) 69 | do row=1,nval 70 | sum_val = 0.0 71 | A_row =(row-1)*nval 72 | do col=1,nval 73 | sum_val = sum_val + A(A_row+col)*V(col) 74 | end do 75 | Vout(row) = sum_val * alpha 76 | end do 77 | !$omp end target teams distribute 78 | end subroutine 79 | -------------------------------------------------------------------------------- /hands-on/gemv/53-gemv-omp-target-many-matrices/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-many-matrices 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | 
-------------------------------------------------------------------------------- /hands-on/gemv/53-gemv-omp-target-many-matrices/gemv-omp-target-many-matrices.cpp: -------------------------------------------------------------------------------- 1 | #define N 4096 2 | #include 3 | #include "timer.h" 4 | 5 | template 6 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 7 | { 8 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) 9 | for (int row = 0; row < n; row++) 10 | { 11 | T sum = T(0); 12 | const T* __restrict__ A_row = A + row * n; 13 | #pragma omp parallel for reduction(+ : sum) 14 | for (int col = 0; col < n; col++) 15 | sum += A_row[col] * V[col]; 16 | Vout[row] = sum * alpha; 17 | } 18 | } 19 | 20 | template 21 | T* allocate(size_t n) 22 | { 23 | T* ptr = new T[n]; 24 | std::fill_n(ptr, n, T(1)); 25 | #pragma omp target enter data map(to : ptr[:n]) 26 | return ptr; 27 | } 28 | 29 | template 30 | void deallocate(T* ptr, size_t n) 31 | { 32 | #pragma omp target exit data map(delete : ptr[:n]) 33 | delete[] ptr; 34 | } 35 | 36 | int main() 37 | { 38 | std::vector manyA; 39 | std::vector manyV; 40 | std::vector manyVout; 41 | 42 | const int Num_calc = 8; 43 | for (int i = 0; i < Num_calc; i++) 44 | { 45 | manyA.push_back(allocate(N * N)); 46 | manyV.push_back(allocate(N)); 47 | manyVout.push_back(allocate(N)); 48 | } 49 | 50 | { 51 | Timer local("multiGEMV"); 52 | #pragma omp parallel for 53 | for (int i = 0; i < Num_calc; i++) 54 | gemv(N, 1.0f, manyA[i], manyV[i], manyVout[i]); 55 | } 56 | 57 | for (int i = 0; i < Num_calc; i++) 58 | { 59 | auto* __restrict__ Vout = manyVout[i]; 60 | #pragma omp target update from(Vout[:N]) 61 | for (int j = 0; j < N; j++) 62 | if (Vout[j] != N) 63 | { 64 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << Vout[j] 65 | << std::endl; 66 | #if defined(THROW_FAIL) 67 | throw; 68 | #else 69 | break; 70 | 
#endif 71 | } 72 | 73 | deallocate(manyA[i], N * N); 74 | deallocate(manyV[i], N); 75 | deallocate(manyVout[i], N); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /hands-on/gemv/53-gemv-omp-target-many-matrices/gemv-omp-target-many-matrices.f90: -------------------------------------------------------------------------------- 1 | program gemv_omp 2 | 3 | USE OMP_LIB 4 | 5 | implicit none 6 | integer,parameter:: N=64 7 | real(8),allocatable :: A(:,:),V(:,:),Vout(:,:) 8 | real(8) :: alpha=1.0 9 | integer :: ti,tj,tk 10 | integer :: err 11 | integer :: i,val 12 | integer,parameter :: Num_calc=8 13 | 14 | !!starts here 15 | call system_clock(ti,tk) 16 | 17 | allocate(A(1:N*N,1:Num_calc),stat=err) 18 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for A',err 19 | allocate(V(1:N,1:Num_calc),stat=err) 20 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for V',err 21 | allocate(Vout(1:N,1:Num_calc),stat=err) 22 | if(err/=0) print'(a30,i9,i3)', 'ERROR in allocation for Vout',err 23 | 24 | 25 | !$omp parallel do 26 | do i=1,Num_calc 27 | A(:,i) = 1.0 28 | V(:,i) = 1.0 29 | call gemv(N,alpha,A(:,i),V(:,i),Vout(:,i)) 30 | end do 31 | !$omp end parallel do 32 | 33 | do i=1,Num_calc 34 | !$omp target update from(Vout(:,i)) 35 | do val=1,N 36 | if (int(Vout(val,i)) .NE. 
N) then 37 | write(*,*) "Value does not match at",val,i,int(Vout(val,i)) 38 | end if 39 | end do 40 | end do 41 | 42 | 43 | deallocate(A) 44 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for A',err 45 | deallocate(V) 46 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for V',err 47 | deallocate(Vout) 48 | if(err/=0) print'(a30,i9,i3)', 'ERROR in deallocation for Vout',err 49 | call system_clock(tj,tk) 50 | 51 | print'(a20,3x,f12.4)',"total time: ", dble(tj-ti)/dble(tk) 52 | 53 | stop 54 | end 55 | 56 | !------------------------------------------------------- 57 | subroutine gemv(nval,alpha,A,V,Vout) 58 | 59 | USE OMP_LIB 60 | implicit none 61 | 62 | integer:: row,col,A_row 63 | integer:: nval,tid 64 | real(8) :: alpha,sum_val 65 | real(8),intent(in) :: A(1:nval*nval),V(1:nval) 66 | real(8),intent(out):: Vout(1:nval) 67 | 68 | !$omp target teams distribute map(to:A,V) map(from:Vout) 69 | do row=1,nval 70 | sum_val = 0.0 71 | A_row =(row-1)*nval 72 | !$omp parallel do default(shared) private(A_row) reduction(+:sum_val) 73 | do col=1,nval 74 | sum_val = sum_val + A(A_row+col)*V(col) 75 | end do 76 | Vout(row) = sum_val * alpha 77 | end do 78 | !$omp end target teams distribute 79 | end subroutine 80 | -------------------------------------------------------------------------------- /hands-on/gemv/54-gemv-omp-target-many-matrices-multi-devices/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-many-matrices-multi-devices 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | #all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} 
-o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | -------------------------------------------------------------------------------- /hands-on/gemv/54-gemv-omp-target-many-matrices-multi-devices/gemv-omp-target-many-matrices-multi-devices.cpp: -------------------------------------------------------------------------------- 1 | #define N 4096 2 | #include 3 | #include "timer.h" 4 | #if defined(_OPEMMP) 5 | #include 6 | #endif 7 | 8 | template 9 | void gemv(int deviceID, int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 10 | { 11 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) device(deviceID) 12 | for (int row = 0; row < n; row++) 13 | { 14 | T sum = T(0); 15 | const T* __restrict__ A_row = A + row * n; 16 | #pragma omp parallel for reduction(+ : sum) 17 | for (int col = 0; col < n; col++) 18 | sum += A_row[col] * V[col]; 19 | Vout[row] = sum * alpha; 20 | } 21 | } 22 | 23 | template 24 | T* allocate(int deviceID, size_t n) 25 | { 26 | T* ptr = new T[n]; 27 | std::fill_n(ptr, n, T(1)); 28 | #pragma omp target enter data map(to : ptr[:n]) device(deviceID) 29 | return ptr; 30 | } 31 | 32 | template 33 | void deallocate(int deviceID, T* ptr, size_t n) 34 | { 35 | #pragma omp target exit data map(delete : ptr[:n]) device(deviceID) 36 | delete[] ptr; 37 | } 38 | 39 | int main() 40 | { 41 | #if defined(_OPEMMP) 42 | const int num_devices = omp_get_num_devices(); 43 | #else 44 | const int num_devices = 1; 45 | #endif 46 | std::cout << "Found " << num_devices << " devices." 
<< std::endl; 47 | 48 | std::vector manyA; 49 | std::vector manyV; 50 | std::vector manyVout; 51 | 52 | const int Num_calc = 8; 53 | for (int i = 0; i < Num_calc; i++) 54 | { 55 | manyA.push_back(allocate(i % num_devices, N * N)); 56 | manyV.push_back(allocate(i % num_devices, N)); 57 | manyVout.push_back(allocate(i % num_devices, N)); 58 | } 59 | 60 | { 61 | Timer local("multiGEMV"); 62 | #pragma omp parallel for 63 | for (int i = 0; i < Num_calc; i++) 64 | gemv(i % num_devices, N, 1.0f, manyA[i], manyV[i], manyVout[i]); 65 | } 66 | 67 | for (int i = 0; i < Num_calc; i++) 68 | { 69 | auto* __restrict__ Vout = manyVout[i]; 70 | #pragma omp target update from(Vout[:N]) device(i % num_devices) 71 | for (int j = 0; j < N; j++) 72 | if (Vout[j] != N) 73 | { 74 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << Vout[j] 75 | << std::endl; 76 | #if defined(THROW_FAIL) 77 | throw; 78 | #else 79 | break; 80 | #endif 81 | } 82 | 83 | deallocate(i % num_devices, manyA[i], N * N); 84 | deallocate(i % num_devices, manyV[i], N); 85 | deallocate(i % num_devices, manyVout[i], N); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /hands-on/gemv/55-gemv-omp-target-many-matrices-taskloop/Makefile: -------------------------------------------------------------------------------- 1 | include ../../make.inc 2 | 3 | name=gemv-omp-target-many-matrices-taskloop 4 | 5 | all_targets= 6 | ifdef ENABLE_C 7 | all_targets += ${name}.c.x 8 | endif 9 | ifdef ENABLE_CXX 10 | all_targets += ${name}.cpp.x 11 | endif 12 | ifdef ENABLE_FC 13 | all_targets += ${name}.f.x 14 | endif 15 | 16 | all: ${all_targets} 17 | 18 | ${name}.cpp.x: ${name}.cpp 19 | ${CXX} -std=c++11 ${CXX_FLAGS} ${CXX_OFFLOAD_FLAGS} -o $@ -I ../../common $< && ./$@ 20 | 21 | ${name}.f.x: ${name}.f90 22 | ${FC} ${FC_FLAGS} ${FC_OFFLOAD_FLAGS} -o $@ $< && ./$@ 23 | 24 | .PHONY : clean 25 | clean : 26 | rm -f *.x 27 | 
-------------------------------------------------------------------------------- /hands-on/gemv/55-gemv-omp-target-many-matrices-taskloop/gemv-omp-target-many-matrices-taskloop.cpp: -------------------------------------------------------------------------------- 1 | #define N 256 2 | #include 3 | #include 4 | #include "timer.h" 5 | 6 | template 7 | void gemv(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 8 | { 9 | #pragma omp target teams distribute map(to : A[:n * n], V[:n]) map(from : Vout[:n]) nowait 10 | for (int row = 0; row < n; row++) 11 | { 12 | T sum = T(0); 13 | const T* __restrict__ A_row = A + row * n; 14 | #pragma omp parallel for reduction(+ : sum) 15 | for (int col = 0; col < n; col++) 16 | sum += A_row[col] * V[col]; 17 | Vout[row] = sum * alpha; 18 | } 19 | } 20 | 21 | template 22 | void gemv_host(int n, T alpha, const T* __restrict__ A, const T* __restrict__ V, T* __restrict__ Vout) 23 | { 24 | for (int row = 0; row < n; row++) 25 | { 26 | T sum = T(0); 27 | const T* __restrict__ A_row = A + row * n; 28 | for (int col = 0; col < n; col++) 29 | sum += A_row[col] * V[col]; 30 | Vout[row] = sum * alpha; 31 | } 32 | } 33 | 34 | template 35 | T* allocate(size_t n) 36 | { 37 | T* ptr = new T[n]; 38 | std::fill_n(ptr, n, T(1)); 39 | #pragma omp target enter data map(to : ptr[:n]) 40 | return ptr; 41 | } 42 | 43 | template 44 | void deallocate(T* ptr, size_t n) 45 | { 46 | #pragma omp target exit data map(delete : ptr[:n]) 47 | delete[] ptr; 48 | } 49 | 50 | int main() 51 | { 52 | std::vector manyA; 53 | std::vector manyV; 54 | std::vector manyVout; 55 | 56 | const int Num_calc = 512; 57 | for (int i = 0; i < Num_calc; i++) 58 | { 59 | manyA.push_back(allocate(N * N)); 60 | manyV.push_back(allocate(N)); 61 | manyVout.push_back(allocate(N)); 62 | } 63 | 64 | // warm up 65 | #pragma omp parallel 66 | { 67 | #pragma omp target nowait 68 | { int a = 1; } 69 | } 70 | 71 | { 72 | Timer local("multiGEMV parallel 
taskloop"); 73 | #pragma omp parallel 74 | #pragma omp single 75 | #pragma omp taskloop 76 | for (int i = 0; i < Num_calc; i++) 77 | if (i%2) 78 | gemv(N, 1.0f, manyA[i], manyV[i], manyVout[i]); 79 | else 80 | gemv_host(N/16, 1.0f, manyA[i], manyV[i], manyVout[i]); 81 | } 82 | 83 | for (int i = 0; i < Num_calc; i++) 84 | { 85 | auto* __restrict__ Vout = manyVout[i]; 86 | if (i%2) 87 | { 88 | #pragma omp target update from(Vout[:N]) 89 | for (int j = 0; j < N; j++) 90 | if (Vout[j] != N) 91 | { 92 | std::cerr << "Calculation " << i << " Vout[" << j << "] != " << N << ", wrong value is " << Vout[j] 93 | << std::endl; 94 | #if defined(THROW_FAIL) 95 | throw; 96 | #else 97 | break; 98 | #endif 99 | } 100 | } 101 | 102 | deallocate(manyA[i], N * N); 103 | deallocate(manyV[i], N); 104 | deallocate(manyVout[i], N); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /hands-on/gemv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(CXX_SRCS 3 | 01-gemv-omp/gemv-omp.cpp 4 | 02-gemv-omp-target/gemv-omp-target.cpp 5 | 03-gemv-omp-target-teams/gemv-omp-target-teams.cpp 6 | 04-gemv-omp-target-reduction/gemv-omp-target-reduction.cpp 7 | 05-gemv-omp-target-split-parallel-for-reduction/gemv-omp-target-split-parallel-for-reduction.cpp 8 | 51-gemv-omp-many-matrices/gemv-omp-many-matrices.cpp 9 | 52-gemv-omp-target-many-matrices-no-hierachy/gemv-omp-target-many-matrices-no-hierachy.cpp 10 | 53-gemv-omp-target-many-matrices/gemv-omp-target-many-matrices.cpp 11 | 54-gemv-omp-target-many-matrices-multi-devices/gemv-omp-target-many-matrices-multi-devices.cpp 12 | 55-gemv-omp-target-many-matrices-taskloop/gemv-omp-target-many-matrices-taskloop.cpp 13 | ) 14 | 15 | foreach(full_file_name IN ITEMS ${CXX_SRCS}) 16 | get_filename_component(name_only ${full_file_name} NAME_WE) 17 | set(EXE_NAME cxx.${name_only}) 18 | add_executable(${EXE_NAME} ${full_file_name}) 19 | 
Remember to create a make.inc under hands-on
Use make.clang-ykt.inc as an example
12 | done 13 | -------------------------------------------------------------------------------- /hands-on/make.aomp.inc: -------------------------------------------------------------------------------- 1 | CXX=clang++ 2 | CXX_FLAGS=-g -O3 -fopenmp 3 | CXX_OFFLOAD_FLAGS=-fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 4 | 5 | FC=flang 6 | FC_FLAGS=${CXX_FLAGS} 7 | FC_OFFLOAD_FLAGS=${CXX_OFFLOAD_FLAGS} 8 | -------------------------------------------------------------------------------- /hands-on/make.gcc-nv.inc: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXX_FLAGS=-g -O3 -fopenmp 3 | CXX_OFFLOAD_FLAGS=-foffload=nvptx-none 4 | 5 | FC=gfortran 6 | FC_FLAGS=-fopenmp 7 | FC_OFFLOAD_FLAGS=-foffload=nvptx-none 8 | -------------------------------------------------------------------------------- /hands-on/make.icx.inc: -------------------------------------------------------------------------------- 1 | CXX=icpx 2 | CXX_FLAGS=-g -O2 -fiopenmp 3 | CXX_OFFLOAD_FLAGS=-fopenmp-targets=spir64 4 | 5 | FC=ifx 6 | FC_FLAGS=${CXX_FLAGS} 7 | FC_OFFLOAD_FLAGS=${CXX_OFFLOAD_FLAGS} 8 | 9 | -------------------------------------------------------------------------------- /hands-on/make.llvm.inc: -------------------------------------------------------------------------------- 1 | CXX=clang++ 2 | CXX_FLAGS=-g -O3 -fopenmp 3 | CXX_OFFLOAD_FLAGS=-fopenmp-targets=nvptx64 4 | 5 | FC=flang 6 | FC_FLAGS=${CXX_FLAGS} 7 | FC_OFFLOAD_FLAGS=${CXX_OFFLOAD_FLAGS} 8 | -------------------------------------------------------------------------------- /hands-on/make.xl.inc: -------------------------------------------------------------------------------- 1 | CXX=xlC_r 2 | CXX_FLAGS=-g -O3 -qsmp=omp 3 | CXX_OFFLOAD_FLAGS=-qoffload 4 | 5 | FC=xlf90_r 6 | FC_FLAGS=${CXX_FLAGS} 7 | FC_OFFLOAD_FLAGS=${CXX_OFFLOAD_FLAGS} 8 | -------------------------------------------------------------------------------- 
cd qmcpack && git checkout d029f1f2c976c39486b5122cd81566f93afb2461 && cd -
34 | -DMPIEXEC_EXECUTABLE=`which srun` -DQMC_DATA=$WORKSPACE/QMCDATA ../qmcpack 35 | 36 | # grab a node 37 | salloc -A MAT189 -t 00:60:00 -N 1 38 | 39 | # hdf5 workaround 40 | export HDF5_USE_FILE_LOCKING=FALSE 41 | 42 | # run all the deterministic tests 43 | ctest -R deter -j32 44 | 45 | # run performance tests. test files are inside the build directory 46 | # please run each of them as an individual test because test may hang due to bugs in AMD runtime. 47 | 48 | cd tests/performance/NiO/dmc-a4-e48-DU8-batched_driver 49 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S1-dmc.xml 50 | cd - 51 | 52 | cd tests/performance/NiO/dmc-a8-e96-DU16-batched_driver 53 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S2-dmc.xml 54 | cd - 55 | 56 | cd tests/performance/NiO/dmc-a16-e192-DU16-batched_driver 57 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S4-dmc.xml 58 | cd - 59 | 60 | cd tests/performance/NiO/dmc-a32-e384-DU32-batched_driver 61 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S8-dmc.xml 62 | cd - 63 | 64 | cd tests/performance/NiO/dmc-a64-e768-DU32-batched_driver 65 | srun --gpus-per-task 1 -c 4 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S16-dmc.xml 66 | cd - 67 | 68 | cd tests/performance/NiO/dmc-a512-e6144-DU64-cpu_driver 69 | srun --gpus-per-task 1 -c 16 --gpu-bind=closest ../../../../bin/qmcpack NiO-fcc-S128-dmc.xml 70 | cd - 71 | -------------------------------------------------------------------------------- /integration/crusher_recipe/modules/cray-mpich-afar.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | 3 | file cray-mpich-llvm module 4 | This is not from Cray. 
5 | ]]-- 6 | 7 | conflict("mpiwrappers") 8 | unload("PrgEnv-cray") 9 | unload("PrgEnv-gnu") 10 | unload("PrgEnv-amd") 11 | unload("PrgEnv-cray-amd") 12 | 13 | load("gcc") 14 | load("afar") 15 | load("craype") 16 | load("cray-mpich") 17 | load("cray-pals") 18 | 19 | prepend_path("PATH", pathJoin(os.getenv("MPICH_DIR"), "bin"), ":") 20 | 21 | setenv("MPICH_CC", "clang") 22 | setenv("MPICH_CXX", "clang++") 23 | setenv("MPICH_FC", "flang") 24 | setenv("MPICH_F77", "flang") 25 | setenv("MPICH_F90", "flang") 26 | 27 | setenv("ROCM_PATH", pathJoin(os.getenv("OLCF_AFAR_ROOT"))) 28 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TESTS_BINDIR ${CMAKE_CURRENT_BINARY_DIR}/bin) 2 | 3 | add_subdirectory(allocator) 4 | add_subdirectory(complex) 5 | add_subdirectory(global_variable) 6 | add_subdirectory(linking) 7 | add_subdirectory(math) 8 | add_subdirectory(private) 9 | add_subdirectory(target_task) 10 | add_subdirectory(omphost) 11 | add_subdirectory(reduction) 12 | add_subdirectory(implict_async) 13 | add_subdirectory(map) 14 | 15 | if (ENABLE_Fortran) 16 | add_subdirectory(fortran_use_device_ptr) 17 | add_subdirectory(fortran_allocator) 18 | endif() 19 | -------------------------------------------------------------------------------- /tests/allocator/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX AND ENABLE_EXPERIMENTAL) 2 | set(FULLNAME omp_pteam_mem_alloc) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | 
-------------------------------------------------------------------------------- /tests/allocator/omp_pteam_mem_alloc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #define NTEAM 32 4 | #define TEAM_SIZE 128 5 | 6 | template 7 | void compute_prefactor(int team_id, T base[2]) 8 | { 9 | base[0] = team_id; 10 | base[1] = team_id * 2; 11 | } 12 | 13 | bool failed = false; 14 | 15 | template 16 | void test_omp_pteam_mem_alloc() 17 | { 18 | T sum[NTEAM]; 19 | #pragma omp target teams distribute map(from:sum[:NTEAM]) 20 | for(int team_id = 0; team_id < NTEAM; team_id++) 21 | { 22 | T local_sum = 0; 23 | T base[2]; 24 | #pragma omp allocate(base) allocator(omp_pteam_mem_alloc) 25 | compute_prefactor(team_id, base); 26 | #pragma omp parallel for reduction(+: local_sum) 27 | for(int tid = 0; tid < TEAM_SIZE; tid++) 28 | local_sum += base[0] + tid; 29 | sum[team_id] = local_sum; 30 | } 31 | for(int team_id = 0; team_id < NTEAM; team_id++) 32 | if (sum[team_id] != team_id * TEAM_SIZE + (TEAM_SIZE -1) * TEAM_SIZE / 2 ) 33 | { 34 | std::cout << "sum[" << team_id << "] = " << sum[team_id] << " ref " << team_id * TEAM_SIZE + (TEAM_SIZE -1) * TEAM_SIZE / 2 << std::endl; 35 | failed = true; 36 | } 37 | } 38 | 39 | int main() 40 | { 41 | test_omp_pteam_mem_alloc(); 42 | return failed; 43 | } 44 | -------------------------------------------------------------------------------- /tests/complex/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME complex_reduction_cpu complex_reduction complex) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | 
endforeach() 13 | endif() 14 | 15 | if (ENABLE_Fortran) 16 | set(FULLNAME complex) 17 | set(EXE_NAME f.${FULLNAME}) 18 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 19 | set_target_properties(${EXE_NAME} PROPERTIES 20 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 21 | add_test(NAME ${EXE_NAME} 22 | COMMAND $ 23 | WORKING_DIRECTORY ${TESTS_BINDIR}) 24 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS fortran) 25 | endif() 26 | -------------------------------------------------------------------------------- /tests/complex/complex.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | template 7 | void test_map() 8 | { 9 | std::complex a(0.2, 1), a_check; 10 | #pragma omp target map(from:a_check) 11 | { 12 | a_check = a; 13 | } 14 | 15 | if (std::abs(a - a_check) > 1e-6) 16 | { 17 | std::cout << "wrong map value check" << a_check << " correct value " << a << std::endl; 18 | failed = true; 19 | } 20 | } 21 | 22 | template 23 | void test_plus(AT a, BT b) 24 | { 25 | std::complex c, c_host; 26 | 27 | c_host = a + b; 28 | #pragma omp target map(from:c) 29 | { 30 | c = a + b; 31 | } 32 | 33 | if (std::abs(c - c_host) > 1e-6) 34 | { 35 | std::cout << "wrong operator + value check" << c << " correct value " << c_host << std::endl; 36 | failed = true; 37 | } 38 | } 39 | 40 | template 41 | void test_minus(AT a, BT b) 42 | { 43 | std::complex c, c_host; 44 | 45 | c_host = a - b; 46 | #pragma omp target map(from:c) 47 | { 48 | c = a - b; 49 | } 50 | 51 | if (std::abs(c - c_host) > 1e-6) 52 | { 53 | std::cout << "wrong operator - value check" << c << " correct value " << c_host << std::endl; 54 | failed = true; 55 | } 56 | } 57 | 58 | template 59 | void test_mul(AT a, BT b) 60 | { 61 | std::complex c, c_host; 62 | 63 | c_host = a * b; 64 | #pragma omp target map(from:c) 65 | { 66 | c = a * b; 67 | } 68 | 69 | if (std::abs(c - c_host) > 1e-6) 70 | { 71 | std::cout << "wrong operator * 
value check" << c << " correct value " << c_host << std::endl; 72 | failed = true; 73 | } 74 | } 75 | 76 | template 77 | void test_div(AT a, BT b) 78 | { 79 | std::complex c, c_host; 80 | 81 | c_host = a / b; 82 | #pragma omp target map(from:c) 83 | { 84 | c = a / b; 85 | } 86 | 87 | if (std::abs(c - c_host) > 1e-6) 88 | { 89 | std::cout << "wrong operator / value check" << c << " correct value " << c_host << std::endl; 90 | failed = true; 91 | } 92 | } 93 | 94 | template 95 | void test_complex() 96 | { 97 | test_map(); 98 | 99 | test_plus(std::complex(0, 1), std::complex(0.5, 0.3)); 100 | test_plus(std::complex(0, 1), T(0.5)); 101 | test_plus(T(0.5), std::complex(0, 1)); 102 | 103 | test_minus(std::complex(0, 1), std::complex(0.5, 0.3)); 104 | test_minus(std::complex(0, 1), T(0.5)); 105 | test_minus(T(0.5), std::complex(0, 1)); 106 | 107 | test_mul(std::complex(0, 1), std::complex(0.5, 0.3)); 108 | test_mul(std::complex(0, 1), T(0.5)); 109 | test_mul(T(0.5), std::complex(0, 1)); 110 | 111 | test_div(std::complex(0, 1), std::complex(0.5, 0.3)); 112 | test_div(std::complex(0, 1), T(0.5)); 113 | test_div(T(0.5), std::complex(0, 1)); 114 | } 115 | 116 | int main() 117 | { 118 | std::cout << "Testing float" << std::endl; 119 | test_complex(); 120 | std::cout << "Testing double" << std::endl; 121 | test_complex(); 122 | return failed; 123 | } 124 | -------------------------------------------------------------------------------- /tests/complex/complex.f90: -------------------------------------------------------------------------------- 1 | program test_complex 2 | implicit none 3 | complex :: a, b, c 4 | 5 | a = cmplx(0, 1) 6 | b = cmplx(0.5, 0.3) 7 | !$omp target map(from: c) 8 | c = a*b 9 | !$omp end target 10 | 11 | if (abs(c - a*b) > 1e-7) then 12 | print *, "wrong value ", c, "right value ", a*b 13 | stop 1 14 | endif 15 | end program test_complex 16 | -------------------------------------------------------------------------------- 
/tests/complex/complex_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | template 7 | void test_map() 8 | { 9 | std::complex a(0.2, 1), a_check; 10 | #pragma omp target map(from : a_check) 11 | { 12 | a_check = a; 13 | } 14 | 15 | if (std::abs(a - a_check) > 1e-6) 16 | { 17 | std::cout << "wrong map value check" << a_check << " correct value " << a << std::endl; 18 | failed = true; 19 | } 20 | } 21 | 22 | #if !defined(__NO_UDR) 23 | #pragma omp declare reduction(+ : std::complex : omp_out += omp_in) 24 | #pragma omp declare reduction(+ : std::complex : omp_out += omp_in) 25 | #endif 26 | 27 | template 28 | class initiator 29 | { 30 | public: 31 | static T value(int i) { return T(i); } 32 | }; 33 | 34 | template 35 | class initiator> 36 | { 37 | public: 38 | static std::complex value(int i) { return {T(i), T(-i)}; } 39 | }; 40 | 41 | template 42 | void test_reduction() 43 | { 44 | T sum(0), sum_host(0); 45 | const int size = 100; 46 | T array[size]; 47 | for (int i = 0; i < size; i++) 48 | { 49 | array[i] = initiator::value(i); 50 | sum_host += array[i]; 51 | } 52 | 53 | #pragma omp target teams distribute parallel for map(to : array[:size]) reduction(+ : sum) 54 | for (int i = 0; i < size; i++) 55 | sum += array[i]; 56 | 57 | if (std::abs(sum - sum_host) > 1e-6) 58 | { 59 | std::cout << "wrong reduction value check" << sum << " correct value " << sum_host << std::endl; 60 | failed = true; 61 | } 62 | 63 | const int nblock(10), block_size(10); 64 | T block_sum[nblock]; 65 | #pragma omp target teams distribute map(to : array[:size]) map(from : block_sum[:nblock]) 66 | for (int ib = 0; ib < nblock; ib++) 67 | { 68 | T partial_sum(0); 69 | const int istart = ib * block_size; 70 | const int iend = (ib + 1) * block_size; 71 | #pragma omp parallel for reduction(+ : partial_sum) 72 | for (int i = istart; i < iend; i++) 73 | partial_sum += array[i]; 74 | 
block_sum[ib] = partial_sum; 75 | } 76 | 77 | sum = 0; 78 | for (int ib = 0; ib < nblock; ib++) 79 | sum += block_sum[ib]; 80 | if (std::abs(sum - sum_host) > 1e-6) 81 | { 82 | std::cout << "hierarchical parallelism wrong reduction value check" << sum << " correct value " << sum_host 83 | << std::endl; 84 | failed = true; 85 | } 86 | } 87 | 88 | template 89 | void test_real() 90 | { 91 | test_reduction(); 92 | } 93 | 94 | template 95 | void test_complex() 96 | { 97 | test_map(); 98 | test_reduction>(); 99 | } 100 | 101 | int main() 102 | { 103 | std::cout << "Testing real" << std::endl; 104 | std::cout << "Testing float" << std::endl; 105 | test_real(); 106 | std::cout << "Testing double" << std::endl; 107 | test_real(); 108 | 109 | std::cout << "Testing complex" << std::endl; 110 | std::cout << "Testing float" << std::endl; 111 | test_complex(); 112 | std::cout << "Testing double" << std::endl; 113 | test_complex(); 114 | return failed; 115 | } 116 | -------------------------------------------------------------------------------- /tests/complex/complex_reduction_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | #if !defined(__NO_UDR) 7 | #pragma omp declare reduction(+: std::complex: omp_out += omp_in) 8 | #pragma omp declare reduction(+: std::complex: omp_out += omp_in) 9 | #endif 10 | 11 | template 12 | void test_reduction() 13 | { 14 | T sum(0), sum_host(0); 15 | const int size = 100; 16 | T array[size]; 17 | for (int i = 0; i < size; i++) 18 | { 19 | array[i] = T(i); 20 | sum_host += array[i]; 21 | } 22 | 23 | #pragma omp parallel for reduction(+: sum) 24 | for (int i = 0; i < size; i++) 25 | sum += array[i]; 26 | 27 | if (std::abs(sum - sum_host) > 1e-6) 28 | { 29 | std::cout << "wrong reduction value check" << sum << " correct value " << sum_host << std::endl; 30 | failed = true; 31 | } 32 | } 33 | 34 | int main() 35 | { 36 | test_reduction(); 37 | 
test_reduction>(); 38 | test_reduction(); 39 | test_reduction>(); 40 | return failed; 41 | } 42 | -------------------------------------------------------------------------------- /tests/cudafor_omp/README.md: -------------------------------------------------------------------------------- 1 | Case 1 2 | ``` 3 | nvfortran -cuda noomp.f90 4 | ./a.out 5 | ``` 6 | good 7 | 8 | Case 2 9 | ``` 10 | nvfortran -cuda -mp=gpu noomp.f90 11 | ./a.out 12 | ``` 13 | SegFault 14 | 15 | Case 3 16 | ``` 17 | nvfortran -cuda -mp=gpu omp_below.f90 18 | ``` 19 | Compiler error 20 | ``` 21 | NVFORTRAN-S-1050-Non-tightly nested loop in cuf kernels do at nest 1 (omp_below.f90: 18) 22 | NVFORTRAN-S-0155-Kernel region ignored; no parallel loops (omp_below.f90: 17) 23 | 0 inform, 0 warnings, 2 severes, 0 fatal for repro 24 | NVFORTRAN-S-1058-Call to PGI runtime function not supported - pgf90_dev_copyout (omp_below.f90: 28) 25 | ``` 26 | 27 | Case 4 28 | ``` 29 | nvfortran -cuda -mp=gpu omp_above.f90 30 | ``` 31 | Compiler error 32 | ``` 33 | NVFORTRAN-S-1058-Call to PGI runtime function not supported - pgf90_dev_copyout (omp_above.f90: 20) 34 | ``` 35 | -------------------------------------------------------------------------------- /tests/cudafor_omp/noomp.f90: -------------------------------------------------------------------------------- 1 | module test 2 | implicit none 3 | contains 4 | SUBROUTINE repro() 5 | use cudafor 6 | IMPLICIT NONE 7 | integer, device, allocatable :: nh_d2(:),ityp_d2(:) 8 | integer :: na,i 9 | integer :: nat,npw,ierr 10 | integer :: np,nh_np,ih 11 | nat = 1 12 | npw = 1050 13 | allocate(ityp_d2(1:nat)) 14 | ityp_d2 = 5 15 | allocate(nh_d2(1:1000)) 16 | nh_d2 = 3 17 | !$cuf kernel do(2) <<<*,*>>> 18 | DO na =1, nat 19 | DO i = 1, npw 20 | np = ityp_d2(na) 21 | nh_np = nh_d2(np) 22 | ENDDO 23 | ENDDO 24 | 25 | deallocate(ityp_d2,nh_d2) 26 | ! 
27 | END SUBROUTINE repro 28 | end module test 29 | 30 | program main 31 | use test 32 | implicit none 33 | call repro 34 | end program 35 | -------------------------------------------------------------------------------- /tests/cudafor_omp/omp_above.f90: -------------------------------------------------------------------------------- 1 | module test 2 | implicit none 3 | contains 4 | SUBROUTINE repro() 5 | use cudafor 6 | IMPLICIT NONE 7 | integer, device, allocatable :: nh_d2(:),ityp_d2(:) 8 | integer :: na,i 9 | integer :: nat,npw,ierr 10 | integer :: np,nh_np,ih 11 | nat = 1 12 | npw = 1050 13 | allocate(ityp_d2(1:nat)) 14 | ityp_d2 = 5 15 | allocate(nh_d2(1:1000)) 16 | nh_d2 = 3 17 | !$omp target teams distribute parallel do collapse(2) 18 | DO na =1, nat 19 | DO i = 1, npw 20 | np = ityp_d2(na) 21 | nh_np = nh_d2(np) 22 | ENDDO 23 | ENDDO 24 | 25 | !$cuf kernel do(2) <<<*,*>>> 26 | DO na =1, nat 27 | DO i = 1, npw 28 | np = ityp_d2(na) 29 | nh_np = nh_d2(np) 30 | ENDDO 31 | ENDDO 32 | 33 | deallocate(ityp_d2,nh_d2) 34 | ! 
35 | END SUBROUTINE repro 36 | end module test 37 | 38 | program main 39 | use test 40 | implicit none 41 | call repro 42 | end program 43 | -------------------------------------------------------------------------------- /tests/cudafor_omp/omp_below.f90: -------------------------------------------------------------------------------- 1 | module test 2 | implicit none 3 | contains 4 | SUBROUTINE repro() 5 | use cudafor 6 | IMPLICIT NONE 7 | integer, device, allocatable :: nh_d2(:),ityp_d2(:) 8 | integer :: na,i 9 | integer :: nat,npw,ierr 10 | integer :: np,nh_np,ih 11 | nat = 1 12 | npw = 1050 13 | allocate(ityp_d2(1:nat)) 14 | ityp_d2 = 5 15 | allocate(nh_d2(1:1000)) 16 | nh_d2 = 3 17 | !$cuf kernel do(2) <<<*,*>>> 18 | DO na =1, nat 19 | DO i = 1, npw 20 | np = ityp_d2(na) 21 | nh_np = nh_d2(np) 22 | ENDDO 23 | ENDDO 24 | 25 | !$omp target teams distribute parallel do collapse(2) 26 | DO na =1, nat 27 | DO i = 1, npw 28 | np = ityp_d2(na) 29 | nh_np = nh_d2(np) 30 | ENDDO 31 | ENDDO 32 | 33 | deallocate(ityp_d2,nh_d2) 34 | ! 
35 | END SUBROUTINE repro 36 | end module test 37 | 38 | program main 39 | use test 40 | implicit none 41 | call repro 42 | end program 43 | -------------------------------------------------------------------------------- /tests/fortran_allocator/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(array_class dualspace.f90) 2 | 3 | foreach(NAME device device_isptr resize) 4 | set(FULLNAME dualspace_array_${NAME}) 5 | set(EXE_NAME f.${FULLNAME}) 6 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 7 | target_link_libraries(${EXE_NAME} PUBLIC array_class) 8 | set_target_properties(${EXE_NAME} PROPERTIES 9 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 10 | add_test(NAME ${EXE_NAME} 11 | COMMAND $ 12 | WORKING_DIRECTORY ${TESTS_BINDIR}) 13 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS fortran) 14 | endforeach() 15 | -------------------------------------------------------------------------------- /tests/fortran_allocator/dualspace.f90: -------------------------------------------------------------------------------- 1 | module dualspace 2 | use iso_c_binding 3 | use iso_fortran_env 4 | implicit none 5 | 6 | type dualspace_base_type 7 | character, dimension(:), pointer :: data_f => NULL () 8 | integer(int64) :: data_length = 0 9 | type(c_ptr) :: data_ptr = c_null_ptr 10 | contains 11 | procedure :: resize_base 12 | final :: deallocate_data 13 | end type 14 | 15 | type, extends(dualspace_base_type) :: dualspace_double_type 16 | integer :: dims = 0 17 | contains 18 | procedure :: resize => resize_double 19 | procedure :: data => data_double 20 | end type 21 | 22 | type, extends(dualspace_base_type) :: dualspace_cplx_double_type 23 | integer :: dims = 0 24 | contains 25 | procedure :: resize => resize_cplx_double 26 | procedure :: data => data_cplx_double 27 | end type 28 | 29 | private :: resize_base, allocate_data, deallocate_data 30 | private :: resize_double, resize_cplx_double 31 | private :: data_double, 
data_cplx_double 32 | contains 33 | subroutine resize_base(self, bytes) 34 | implicit none 35 | class(dualspace_base_type), intent(inout) :: self 36 | integer(int64), intent(in) :: bytes 37 | 38 | if (self%data_length .ne. bytes) then 39 | call deallocate_data(self) 40 | call allocate_data(self, bytes) 41 | endif 42 | 43 | self%data_ptr = C_LOC(self%data_f) 44 | end subroutine 45 | 46 | subroutine allocate_data(self, bytes) 47 | implicit none 48 | type(dualspace_base_type), intent(inout) :: self 49 | integer(int64), intent(in) :: bytes 50 | if (bytes > 0) then 51 | allocate(self%data_f(bytes)) 52 | !$omp target enter data map(alloc: self%data_f) 53 | write(*,*) "allocate_data size ", bytes 54 | endif 55 | self%data_length = bytes 56 | end subroutine 57 | 58 | subroutine deallocate_data(self) 59 | implicit none 60 | type(dualspace_base_type), intent(inout) :: self 61 | if (self%data_length > 0) then 62 | write(*,*) "deallocate_data size ", self%data_length 63 | !$omp target exit data map(delete: self%data_f) 64 | deallocate(self%data_f) 65 | endif 66 | self%data_length = 0 67 | end subroutine 68 | 69 | subroutine resize_double(self, num) 70 | class(dualspace_double_type), intent(inout) :: self 71 | integer, intent(in) :: num 72 | real(real64) :: dummy 73 | call self%resize_base(num * sizeof(dummy)) 74 | self%dims = num 75 | end subroutine 76 | 77 | subroutine resize_cplx_double(self, num) 78 | class(dualspace_cplx_double_type), intent(inout) :: self 79 | integer, intent(in) :: num 80 | complex(real64) :: dummy 81 | call self%resize_base(num * sizeof(dummy)) 82 | self%dims = num 83 | end subroutine 84 | 85 | function data_double(self) result(res) 86 | class(dualspace_double_type), intent(inout) :: self 87 | real(real64), dimension(:), pointer :: res 88 | 89 | call C_F_POINTER(self%data_ptr, res, shape=[self%dims]) 90 | end function 91 | 92 | function data_cplx_double(self) result(res) 93 | class(dualspace_cplx_double_type), intent(inout) :: self 94 | complex(real64), 
dimension(:), pointer :: res 95 | 96 | call C_F_POINTER(self%data_ptr, res, shape=[self%dims]) 97 | end function 98 | end module 99 | -------------------------------------------------------------------------------- /tests/fortran_allocator/dualspace_array_device.f90: -------------------------------------------------------------------------------- 1 | subroutine test 2 | use dualspace 3 | implicit none 4 | type(dualspace_double_type) :: abc 5 | real(8), dimension(:), pointer :: abc_data 6 | integer, parameter :: Ntotal = 1000 7 | integer :: i, Nsum 8 | 9 | call abc%resize(Ntotal) 10 | 11 | ! initialize values 12 | abc_data => abc%data() 13 | !$omp target teams distribute parallel do map(always, from:abc_data) 14 | do i = 1, Ntotal 15 | abc_data(i) = i 16 | enddo 17 | 18 | ! do a sum 19 | Nsum = 0 20 | !$omp target teams distribute parallel do reduction(+: Nsum) 21 | do i = 1, Ntotal 22 | Nsum = Nsum + abc_data(i) 23 | enddo 24 | 25 | write(*,*) "Nsum = ", Nsum 26 | 27 | if (Nsum /= 500500) stop 1 28 | !write(*,*) "end of subroutine" 29 | end subroutine test 30 | 31 | program main 32 | call test() 33 | !write(*,*) "end of program" 34 | end program main 35 | -------------------------------------------------------------------------------- /tests/fortran_allocator/dualspace_array_device_isptr.f90: -------------------------------------------------------------------------------- 1 | subroutine sum_on_device(array, array_size) 2 | implicit none 3 | integer, intent(in) :: array_size 4 | real(kind = 8), intent(in) :: array(1:array_size) 5 | integer :: i, Nsum 6 | 7 | ! 
do a sum 8 | Nsum = 0 9 | !$omp target teams distribute parallel do reduction(+: Nsum) has_device_addr(array) 10 | do i = 1, array_size 11 | Nsum = Nsum + array(i) 12 | enddo 13 | 14 | write(*,*) "Nsum = ", Nsum 15 | 16 | if (Nsum /= 500500) stop 1 17 | end subroutine 18 | 19 | subroutine test 20 | use dualspace 21 | implicit none 22 | type(dualspace_double_type) :: abc 23 | real(8), dimension(:), pointer :: abc_data 24 | integer, parameter :: Ntotal = 1000 25 | integer :: i 26 | 27 | call abc%resize(Ntotal) 28 | abc_data => abc%data() 29 | 30 | ! initialize values 31 | !$omp target teams distribute parallel do map(always, from:abc_data) 32 | do i = 1, Ntotal 33 | abc_data(i) = i 34 | enddo 35 | 36 | !$omp target data use_device_addr(abc_data) 37 | call sum_on_device(abc_data, size(abc_data)) 38 | !$omp end target data 39 | 40 | !write(*,*) "end of subroutine" 41 | end subroutine test 42 | 43 | program main 44 | call test() 45 | !write(*,*) "end of program" 46 | end program main 47 | -------------------------------------------------------------------------------- /tests/fortran_allocator/dualspace_array_resize.f90: -------------------------------------------------------------------------------- 1 | subroutine fill_density(density) 2 | use dualspace, only: dualspace_double_type, dualspace_cplx_double_type 3 | implicit none 4 | type(dualspace_double_type), intent(inout) :: density 5 | real(8), dimension(:), pointer :: density_data 6 | integer, parameter :: test_size =20 7 | integer :: i 8 | 9 | call density%resize(test_size) 10 | density_data => density%data() 11 | write(*,*) "density_data size ", size(density_data) 12 | do i = 1, test_size 13 | density_data(i) = 1. 14 | enddo 15 | !$omp target update to(density_data) 16 | !$omp target teams distribute parallel do map(always, from: density_data) 17 | do i = 1, test_size 18 | density_data(i) = density_data(i) + i * 1. 19 | enddo 20 | 21 | if (density_data(3).ne.4.) 
stop 1 22 | end subroutine 23 | 24 | program abc 25 | use dualspace, only: dualspace_double_type, dualspace_cplx_double_type 26 | implicit none 27 | 28 | type(dualspace_double_type) :: density, density2 29 | real(8), dimension(:), pointer :: density_data 30 | call density%resize(10) 31 | density_data => density%data() 32 | write(*,*) "density_data size ", size(density_data) 33 | 34 | call fill_density(density2) 35 | density_data => density2%data() 36 | write(*,*) "density_data(3) should be 3. Current value ", density_data(3) 37 | 38 | block 39 | type(dualspace_cplx_double_type) :: wf 40 | complex(8), dimension(:), pointer :: wf_data 41 | call wf%resize(30) 42 | wf_data => wf%data() 43 | write(*,*) "wf_data size ", size(wf_data) 44 | end block 45 | end program 46 | -------------------------------------------------------------------------------- /tests/fortran_use_device_ptr/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(FULLNAME use_device_ptr_target) 2 | set(EXE_NAME f.${FULLNAME}) 3 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 4 | set_target_properties(${EXE_NAME} PROPERTIES 5 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 6 | add_test(NAME ${EXE_NAME} 7 | COMMAND $ 8 | WORKING_DIRECTORY ${TESTS_BINDIR}) 9 | set_tests_properties(${EXE_NAME} PROPERTIES 10 | LABELS fortran) 11 | -------------------------------------------------------------------------------- /tests/fortran_use_device_ptr/use_device_ptr_target.f90: -------------------------------------------------------------------------------- 1 | program test_use_device_ptr 2 | implicit none 3 | double precision :: alpha 4 | integer, parameter :: lda = 10 5 | double precision, allocatable :: mat(:, :) 6 | 7 | allocate(mat(lda, lda)) 8 | call dgemm(lda, mat) 9 | 10 | contains 11 | subroutine dgemm(lda, a) 12 | implicit none 13 | integer :: lda 14 | double precision, target:: a(lda,lda) ! 
need target attribute to use c_loc 15 | !$omp target data use_device_addr(a) map(a) 16 | !call cublas_dgemm('T','N',M,N,K,alpha,c_loc(A),LDA,c_loc(b) +,LDB,beta,c_loc(c),LDC) 17 | !$omp end target data 18 | end subroutine 19 | end program 20 | -------------------------------------------------------------------------------- /tests/global_variable/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(global_static) 2 | add_subdirectory(constexpr) 3 | add_subdirectory(global_pointer) 4 | -------------------------------------------------------------------------------- /tests/global_variable/constexpr/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME constexpr) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/global_variable/constexpr/constexpr.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #define LENGTH 2 3 | int main() 4 | { 5 | constexpr double h_chebyshev_coefs[LENGTH] = { 0, 2.1 }; 6 | #pragma omp target enter data map(to:h_chebyshev_coefs[0:LENGTH]) 7 | #pragma omp target 8 | { 9 | printf("print in target %lf %lf\n", h_chebyshev_coefs[0], h_chebyshev_coefs[1]); 10 | } 11 | return 0; 12 | } 13 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX AND CXX_OFFLOAD_RUNTIME_OKAY) 2 | set(FULLNAME global_pointer) 3 | 
set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} main.cpp global.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/Makefile: -------------------------------------------------------------------------------- 1 | MPILER = amd 2 | 3 | ifeq ($(COMPILER),intel) 4 | CC = icpx 5 | CFLAGS = -Wall -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ 6 | endif 7 | 8 | ifeq ($(COMPILER),ibm) 9 | CC = xlc_r 10 | CFLAGS = -Wall -qsmp=omp -qoffload 11 | endif 12 | 13 | ifeq ($(COMPILER),llvm) 14 | CC = clang++ 15 | CFLAGS = -Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 -g 16 | endif 17 | 18 | ifeq ($(COMPILER),amd) 19 | CC = clang++ 20 | CFLAGS = -Wall -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 21 | endif 22 | 23 | program = test 24 | 25 | source = main.cpp global.cpp 26 | 27 | obj = $(source:.cpp=.o) 28 | 29 | deps = Makefile 30 | 31 | $(program): $(obj) $(deps) 32 | $(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS) 33 | 34 | %.o: %.cpp $(deps) 35 | $(CC) $(CFLAGS) -c $< -o $@ 36 | 37 | clean: 38 | rm -rf $(program) $(obj) 39 | 40 | edit: 41 | vim -p $(source) $(deps) 42 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/global.cpp: -------------------------------------------------------------------------------- 1 | #include "global.h" 2 | 3 | #pragma omp declare target 4 | int * device_arr; 5 | #pragma omp end declare target 6 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/global.h: 
-------------------------------------------------------------------------------- 1 | #ifndef GLOBAL_H 2 | #define GLOBAL_H 3 | 4 | #pragma omp declare target 5 | extern int * device_arr; 6 | #pragma omp end declare target 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /tests/global_variable/global_pointer/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "global.h" 5 | 6 | void foo(int i) 7 | { 8 | device_arr[i] *= 2; 9 | } 10 | 11 | int main(void) 12 | { 13 | int host_id = omp_get_initial_device(); 14 | int device_id = omp_get_default_device(); 15 | 16 | int N = 5; 17 | 18 | // Allocate and initialize host array 19 | size_t sz = N * sizeof(int); 20 | int * host_arr = (int *) malloc(sz); 21 | for( int i = 0; i < N; i++ ) 22 | { 23 | host_arr[i] = i; 24 | } 25 | 26 | // Allocate device array and copy data from host -> device 27 | device_arr = (int *) omp_target_alloc(sz, device_id); 28 | omp_target_memcpy(device_arr, host_arr, sz, 0, 0, device_id, host_id); 29 | #pragma omp target update to(device_arr) 30 | 31 | // Execute device kernel 32 | #pragma omp target teams distribute parallel for 33 | for( int i = 0; i < N; i++) 34 | { 35 | foo(i); 36 | } 37 | 38 | // Copy data from device -> host 39 | omp_target_memcpy(host_arr, device_arr, sz, 0, 0, host_id, device_id); 40 | 41 | // Return non-zero error code if we failed 42 | return host_arr[4] != 8; 43 | } 44 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME global_static) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} main.cpp data.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME 
${EXE_NAME} 8 | COMMAND $ 9 | WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/Makefile: -------------------------------------------------------------------------------- 1 | CXX=icpx 2 | OFFLOAD_FLAGS=-fiopenmp -fopenmp-targets=spir64 3 | 4 | .PHONY: clean 5 | 6 | a.out: main.cpp data.o 7 | ${CXX} ${OFFLOAD_FLAGS} main.cpp data.o 8 | data.o: data.cpp data.hpp 9 | ${CXX} ${OFFLOAD_FLAGS} -c data.cpp 10 | 11 | clean: 12 | rm data.o a.out 13 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/data.cpp: -------------------------------------------------------------------------------- 1 | #include "data.hpp" 2 | 3 | #pragma omp declare target 4 | template <> 5 | const float engine::params[4] = {1.0f, 2.0f, 3.0f, 4.0f}; 6 | 7 | template <> 8 | const double engine::params[4] = {1.0, 2.0, 3.0, 4.0}; 9 | #pragma omp end declare target 10 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/data.hpp: -------------------------------------------------------------------------------- 1 | template 2 | class engine 3 | { 4 | public: 5 | static const T params[4]; 6 | }; 7 | 8 | template <> 9 | const float engine::params[4]; 10 | 11 | template <> 12 | const double engine::params[4]; 13 | -------------------------------------------------------------------------------- /tests/global_variable/global_static/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "data.hpp" 3 | 4 | template 5 | void check() 6 | { 7 | T params_check[4]; 8 | engine mine; 9 | 10 | #pragma omp target map(from:params_check[:4]) 11 | { 12 | for(int i=0; i<4; i++) 13 | params_check[i] = mine.params[i]; 14 | } 15 | assert(params_check[0] == 
T(1.0)); 16 | assert(params_check[1] == T(2.0)); 17 | assert(params_check[2] == T(3.0)); 18 | assert(params_check[3] == T(4.0)); 19 | } 20 | 21 | int main() 22 | { 23 | check(); 24 | check(); 25 | } 26 | -------------------------------------------------------------------------------- /tests/implict_async/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX AND CMAKE_CXX_COMPILER_ID MATCHES "Clang") 2 | foreach(NAME llvm_alloc_host llvm_alloc_host_data) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/implict_async/llvm_alloc_host.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #ifndef __clang_major__ 5 | #error Need clang extension 6 | #endif 7 | 8 | // expose header free extensions 9 | extern "C" { 10 | void *llvm_omp_target_alloc_host(size_t, int); 11 | void llvm_omp_target_free_host(void *, int); 12 | } 13 | 14 | int main() { 15 | const int N = 64; 16 | const auto default_device = omp_get_default_device(); 17 | #pragma omp target device(default_device) 18 | { int a = N; } 19 | 20 | int* hst_ptr = (int*) llvm_omp_target_alloc_host(N * sizeof(int), default_device); 21 | 22 | for (int i = 0; i < N; ++i) 23 | hst_ptr[i] = 2; 24 | 25 | #pragma omp target teams distribute parallel for device(default_device) map(tofrom : hst_ptr[: N]) 26 | for (int i = 0; i < N; ++i) 27 | hst_ptr[i] -= 1; 28 | 29 | int sum = 0; 30 | for (int i = 0; i < N; ++i) 31 | sum += hst_ptr[i]; 32 | 33 | llvm_omp_target_free_host(hst_ptr, 
default_device); 34 | 35 | if (sum == N) 36 | std::cout << "Correct Sum" << std::endl; 37 | else 38 | std::cout << "Wrong Sum " << sum << "! It should be " << N << std::endl; 39 | 40 | return 0; 41 | } 42 | 43 | -------------------------------------------------------------------------------- /tests/implict_async/llvm_alloc_host_data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #ifndef __clang_major__ 5 | #error Need clang extension 6 | #endif 7 | 8 | // expose header free extensions 9 | extern "C" { 10 | void *llvm_omp_target_alloc_host(size_t, int); 11 | void llvm_omp_target_free_host(void *, int); 12 | } 13 | 14 | int main() { 15 | const int N = 64; 16 | const auto default_device = omp_get_default_device(); 17 | #pragma omp target device(default_device) 18 | { int a = N; } 19 | 20 | int* hst_ptr = (int*) llvm_omp_target_alloc_host(N * sizeof(int), default_device); 21 | #pragma omp target enter data device(default_device) map(alloc : hst_ptr[: N]) 22 | 23 | for (int i = 0; i < N; ++i) 24 | hst_ptr[i] = 2; 25 | 26 | #pragma omp target teams distribute parallel for device(default_device) map(always, tofrom : hst_ptr[: N]) 27 | for (int i = 0; i < N; ++i) 28 | hst_ptr[i] -= 1; 29 | 30 | int sum = 0; 31 | for (int i = 0; i < N; ++i) 32 | sum += hst_ptr[i]; 33 | 34 | #pragma omp target exit data device(default_device) map(delete : hst_ptr[: N]) 35 | llvm_omp_target_free_host(hst_ptr, default_device); 36 | 37 | if (sum == N) 38 | std::cout << "Correct Sum" << std::endl; 39 | else 40 | std::cout << "Wrong Sum " << sum << "! 
It should be " << N << std::endl; 41 | 42 | return 0; 43 | } 44 | 45 | -------------------------------------------------------------------------------- /tests/linking/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(link_static_fat_bin) 2 | add_subdirectory(linker_outlined_func) 3 | add_subdirectory(two_identical_templates) 4 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME link_static_fat_bin) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_library(mylib classA.cpp) 5 | add_executable(${EXE_NAME} main.cpp) 6 | target_link_libraries(${EXE_NAME} mylib) 7 | set_target_properties(${EXE_NAME} PROPERTIES 8 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 9 | add_test(NAME ${EXE_NAME} 10 | COMMAND $ 11 | WORKING_DIRECTORY ${TESTS_BINDIR}) 12 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/classA.cpp: -------------------------------------------------------------------------------- 1 | #include "classA.h" 2 | 3 | template 4 | void tester::run() 5 | { 6 | #pragma omp target 7 | { 8 | T a; 9 | } 10 | } 11 | 12 | template class tester; 13 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/classA.h: -------------------------------------------------------------------------------- 1 | template 2 | class tester 3 | { 4 | public: 5 | void run(); 6 | }; 7 | 8 | extern template class tester; 9 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/compile-amd.sh: -------------------------------------------------------------------------------- 1 | clang++ 
-fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 -c classA.cpp 2 | rm -f libmy.a 3 | llvm-ar qc libmy.a classA.o 4 | llvm-ranlib libmy.a 5 | clang++ -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 main.cpp libmy.a 6 | ./a.out 7 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/compile-x86.sh: -------------------------------------------------------------------------------- 1 | clang++ -fopenmp -fopenmp-targets=x86_64-pc-linux-gnu -c classA.cpp 2 | rm -f libmy.a 3 | ar qc libmy.a classA.o 4 | ranlib libmy.a 5 | clang++ -fopenmp -fopenmp-targets=x86_64-pc-linux-gnu main.cpp -L. -lmy 6 | ./a.out 7 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/compile.sh: -------------------------------------------------------------------------------- 1 | clang++ -fopenmp -fopenmp-targets=nvptx64 -c classA.cpp 2 | rm -f libmy.a 3 | ar qc libmy.a classA.o 4 | ranlib libmy.a 5 | clang++ -fopenmp -fopenmp-targets=nvptx64 main.cpp libmy.a 6 | ./a.out 7 | -------------------------------------------------------------------------------- /tests/linking/link_static_fat_bin/main.cpp: -------------------------------------------------------------------------------- 1 | #include "classA.h" 2 | 3 | int main() 4 | { 5 | tester A; 6 | A.run(); 7 | return 0; 8 | } 9 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME linker_outlined_function_collision) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} main.cpp a.cpp b.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | 
WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/a.cpp: -------------------------------------------------------------------------------- 1 | #include "ab.h" 2 | #include "compute.h" 3 | void a() 4 | { 5 | const int N = 1000; 6 | #pragma omp target 7 | { 8 | float A[N]; 9 | compute(A, N); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/ab.h: -------------------------------------------------------------------------------- 1 | void a(); 2 | void b(); 3 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/b.cpp: -------------------------------------------------------------------------------- 1 | #include "ab.h" 2 | #include "compute.h" 3 | void b() 4 | { 5 | const int N = 1000; 6 | #pragma omp target 7 | { 8 | float A[N]; 9 | compute(A, N); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/compile.sh: -------------------------------------------------------------------------------- 1 | CXX=xlC_r 2 | CXX_FLAGS="-qsmp=omp -qoffload" 3 | 4 | $CXX $CXX_FLAGS -c a.cpp 5 | $CXX $CXX_FLAGS -c b.cpp 6 | $CXX $CXX_FLAGS main.cpp a.o b.o 7 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/compute.h: -------------------------------------------------------------------------------- 1 | inline void compute(float* a, int size) 2 | { 3 | #pragma omp parallel for 4 | for (int i = 0; i < size; i++) 5 | a[i] *= 2.0f; 6 | } 7 | -------------------------------------------------------------------------------- /tests/linking/linker_outlined_func/main.cpp: 
-------------------------------------------------------------------------------- 1 | #include "ab.h" 2 | int main() 3 | { 4 | a(); 5 | b(); 6 | return 0; 7 | } 8 | -------------------------------------------------------------------------------- /tests/linking/missing_bundles/boo.cpp: -------------------------------------------------------------------------------- 1 | int boo() 2 | { return 0; } 3 | -------------------------------------------------------------------------------- /tests/linking/missing_bundles/compile.sh: -------------------------------------------------------------------------------- 1 | clang++ -fopenmp -fopenmp-targets=nvptx64 -c foo.cpp 2 | clang++ -fopenmp -c boo.cpp 3 | clang++ -fopenmp -fopenmp-targets=nvptx64 main.cpp boo.o foo.o 4 | #nvlink fatal : Could not open input file '/tmp/foo-7e0588.cubin' 5 | #clang-14: error: nvlink command failed with exit code 1 (use -v to see invocation) 6 | -------------------------------------------------------------------------------- /tests/linking/missing_bundles/foo.cpp: -------------------------------------------------------------------------------- 1 | int foo() 2 | { return 0; } 3 | -------------------------------------------------------------------------------- /tests/linking/missing_bundles/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int foo(); 4 | int boo(); 5 | 6 | int main() 7 | { 8 | foo(); 9 | boo(); 10 | } 11 | -------------------------------------------------------------------------------- /tests/linking/two_identical_templates/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | set(FULLNAME linker_identical_template) 3 | set(EXE_NAME cxx.${FULLNAME}) 4 | add_executable(${EXE_NAME} main.cpp test_a.cpp test_b.cpp) 5 | set_target_properties(${EXE_NAME} PROPERTIES 6 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 7 | add_test(NAME ${EXE_NAME} 8 | COMMAND $ 9 | 
WORKING_DIRECTORY ${TESTS_BINDIR}) 10 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 11 | endif() 12 | -------------------------------------------------------------------------------- /tests/linking/two_identical_templates/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void test_a(); 5 | void test_b(); 6 | 7 | int main() 8 | { 9 | test_a(); 10 | test_b(); 11 | } 12 | -------------------------------------------------------------------------------- /tests/linking/two_identical_templates/test_a.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | void test_map() 6 | { 7 | std::cout << "map(complex<>)" << std::endl; 8 | std::complex a(0.2, 1), a_check; 9 | #pragma omp target map(from : a_check) 10 | { 11 | a_check = a; 12 | } 13 | } 14 | 15 | void test_a() 16 | { 17 | test_map(); 18 | } 19 | -------------------------------------------------------------------------------- /tests/linking/two_identical_templates/test_b.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | void test_map() 6 | { 7 | std::cout << "map(complex<>)" << std::endl; 8 | std::complex a(0.2, 1), a_check; 9 | #pragma omp target map(from : a_check) 10 | { 11 | a_check = a; 12 | } 13 | } 14 | 15 | void test_b() 16 | { 17 | test_map(); 18 | } 19 | -------------------------------------------------------------------------------- /tests/map/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX AND CXX_OFFLOAD_RUNTIME_OKAY) 2 | foreach(NAME pointer_api this_with_virtual struct_with_const first_private_this_wrong) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.map_${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY 
${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | 15 | if (ENABLE_Fortran) 16 | foreach(NAME implicit_map_alloc) 17 | set(FULLNAME ${NAME}) 18 | set(EXE_NAME f.map_${FULLNAME}) 19 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 20 | target_link_libraries(${EXE_NAME} dummy_openmp_runtime) 21 | set_target_properties(${EXE_NAME} PROPERTIES 22 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 23 | add_test(NAME ${EXE_NAME} 24 | COMMAND $ 25 | WORKING_DIRECTORY ${TESTS_BINDIR}) 26 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS fortran) 27 | endforeach() 28 | endif() 29 | -------------------------------------------------------------------------------- /tests/map/check_transfer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() 4 | { 5 | int in = 1; 6 | int out = 0; 7 | int n = 2; 8 | for(int i = 0; i < n; i++) 9 | { 10 | #pragma omp target map(from:out) 11 | { 12 | out = in * 2; 13 | } 14 | } 15 | assert( out == in * 2 ); 16 | } 17 | -------------------------------------------------------------------------------- /tests/map/declare_target_global.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #pragma omp declare target 6 | int * arr; 7 | 8 | void foo(int i) 9 | { 10 | printf("device address %d %p\n", i, arr); 11 | } 12 | #pragma omp end declare target 13 | 14 | int main(void) 15 | { 16 | // Allocate array and set to zero 17 | int len = 3; 18 | arr = (int *) calloc( len, sizeof(int) ); 19 | 20 | printf("arr omp_target_is_present %d\n", omp_target_is_present(arr, 0)); 21 | #pragma omp target data use_device_ptr(arr) 22 | { 23 | printf("arr initial device address %p\n", arr); 24 | } 25 | 26 | #pragma omp target data map(tofrom: arr[:len]) 27 | { 28 | printf("arr host address %p\n", 
arr); 29 | printf("arr omp_target_is_present %d\n", omp_target_is_present(arr, 0)); 30 | #pragma omp target data use_device_ptr(arr) 31 | { 32 | printf("arr device address inside map %p\n", arr); 33 | } 34 | 35 | #pragma omp target teams distribute parallel for 36 | for( int i = 0; i < len; i++) 37 | foo(i); 38 | } 39 | 40 | printf("arr omp_target_is_present %d\n", omp_target_is_present(arr, 0)); 41 | #pragma omp target data use_device_ptr(arr) 42 | { 43 | printf("arr final device address %p\n", arr); 44 | } 45 | 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /tests/map/first_private_this_wrong.cpp: -------------------------------------------------------------------------------- 1 | //#include 2 | #include 3 | 4 | template struct base { 5 | T abc[20]; 6 | T de[200]; 7 | T compute() { return de[0]; } 8 | }; 9 | 10 | template struct foo : protected base { 11 | foo() { 12 | #pragma omp target enter data map(to : this[:1]) 13 | } 14 | ~foo() { 15 | #pragma omp target exit data map(delete : this[:1]) 16 | } 17 | 18 | void target_compute() { 19 | //std::cout << " ***** is this[:1] mapped? 
" << omp_target_is_present(this, omp_get_default_device()) << std::endl; 20 | #pragma omp target teams 21 | #pragma omp parallel 22 | { T a = base::compute(); } 23 | } 24 | }; 25 | 26 | int main() { 27 | foo a; 28 | a.target_compute(); 29 | } 30 | -------------------------------------------------------------------------------- /tests/map/implicit_map_alloc.f90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | type foobar 4 | real(8), dimension(:), pointer :: foo, bar 5 | end type 6 | type(foobar) :: this 7 | integer, parameter :: n = 1024 8 | integer :: i 9 | real(8), dimension(:), pointer :: bar_ptr 10 | 11 | allocate(this%foo(n), this%bar(n)) 12 | this%foo = 1d0 13 | !$omp target enter data map(to:this%foo) map(alloc:this%bar) 14 | 15 | bar_ptr => this%bar 16 | !$omp target teams distribute parallel do 17 | do i = 1,n 18 | bar_ptr(i) = 3d0 19 | enddo 20 | !$omp end target teams distribute parallel do 21 | 22 | !$omp target update from(this%bar) 23 | 24 | if (all(this%foo < this%bar)) then 25 | print *,"Success!" 
26 | else 27 | write(*,*) this%foo(3), this%bar(1), this%bar(2) 28 | stop 1 29 | endif 30 | 31 | end program main 32 | -------------------------------------------------------------------------------- /tests/map/map_class_member.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template 4 | struct maptest 5 | { 6 | constexpr static int size = 6; 7 | T data[size]; 8 | 9 | maptest() 10 | { 11 | std::cout << "before enter data\n"; 12 | #pragma omp target enter data map(alloc:data[:6]) 13 | std::cout << "done with enter data\n"; 14 | } 15 | 16 | ~maptest() 17 | { 18 | std::cout << "before exit data\n"; 19 | #pragma omp target exit data map(delete:data[:6]) 20 | std::cout << "done with exit data\n"; 21 | } 22 | }; 23 | 24 | int main() 25 | { 26 | maptest a; 27 | } 28 | -------------------------------------------------------------------------------- /tests/map/map_delete_inside_data.cpp: -------------------------------------------------------------------------------- 1 | int main() 2 | { 3 | int a[100]; 4 | #pragma omp target enter data map(alloc:a) 5 | #pragma omp target data map(alloc:a) 6 | { 7 | #pragma omp target exit data map(delete:a) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /tests/map/map_threads.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | const int N = 200000000; 7 | std::vector vec(N, 1.1); 8 | float* vec_ptr = vec.data(); 9 | for (int it = 0; it < 3; it++) 10 | { 11 | printf("\niteration %d\n", it); 12 | #pragma omp parallel for 13 | for (int i = 0; i < 4; i++) 14 | { 15 | // first hit does the transfer, others start executing the kernel. 
16 | #pragma omp target map(to:vec_ptr[:vec.size()]) 17 | { 18 | printf("tid %d value = %f\n", i, vec_ptr[N - 1 - i]); 19 | } 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tests/map/pointer_api.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | int a[1000]; 7 | std::cout << "before enter data mapped? " << omp_target_is_present(a, omp_get_default_device()) << std::endl; 8 | #pragma omp target enter data map(alloc:a[:1000]) 9 | std::cout << "after enter data mapped? " << omp_target_is_present(a, omp_get_default_device()) << std::endl; 10 | std::cout << "&a[0] mapped? " << omp_target_is_present(&a[0], omp_get_default_device()) << std::endl; 11 | std::cout << "&a[50] mapped? " << omp_target_is_present(&a[50], omp_get_default_device()) << std::endl; 12 | std::cout << "&a[999] mapped? " << omp_target_is_present(&a[999], omp_get_default_device()) << std::endl; 13 | std::cout << "&a[1000] mapped? 
" << omp_target_is_present(&a[1000], omp_get_default_device()) << std::endl; 14 | 15 | int* a_ptr = a; 16 | int* b_ptr = a + 200; 17 | 18 | std::cout << "host pointer " << std::endl 19 | << "a = " << a_ptr << std::endl 20 | << "b = " << b_ptr << std::endl; 21 | 22 | #pragma omp target data use_device_ptr(a_ptr, b_ptr) 23 | { 24 | std::cout << "device pointer " << std::endl 25 | << "a = " << a_ptr << std::endl 26 | << "b = " << b_ptr << std::endl; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/map/struct_with_const.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | struct with_const 6 | { 7 | with_const() 8 | { 9 | #pragma omp target enter data map(to:this[:1]) 10 | } 11 | 12 | constexpr static int size = 6; 13 | static const int b = 12; 14 | std::string name; 15 | }; 16 | 17 | int main() 18 | { 19 | with_const foo; 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /tests/map/this_with_virtual.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | class Base 5 | { 6 | public: 7 | virtual void foo() const = 0; 8 | }; 9 | 10 | class Derived: public Base 11 | { 12 | const int const_value = 8; 13 | 14 | public: 15 | Derived() 16 | { 17 | #pragma omp target enter data map(to: this[:1]) 18 | } 19 | 20 | ~Derived() 21 | { 22 | #pragma omp target exit data map(delete: this[:1]) 23 | } 24 | 25 | void foo() const override {} 26 | 27 | int boo() 28 | { 29 | int res = 0; 30 | #pragma omp target map(from:res) 31 | { 32 | res = const_value; 33 | } 34 | return res; 35 | } 36 | }; 37 | 38 | int main() 39 | { 40 | Derived a; 41 | const int res = a.boo(); 42 | std::cout << "return value " << res << " reference value " << 8 << std::endl; 43 | assert(res == 8); 44 | return 0; 45 | } 46 | 
-------------------------------------------------------------------------------- /tests/math/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME FP_ZERO header_only modf modf_team sqrt_simd sin_cos sin_simd sincos sincos_simd sincos_simd_template modf_in_branch) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.math_${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/math/FP_ZERO.cpp: -------------------------------------------------------------------------------- 1 | //===--- qmcpack_target_math.c --- math lib invocation inside target---------===// 2 | // 3 | // OpenMP API Version 4.5 Nov 2015 4 | // 5 | // 6 | ////===----------------------------------------------------------------------===// 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #define N 1000 13 | bool failed = false; 14 | 15 | void test_math_lib_inside_target() { 16 | 17 | double array[N]; 18 | int errors = 0; 19 | 20 | // Array initialization 21 | for (int i = 0; i < N; ++i) { 22 | array[i] = 0.99; 23 | } 24 | 25 | int c99_zero = FP_ZERO; 26 | 27 | #pragma omp target map(tofrom: array[0:N]) 28 | for (int i = 0; i < N; ++i) { 29 | array[i] = pow((double)i,2.0); 30 | } 31 | 32 | for (int i = 0; i < N; ++i) { 33 | if(fabs(array[i] - pow((double)i,2)) >= 0.000009) 34 | { 35 | std::cout << "failed array[" << i << "] " << array[i] << " ref " << pow((double)i,2) << std::endl; 36 | failed = true; 37 | } 38 | } 39 | } 40 | 41 | int main() { 42 | test_math_lib_inside_target(); 43 | return failed; 44 | } 45 | 
-------------------------------------------------------------------------------- /tests/math/README: -------------------------------------------------------------------------------- 1 | All three cases work if only calling C functions directly by adding -DC_ONLY flag. 2 | clang++ -fopenmp -fopenmp-targets=nvptx64 -DC_ONLY sincos.cpp 3 | 4 | When the code becomes more C++, 5 | 6 | $ clang++ -fopenmp -fopenmp-targets=nvptx64 sincos.cpp 7 | nvlink error : Undefined reference to '_ZL6sincosdPdS_' in '/tmp/sincos-b90610.cubin' 8 | clang-11: error: nvlink command failed with exit code 255 (use -v to see invocation) 9 | 10 | $ clang++ -fopenmp -fopenmp-targets=nvptx64 sin_cos.cpp 11 | fatal error: error in backend: Cannot select: t11: f32 = fsin t10 12 | t10: f32,ch = load<(dereferenceable load 4 from %ir.__x.addr)> t9, FrameIndex:i64<0>, undef:i64 13 | t8: i64 = FrameIndex<0> 14 | t3: i64 = undef 15 | In function: _ZSt3sinf 16 | clang-11: error: clang frontend command failed with exit code 70 (use -v to see invocation) 17 | 18 | $ clang++ -fopenmp -fopenmp-targets=nvptx64 modf.cpp 19 | nvlink error : Undefined reference to '_ZL4modfdPd' in '/tmp/modf-796c89.cubin' 20 | nvlink error : Undefined reference to 'modff' in '/tmp/modf-796c89.cubin' 21 | clang-11: error: nvlink command failed with exit code 255 (use -v to see invocation) 22 | -------------------------------------------------------------------------------- /tests/math/header_only.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | int main() 4 | { 5 | printf("SUCCESS\n"); 6 | return 0; 7 | } 8 | -------------------------------------------------------------------------------- /tests/math/modf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template 4 | void test_modf(T x) 5 | { 6 | T dx; 7 | int intx; 8 | 9 | #pragma omp target map(from: intx, dx) 10 | { 11 | T ipart; 12 | dx = std::modf(x, 
&ipart); 13 | intx = static_cast(ipart); 14 | } 15 | } 16 | 17 | int main() 18 | { 19 | 20 | #if !defined(C_ONLY) 21 | test_modf(1.0); 22 | test_modf(1.0); 23 | #endif 24 | 25 | #pragma omp target 26 | { 27 | double intpart, res; 28 | res = modf(1.1, &intpart); 29 | } 30 | 31 | #pragma omp target 32 | { 33 | float intpart, res; 34 | res = modff(1.1f, &intpart); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /tests/math/modf_in_branch.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | template 6 | int foo(T r, T DeltaRInv) 7 | { 8 | r *= DeltaRInv; 9 | T ipart; 10 | //printf("modf input %lf ptr %p \n", r, &ipart); 11 | const T t = std::modf(r, &ipart); 12 | const int i = (int)ipart; 13 | return i; 14 | } 15 | 16 | int main() 17 | { 18 | int arr[20]; 19 | #pragma omp target teams distribute map(arr) 20 | for(int i = 0; i < 2; i++) 21 | { 22 | double r = 1.3; 23 | double DeltaRInv = 0.3; 24 | #pragma omp parallel for 25 | for(int j = 0; j < 10; j++) 26 | { 27 | if (r * j > 5) 28 | arr[i*10 + j] = foo(r * j, DeltaRInv); 29 | else 30 | arr[i*10 + j] = 0; 31 | } 32 | } 33 | 34 | for(int i = 0; i < 2; i++) 35 | { 36 | double r = 1.3; 37 | double DeltaRInv = 0.3; 38 | for(int j = 0; j < 10; j++) 39 | if (r * j > 5) 40 | { 41 | if (arr[i*10 + j] != foo(r * j, DeltaRInv)) throw std::runtime_error("Wrong foo return value!"); 42 | } 43 | else 44 | { 45 | if (arr[i*10 + j] != 0) throw std::runtime_error("should be zero!"); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/math/modf_team.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | template 6 | inline void getSplineBound(T x, TRESIDUAL& dx, int& ind, int nmax) 7 | { 8 | // lower bound 9 | if (x < 0) 10 | { 11 | ind = 0; 12 | dx = T(0); 13 | } 
14 | else 15 | { 16 | #if defined(USE_FLOOR) 17 | T ipart = std::floor(x); 18 | dx = x - ipart; 19 | #else 20 | T ipart; 21 | dx = std::modf(x, &ipart); 22 | #endif 23 | ind = static_cast(ipart); 24 | // upper bound 25 | if (ind > nmax) 26 | { 27 | ind = nmax; 28 | dx = T(1) - std::numeric_limits::epsilon(); 29 | } 30 | } 31 | } 32 | 33 | int main() 34 | { 35 | using T = float; 36 | T x = 1.25; 37 | T dx = 0; 38 | int ind = 0; 39 | #pragma omp target map(from : dx, ind) 40 | { 41 | getSplineBound(x, dx, ind, 10); 42 | } 43 | 44 | if (x != T(dx + ind)) 45 | { 46 | std::cout << "Error x = " << x << " dx = " << dx << " ind " << ind << std::endl; 47 | return 1; 48 | } 49 | std::cout << "omp target passed!" << std::endl; 50 | 51 | constexpr int N = 100; 52 | T x_arr[N]; 53 | T dx_arr[N]; 54 | int ind_arr[N]; 55 | for (int i = 0; i < N; i++) 56 | x_arr[i] = (i + 1) * 0.25; 57 | 58 | #pragma omp target teams distribute map(to : x_arr[:N]) map(from : dx_arr[:N], ind_arr[:N]) 59 | for (int i = 0; i < N; i++) 60 | getSplineBound(x_arr[i], dx_arr[i], ind_arr[i], 24); 61 | 62 | for (int i = 0; i < N - 1; i++) 63 | if (x_arr[i] != T(dx_arr[i] + ind_arr[i])) 64 | { 65 | std::cout << "Error team = " << i << " x = " << x_arr[i] << " dx = " << dx_arr[i] << " ind " << ind_arr[i] 66 | << std::endl; 67 | return 1; 68 | } 69 | 70 | //special case 71 | { 72 | const int i = N - 1; 73 | if (ind_arr[i] != 24) 74 | { 75 | std::cout << "Error team = " << i << " x = " << x_arr[i] << " dx = " << dx_arr[i] << " ind " << ind_arr[i] 76 | << std::endl; 77 | return 1; 78 | } 79 | } 80 | std::cout << "omp target teams distribute passed!" 
<< std::endl; 81 | } 82 | -------------------------------------------------------------------------------- /tests/math/sin_cos.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | template 7 | void test_sin_cos(T x) 8 | { 9 | T res_sin, res_cos; 10 | 11 | #pragma omp target map(from: res_sin, res_cos) 12 | { 13 | res_sin = std::sin(x); 14 | res_cos = std::cos(x); 15 | } 16 | 17 | if (res_sin != std::sin(x)) 18 | { 19 | std::cout << "sincos sin part " << res_sin << " std::sin " << std::sin(x) << std::endl; 20 | failed = true; 21 | } 22 | if (res_cos != std::cos(x)) 23 | { 24 | std::cout << "sincos cos part " << res_cos << " std::cos " << std::cos(x) << std::endl; 25 | failed = true; 26 | } 27 | } 28 | 29 | int main() 30 | { 31 | 32 | #if !defined(C_ONLY) 33 | test_sin_cos(0.0); 34 | test_sin_cos(0.0); 35 | #endif 36 | 37 | #pragma omp target 38 | { 39 | double res; 40 | res = sin(1.0); 41 | } 42 | 43 | #pragma omp target 44 | { 45 | float res; 46 | res = sinf(1.0f); 47 | } 48 | 49 | return failed; 50 | } 51 | -------------------------------------------------------------------------------- /tests/math/sin_simd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | constexpr size_t N = 128; 8 | 9 | template 10 | void test_sin_simd() 11 | { 12 | T phase[N], sinval[N]; 13 | phase[0] = 0.0; 14 | phase[1] = 0.1; 15 | phase[2] = 0.2; 16 | phase[3] = 0.3; 17 | 18 | #pragma omp simd 19 | for(int i = 0; i < N; i++) 20 | { 21 | sinval[i] = std::sin(phase[i]); 22 | //std::cout << std::setprecision(14) << sinval[i] << std::endl; 23 | } 24 | 25 | std::cout << std::setprecision(14); 26 | std::cout << "sinval[0] " << sinval[0] << " ref " << 0 << std::endl; 27 | std::cout << "sinval[1] " << sinval[1] << " ref " << 0.099833416646828 << std::endl; 28 | std::cout << "sinval[2] " << 
sinval[2] << " ref " << 0.19866933079506 << std::endl; 29 | std::cout << "sinval[3] " << sinval[3] << " ref " << 0.29552020666134 << std::endl; 30 | assert( std::fabs(sinval[0]) < 1e-6); 31 | assert( std::fabs(sinval[1] - 0.099833416646828) < 1e-6); 32 | assert( std::fabs(sinval[2] - 0.19866933079506) < 1e-6); 33 | assert( std::fabs(sinval[3] - 0.29552020666134) < 1e-6); 34 | } 35 | 36 | int main() 37 | { 38 | test_sin_simd(); 39 | test_sin_simd(); 40 | } 41 | -------------------------------------------------------------------------------- /tests/math/sincos.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | bool failed = false; 5 | 6 | // single precision wrapper 7 | inline void sincos(float x, float* __restrict__ sin, float* __restrict__ cos) 8 | { 9 | sincosf(x, sin, cos); 10 | } 11 | 12 | template 13 | void test_sincos(T x) 14 | { 15 | T res_sin, res_cos; 16 | 17 | #pragma omp target map(from: res_sin, res_cos) 18 | { 19 | sincos(x, &res_sin, &res_cos); 20 | } 21 | 22 | if (res_sin != std::sin(x)) 23 | { 24 | std::cout << "sincos sin part " << res_sin << " std::sin " << std::sin(x) << std::endl; 25 | failed = true; 26 | } 27 | if (res_cos != std::cos(x)) 28 | { 29 | std::cout << "sincos cos part " << res_cos << " std::cos " << std::cos(x) << std::endl; 30 | failed = true; 31 | } 32 | } 33 | 34 | int main(int argc, char **argv) 35 | { 36 | 37 | #if !defined(C_ONLY) 38 | test_sincos(0.0); 39 | test_sincos(0.0); 40 | #endif 41 | 42 | #pragma omp target 43 | { 44 | double s, c; 45 | sincos(0, &s, &c); 46 | } 47 | 48 | #pragma omp target 49 | { 50 | float s, c; 51 | sincosf(0.f, &s, &c); 52 | } 53 | 54 | return failed; 55 | } 56 | -------------------------------------------------------------------------------- /tests/math/sincos_simd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | 
constexpr size_t N = 128; 8 | 9 | int main() 10 | { 11 | double phase[N], sinval[N], cosval[N]; 12 | phase[0] = 0.0; 13 | phase[1] = 0.1; 14 | phase[2] = 0.2; 15 | phase[3] = 0.3; 16 | 17 | #pragma omp simd 18 | for(int i = 0; i < N; i++) 19 | { 20 | sincos(phase[i], &sinval[i], &cosval[i]); 21 | //std::cout << std::setprecision(14) << sinval[i] << " " << cosval[i] << std::endl; 22 | } 23 | 24 | std::cout << std::setprecision(14); 25 | std::cout << "sinval[0] " << sinval[0] << " ref " << 0 << std::endl; 26 | std::cout << "sinval[1] " << sinval[1] << " ref " << 0.099833416646828 << std::endl; 27 | std::cout << "sinval[2] " << sinval[2] << " ref " << 0.19866933079506 << std::endl; 28 | std::cout << "sinval[3] " << sinval[3] << " ref " << 0.29552020666134 << std::endl; 29 | std::cout << "cosval[0] " << cosval[0] << " ref " << 1 << std::endl; 30 | std::cout << "cosval[1] " << cosval[1] << " ref " << 0.99500416527803 << std::endl; 31 | std::cout << "cosval[2] " << cosval[2] << " ref " << 0.98006657784124 << std::endl; 32 | std::cout << "cosval[3] " << cosval[3] << " ref " << 0.95533648912561 << std::endl; 33 | assert( std::fabs(sinval[0]) < 1e-6); 34 | assert( std::fabs(sinval[1] - 0.099833416646828) < 1e-6); 35 | assert( std::fabs(sinval[2] - 0.19866933079506) < 1e-6); 36 | assert( std::fabs(sinval[3] - 0.29552020666134) < 1e-6); 37 | assert( std::fabs(cosval[0] - 1) < 1e-6); 38 | assert( std::fabs(cosval[1] - 0.99500416527803) < 1e-6); 39 | assert( std::fabs(cosval[2] - 0.98006657784124) < 1e-6); 40 | assert( std::fabs(cosval[3] - 0.95533648912561) < 1e-6); 41 | } 42 | -------------------------------------------------------------------------------- /tests/math/sincos_simd_template.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | constexpr size_t N = 128; 9 | 10 | inline void sincos(float phi, float* __restrict__ s, float* __restrict__ c) 11 | { 12 
| sincosf(phi, s, c); 13 | } 14 | 15 | template 16 | void test_sincos() 17 | { 18 | T phase[N]; 19 | std::complex scval[N]; 20 | for(int i = 0; i < N; i++) 21 | phase[i] = 0.1 * i; 22 | 23 | T sum_r(0), sum_i(0); 24 | for(int i = 0; i < N; i++) 25 | { 26 | T s, c; 27 | sincos(phase[i], &s, &c); 28 | scval[i] = {s, c}; 29 | sum_r += s; 30 | sum_i += c; 31 | } 32 | 33 | std::cout << std::setprecision(14) << "--------------------------" << std::endl; 34 | std::cout << "sinval[0] " << scval[0].real() << " ref " << 0 << std::endl; 35 | std::cout << "sinval[1] " << scval[1].real() << " ref " << 0.099833416646828 << std::endl; 36 | std::cout << "sinval[2] " << scval[2].real() << " ref " << 0.19866933079506 << std::endl; 37 | std::cout << "sinval[3] " << scval[3].real() << " ref " << 0.29552020666134 << std::endl; 38 | std::cout << "cosval[0] " << scval[0].imag() << " ref " << 1 << std::endl; 39 | std::cout << "cosval[1] " << scval[1].imag() << " ref " << 0.99500416527803 << std::endl; 40 | std::cout << "cosval[2] " << scval[2].imag() << " ref " << 0.98006657784124 << std::endl; 41 | std::cout << "cosval[3] " << scval[3].imag() << " ref " << 0.95533648912561 << std::endl; 42 | assert( std::fabs(scval[0].real()) < 1e-6); 43 | assert( std::fabs(scval[1].real() - 0.099833416646828) < 1e-6); 44 | assert( std::fabs(scval[2].real() - 0.19866933079506) < 1e-6); 45 | assert( std::fabs(scval[3].real() - 0.29552020666134) < 1e-6); 46 | assert( std::fabs(scval[0].imag() - 1) < 1e-6); 47 | assert( std::fabs(scval[1].imag() - 0.99500416527803) < 1e-6); 48 | assert( std::fabs(scval[2].imag() - 0.98006657784124) < 1e-6); 49 | assert( std::fabs(scval[3].imag() - 0.95533648912561) < 1e-6); 50 | 51 | std::cout << "sum_r " << sum_r << " sum_i " << sum_i << std::endl; 52 | assert( std::fabs(sum_r - 0.1556929974475) < 1e-4); 53 | assert( std::fabs(sum_i - 2.3267523980062) < 1e-4); 54 | } 55 | 56 | int main() 57 | { 58 | test_sincos(); 59 | test_sincos(); 60 | } 61 | 
-------------------------------------------------------------------------------- /tests/math/sqrt_simd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | constexpr size_t N = 128; 8 | 9 | template 10 | void test_sqrt_simd() 11 | { 12 | T phase[N], sqrtval[N]; 13 | phase[0] = 0.0; 14 | phase[1] = 0.1; 15 | phase[2] = 0.2; 16 | phase[3] = 0.3; 17 | 18 | #pragma omp simd 19 | for(int i = 0; i < N; i++) 20 | { 21 | sqrtval[i] = std::sqrt(phase[i]); 22 | //std::cout << std::setprecision(14) << sqrtval[i] << std::endl; 23 | } 24 | 25 | std::cout << std::setprecision(14); 26 | std::cout << "sqrtval[0] " << sqrtval[0] << " ref " << 0 << std::endl; 27 | std::cout << "sqrtval[1] " << sqrtval[1] << " ref " << 0.31622776601684 << std::endl; 28 | std::cout << "sqrtval[2] " << sqrtval[2] << " ref " << 0.44721359549996 << std::endl; 29 | std::cout << "sqrtval[3] " << sqrtval[3] << " ref " << 0.54772255750517 << std::endl; 30 | assert( std::fabs(sqrtval[0]) < 1e-6); 31 | assert( std::fabs(sqrtval[1] - 0.31622776601684) < 1e-6); 32 | assert( std::fabs(sqrtval[2] - 0.44721359549996) < 1e-6); 33 | assert( std::fabs(sqrtval[3] - 0.54772255750517) < 1e-6); 34 | } 35 | 36 | int main() 37 | { 38 | test_sqrt_simd(); 39 | test_sqrt_simd(); 40 | } 41 | -------------------------------------------------------------------------------- /tests/omphost/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME host_bug_libomp) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | 
-------------------------------------------------------------------------------- /tests/omphost/README.md: -------------------------------------------------------------------------------- 1 | Bug report 2 | https://bugs.llvm.org/show_bug.cgi?id=42393 3 | 4 | ``` 5 | $ icpx -fiopenmp -fopenmp-targets=spir64 debug.cpp 6 | 7 | $ OMP_NUM_THREADS=2 OMP_TARGET_OFFLOAD=MANDATORY ./a.out 8 | tid = 1 9 | 0 1 2 3 10 | tid = 0 11 | 0 1 2 3 12 | 13 | $ OMP_NUM_THREADS=2 OMP_TARGET_OFFLOAD=DISABLED ./a.out 14 | tid = 0 15 | 0 1 0 0 16 | tid = 1 17 | 0 0 2 3 18 | 19 | $ icpx -fiopenmp debug.cpp 20 | tid = 0 21 | 0 1 0 0 22 | tid = 1 23 | 0 0 2 3 24 | ``` 25 | -------------------------------------------------------------------------------- /tests/omphost/host_bug_libomp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_thread_num() { return 0; } 6 | int omp_get_num_threads() { return 1; } 7 | #endif 8 | 9 | int main() 10 | { 11 | const int size = 4; 12 | int wrong_counts = 0; 13 | #pragma omp parallel reduction(+:wrong_counts) 14 | { 15 | int A[size]; 16 | for(int i = 0; i < size; i++) 17 | A[i] = 0; 18 | 19 | #pragma omp target teams distribute map(tofrom: A[:size]) 20 | for(int i = 0; i < size; i++) 21 | { 22 | A[i] = i; 23 | } 24 | 25 | #pragma omp critical 26 | { 27 | std::cout << "tid = " << omp_get_thread_num() << std::endl; 28 | for(int i = 0; i < size; i++) 29 | { 30 | if (A[i] != i) wrong_counts++; 31 | std::cout << " " << A[i]; 32 | } 33 | std::cout << std::endl; 34 | } 35 | } 36 | 37 | if (wrong_counts) 38 | std::cout << "Wrong!" << std::endl; 39 | else 40 | std::cout << "Right!" 
<< std::endl; 41 | return wrong_counts; 42 | } 43 | -------------------------------------------------------------------------------- /tests/private/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME teams_private__distribute _teams_private__distribute teams_distribute_private teams__distribute_private _teams_distribute_private _teams__distribute_private teams_distribute_parallel_for_private) 3 | set(FULLNAME target_${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | 15 | if (ENABLE_Fortran) 16 | foreach(NAME teams_distribute_parallel_for_private teams_distribute_private local_block) 17 | set(FULLNAME target_${NAME}) 18 | set(EXE_NAME f.${FULLNAME}) 19 | add_executable(${EXE_NAME} ${FULLNAME}.f90) 20 | target_link_libraries(${EXE_NAME} dummy_openmp_runtime) 21 | set_target_properties(${EXE_NAME} PROPERTIES 22 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 23 | add_test(NAME ${EXE_NAME} 24 | COMMAND $ 25 | WORKING_DIRECTORY ${TESTS_BINDIR}) 26 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS fortran) 27 | endforeach() 28 | endif() 29 | -------------------------------------------------------------------------------- /tests/private/run_all.sh: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CC_FLAGS="-O3 -fopenmp -foffload=nvptx-none" 3 | 4 | #CC=clang 5 | #CC_FLAGS="-O3 -fopenmp -fopenmp-targets=nvptx64" 6 | 7 | for name in *.c 8 | do 9 | echo Testing $name 10 | $CC $CC_FLAGS $name 11 | ./a.out 12 | echo 13 | done 14 | -------------------------------------------------------------------------------- 
/tests/private/target__teams__distribute_private.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 10 | const int Nteams = 2; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | float* a_p; 15 | #pragma omp target map(from:pointer[:Nteams], a_p, team_ID[:Nteams]) 16 | #pragma omp teams num_teams(Nteams) 17 | { 18 | a_p = &a; 19 | #pragma omp distribute private(a) 20 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 10 | const int Nteams = 2; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | #pragma omp target map(from:pointer[:Nteams], team_ID[:Nteams]) 15 | { 16 | #pragma omp teams distribute num_teams(Nteams) private(a) 17 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 10 | const int Nteams = 2; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | #pragma omp target map(from:pointer[:Nteams], team_ID[:Nteams]) 15 | #pragma omp teams num_teams(Nteams) private(a) 16 | { 17 | #pragma omp distribute 18 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | int omp_get_thread_num() { return 1; } 7 | #endif 8 | 9 | int main() 10 | { 11 | const int Nteams = 3; 12 | const int Nthreads = 3; 13 | const int Ntot = Nteams*Nthreads; 14 | void* pointer[Ntot]; 15 | int team_ID[Ntot]; 16 | int thread_ID[Ntot]; 17 | float a; 18 | #pragma omp target teams distribute parallel for num_teams(Nteams) thread_limit(Nthreads) private(a) map(from:pointer[:Ntot], team_ID[:Ntot], thread_ID[:Ntot]) 19 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 
10 | const int Nteams = 3; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | #pragma omp target teams distribute num_teams(Nteams) private(a) map(from:pointer[:Nteams], team_ID[:Nteams]) 15 | for(int i = 0; i 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | int omp_get_team_num() { return 1; } 6 | #endif 7 | 8 | int main() 9 | { 10 | const int Nteams = 2; 11 | void* pointer[Nteams]; 12 | int team_ID[Nteams]; 13 | float a; 14 | #pragma omp target teams num_teams(Nteams) private(a) map(from:pointer[:Nteams], team_ID[:Nteams]) 15 | { 16 | #pragma omp distribute 17 | for(int i = 0; i 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/reduction/README.md: -------------------------------------------------------------------------------- 1 | # GCC requres UDR 2 | ``` 3 | g++ -O3 -fopenmp -foffload=disable array_reduction.cpp && ./a.out 4 | g++ -O3 -fopenmp -foffload=nvptx-none -foffload-options="-lm -latomic" array_reduction.cpp && ./a.out 5 | ``` 6 | 7 | # Clang allows both with and without UDR 8 | ``` 9 | clang++ -O3 -fopenmp array_reduction.cpp && ./a.out 10 | clang++ -O3 -fopenmp -fopenmp-targets=nvptx64 array_reduction.cpp && ./a.out 11 | clang++ -O3 -fopenmp -D__NO_UDR array_reduction.cpp && ./a.out 12 | clang++ -O3 -fopenmp -fopenmp-targets=nvptx64 -D__NO_UDR array_reduction.cpp && ./a.out 13 | ``` 14 | 15 | # NVHPC disallow UDR 16 | ``` 17 | nvc++ -O3 -mp -D__NO_UDR array_reduction.cpp # doesn't work 18 | nvc++ -O3 -mp=gpu -D__NO_UDR array_reduction.cpp # doesn't work 19 | ``` 20 | -------------------------------------------------------------------------------- /tests/reduction/array_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | struct GradType 5 | { 6 | float X[3]{0, 0, 0}; 7 | float 
operator[](size_t i) const { return X[i]; } 8 | float& operator[](size_t i) { return X[i]; } 9 | }; 10 | 11 | GradType& operator+=(GradType& a, const GradType& b) 12 | { 13 | for (int i = 0; i < 3; i++) 14 | a[i]+=b[i]; 15 | return a; 16 | } 17 | 18 | #if !defined(__NO_UDR) 19 | #pragma omp declare reduction(+ : GradType : omp_out += omp_in) 20 | #endif 21 | 22 | void test_size(size_t N) 23 | { 24 | std::cout << std::endl << "Testing size " << N << std::endl; 25 | GradType grads{0, 0, 0}; 26 | #pragma omp parallel for reduction(+: grads) 27 | for (int i = 0; i 1e-6 || std::abs(grads_saved[1] * 2 - grads[1]) > 1e-6 || std::abs(grads_saved[2] * 2 - grads[2]) > 1e-6) 41 | { 42 | std::cout << "Failed!" << std::endl; 43 | exit(1); 44 | } 45 | else 46 | std::cout << "Passed!" << std::endl; 47 | } 48 | 49 | int main() 50 | { 51 | std::cout << "Start testing!" << std::endl; 52 | test_size(9); 53 | test_size(3); 54 | test_size(5); 55 | test_size(7); 56 | test_size(13); 57 | test_size(15); 58 | test_size(17); 59 | test_size(25); 60 | test_size(31); 61 | test_size(35); 62 | test_size(65); 63 | test_size(8); 64 | test_size(16); 65 | test_size(32); 66 | test_size(64); 67 | test_size(128); 68 | test_size(256); 69 | std::cout << "End testing!" 
<< std::endl; 70 | } 71 | -------------------------------------------------------------------------------- /tests/sollve_vv/sollve_vv_aomp.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/SOLLVE/sollve_vv.git 2 | # this is so we can avoid hanging 3 | sed -i s/timeout/"timeout -s 9"/ sollve_vv/sys/scripts/run_test.sh 4 | cd sollve_vv 5 | OFFLOAD_FLAG="-fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906" 6 | make OMP_VERSION=4.5 CC="clang -std=c99 $OFFLOAD_FLAG" \ 7 | CXX="clang++ -std=c++11 $OFFLOAD_FLAG" \ 8 | FC="flang $OFFLOAD_FLAG" \ 9 | VERBOSE_TESTS=1 VERBOSE=1 LOG=1 LOG_ALL=1 all 10 | make report_summary &> 45.out 11 | make tidy 12 | make OMP_VERSION=5.0 CC="clang -std=c99 -fopenmp-version=50 $OFFLOAD_FLAG" \ 13 | CXX="clang++ -std=c++11 -fopenmp-version=50 $OFFLOAD_FLAG" \ 14 | FC="flang $OFFLOAD_FLAG" \ 15 | VERBOSE_TESTS=1 VERBOSE=1 LOG=1 LOG_ALL=1 all 16 | make report_summary &> 50.out 17 | cd .. 
18 | -------------------------------------------------------------------------------- /tests/target_task/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (ENABLE_CXX) 2 | foreach(NAME target_nowait_task target_taskwait taskloop_offload_nowait taskloop omp-task-bug target_nowait_taskwait target_update_nowait_taskwait) 3 | set(FULLNAME ${NAME}) 4 | set(EXE_NAME cxx.${FULLNAME}) 5 | add_executable(${EXE_NAME} ${FULLNAME}.cpp) 6 | set_target_properties(${EXE_NAME} PROPERTIES 7 | RUNTIME_OUTPUT_DIRECTORY ${TESTS_BINDIR}) 8 | add_test(NAME ${EXE_NAME} 9 | COMMAND $ 10 | WORKING_DIRECTORY ${TESTS_BINDIR}) 11 | set_tests_properties(${EXE_NAME} PROPERTIES LABELS cxx) 12 | endforeach() 13 | endif() 14 | -------------------------------------------------------------------------------- /tests/target_task/omp-task-bug.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef _OPENMP 6 | #include 7 | #else 8 | int omp_get_thread_num() { return 0; } 9 | int omp_get_num_threads() { return 1; } 10 | int omp_get_max_threads() { return 1; } 11 | #endif 12 | 13 | template 14 | struct MyProblem 15 | { 16 | int M = 16; 17 | int N = 16; 18 | int K = 32; 19 | int Size = 0; 20 | int IP = 0; 21 | T* V = nullptr; 22 | T* W = nullptr; 23 | 24 | explicit MyProblem(int np) : Size(M * N * K / np) 25 | { 26 | M = M / np; 27 | size_t bytes = Size * sizeof(T); 28 | auto* v_ptr = (T*)aligned_alloc(64, bytes); 29 | auto* w_ptr = (T*)aligned_alloc(64, bytes); 30 | 31 | #pragma omp target enter data map(alloc : v_ptr [0:Size], w_ptr [0:Size]) 32 | 33 | V = v_ptr; 34 | W = w_ptr; 35 | 36 | #pragma omp target enter data map(to : this [0:1]) 37 | } 38 | 39 | ~MyProblem() 40 | { 41 | auto* v_ptr = V; 42 | auto* w_ptr = W; 43 | 44 | #pragma omp target exit data map(delete : this [0:1]) 45 | 46 | #pragma omp target exit data map(delete : v_ptr[:Size], w_ptr[:Size]) 
47 | 48 | free(W); 49 | free(V); 50 | } 51 | 52 | void setV(int ip) 53 | { 54 | IP = ip; 55 | std::iota(V, V + Size, T(ip * Size)); 56 | } 57 | 58 | void update() 59 | { 60 | // v_ptr and w_ptr are shared as a task is created 61 | auto* v_ptr = V; 62 | auto* w_ptr = W; 63 | #pragma omp target teams distribute collapse(2) map(always, to : v_ptr[:Size]) nowait depend(out : w_ptr[:Size]) 64 | for (int i = 0; i < M; ++i) 65 | for (int j = 0; j < N; ++j) 66 | { 67 | #pragma omp parallel for 68 | for (int k = 0; k < K; ++k) 69 | { 70 | int ijk = i * N * K + j * K + k; 71 | w_ptr[ijk] = 0.1f + v_ptr[ijk]; 72 | } 73 | } 74 | 75 | #pragma omp target update nowait depend(inout : w_ptr[:Size]) from(w_ptr[:Size]) 76 | 77 | #if defined(INPLACE_TASKWAIT) 78 | #pragma omp taskwait 79 | #endif 80 | } 81 | 82 | void write() const 83 | { 84 | std::cout << "result: " << IP << std::endl; 85 | std::cout << "V[" << 0 << "] = " << V[0] << " " << W[0] << std::endl; 86 | std::cout << "V[" << Size / 2 << "] = " << V[Size / 2] << " " << W[Size / 2] << std::endl; 87 | std::cout << "V[" << Size - 1 << "] = " << V[Size - 1] << " " << W[Size - 1] << std::endl; 88 | } 89 | }; 90 | 91 | int main(int argc, char** argv) 92 | { 93 | const int np = omp_get_max_threads(); 94 | 95 | std::vector>> problems(np * 4); 96 | 97 | #pragma omp parallel 98 | { 99 | int ip = omp_get_thread_num(); 100 | 101 | for (int iw = 0; iw < 4; iw++) 102 | { 103 | int I = ip * 4 + iw; 104 | problems[I] = std::make_unique>(np * 4); 105 | problems[I]->setV(I); 106 | } 107 | 108 | for (int iw = 0; iw < 4; iw++) 109 | { 110 | int I = ip * 4 + iw; 111 | problems[I]->update(); 112 | } 113 | } 114 | 115 | /* 116 | for(int ip=0; ipwrite(); 119 | } 120 | */ 121 | } 122 | -------------------------------------------------------------------------------- /tests/target_task/target_nowait_task.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | int a = 
0; 7 | std::cout << "outside a = " << a << " addr " << &a << std::endl; 8 | #pragma omp target map(tofrom: a) depend(out: a) nowait 9 | { 10 | int sum = 0; 11 | for (int i = 0; i < 100000; i++) 12 | sum++; 13 | a = 1; 14 | } 15 | 16 | #pragma omp task depend(in: a) shared(a) 17 | { 18 | std::cout << "a = " << a << " addr " << &a << std::endl; 19 | if (a != 1) 20 | throw std::runtime_error("wrong result!"); 21 | } 22 | 23 | #pragma omp taskwait 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /tests/target_task/target_nowait_taskwait.cpp: -------------------------------------------------------------------------------- 1 | ////////////////////////////////////////////////////////////////////////////////////// 2 | // This file is distributed under the University of Illinois/NCSA Open Source License. 3 | // See LICENSE file in top directory for details. 4 | // 5 | // Copyright (c) 2019 QMCPACK developers. 6 | // 7 | // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory 8 | // 9 | // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory 10 | ////////////////////////////////////////////////////////////////////////////////////// 11 | 12 | 13 | #include 14 | #include 15 | 16 | const int num_sections = 1; 17 | const int section_size = 100; 18 | constexpr int array_size = num_sections * section_size; 19 | 20 | int main(int argc, char** argv) 21 | { 22 | //std::vector> array(array_size, 1); 23 | std::vector array(array_size, 1); 24 | int* array_ptr = array.data(); 25 | #pragma omp target enter data map(alloc:array_ptr[:array_size]) 26 | 27 | #pragma omp target update to(array_ptr[:array_size]) 28 | #pragma omp target teams distribute parallel for map(tofrom: array_ptr[:array_size]) 29 | for (int i = 0; i < array_size; i++) 30 | { 31 | array_ptr[i] += i; 32 | } 33 | 34 | for (int offset = 0; offset < array_size; offset += section_size) 35 | { 36 | #pragma omp target update 
from(array_ptr[offset:section_size]) nowait 37 | } 38 | #pragma omp taskwait 39 | #pragma omp target exit data map(delete:array_ptr[:array_size]) 40 | 41 | if(array_ptr[4] != 5) 42 | throw std::runtime_error("array_ptr[4] check failed after update!"); 43 | if(array_ptr[94] != 95) 44 | throw std::runtime_error("array_ptr[94] check failed after update!"); 45 | } 46 | -------------------------------------------------------------------------------- /tests/target_task/target_taskwait.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | int a = 0; 7 | std::cout << "outside a = " << a << " addr " << &a << std::endl; 8 | #pragma omp target nowait 9 | { 10 | int sum = 0; 11 | for (int i = 0; i < 100000; i++) 12 | sum++; 13 | a = 1; 14 | } 15 | 16 | #pragma omp taskwait 17 | return 0; 18 | } 19 | -------------------------------------------------------------------------------- /tests/target_task/target_update_nowait_taskwait.cpp: -------------------------------------------------------------------------------- 1 | ////////////////////////////////////////////////////////////////////////////////////// 2 | // This file is distributed under the University of Illinois/NCSA Open Source License. 3 | // See LICENSE file in top directory for details. 4 | // 5 | // Copyright (c) 2023 QMCPACK developers. 
6 | // 7 | // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory 8 | // 9 | // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory 10 | ////////////////////////////////////////////////////////////////////////////////////// 11 | 12 | 13 | #include 14 | #include 15 | 16 | const int num_sections = 1; 17 | const int section_size = 100; 18 | constexpr int array_size = num_sections * section_size; 19 | 20 | int main(int argc, char** argv) 21 | { 22 | //std::vector> array(array_size, 1); 23 | std::vector array(array_size, 1); 24 | int* array_ptr = array.data(); 25 | #pragma omp target enter data map(alloc:array_ptr[:array_size]) 26 | for (int offset = 0; offset < array_size; offset += section_size) 27 | { 28 | #pragma omp target update from(array_ptr[offset:section_size]) nowait 29 | } 30 | #pragma omp taskwait 31 | #pragma omp target exit data map(delete:array_ptr[:array_size]) 32 | } 33 | -------------------------------------------------------------------------------- /tests/target_task/taskloop.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | template 7 | class RefVectorWithLeader : public std::vector> 8 | { 9 | public: 10 | RefVectorWithLeader(T& leader) : leader_(leader) {} 11 | 12 | RefVectorWithLeader(T& leader, const std::vector>& vec) : leader_(leader) 13 | { 14 | for (T& element : vec) 15 | this->push_back(element); 16 | } 17 | 18 | T& getLeader() const { return leader_; } 19 | 20 | T& operator[](size_t i) const { return std::vector>::operator[](i).get(); } 21 | 22 | template 23 | CASTTYPE& getCastedLeader() const 24 | { 25 | static_assert(std::is_const::value == std::is_const::value, "Unmatched const type qualifier!"); 26 | #ifndef NDEBUG 27 | assert(dynamic_cast(&leader_.get()) != nullptr); 28 | #endif 29 | return static_cast(leader_.get()); 30 | } 31 | 32 | template 33 | CASTTYPE& getCastedElement(size_t i) const 34 | { 35 | 
static_assert(std::is_const::value == std::is_const::value, "Unmatched const type qualifier!"); 36 | #ifndef NDEBUG 37 | assert(dynamic_cast(&(*this)[i]) != nullptr); 38 | #endif 39 | return static_cast((*this)[i]); 40 | } 41 | 42 | private: 43 | std::reference_wrapper leader_; 44 | }; 45 | 46 | class TWF 47 | { 48 | public: 49 | static void mw_accept_rejectMove(const RefVectorWithLeader& wf_list) 50 | { 51 | auto& wf_leader = wf_list.getLeader(); 52 | const int vec_size = wf_list.size(); 53 | std::cout << "vec size outside " << vec_size << " addr " << &wf_list << std::endl; 54 | #pragma omp taskloop default(shared) if(wf_leader.use_tasking) 55 | for(int i=0; i<2; i++) 56 | { 57 | std::cout << "vec size inside " << wf_list.size() << " addr " << &wf_list << std::endl; 58 | if (vec_size != wf_list.size()) 59 | throw std::runtime_error("mismatched size!"); 60 | } 61 | } 62 | 63 | private: 64 | bool use_tasking = false; 65 | }; 66 | 67 | int main() 68 | { 69 | std::vector twf(2); 70 | std::vector> twf_ref; 71 | twf_ref.push_back(twf[0]); 72 | twf_ref.push_back(twf[1]); 73 | RefVectorWithLeader twf_crowd(twf[0], twf_ref); 74 | TWF::mw_accept_rejectMove(twf_crowd); 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /tests/target_task/taskloop_offload_nowait.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | bool almost_equal(float x, float gold, float tol) { 5 | if ( std::signbit(x) != std::signbit(gold) ) 6 | { 7 | x = std::abs(gold) - std::abs(x); 8 | } 9 | return std::abs(gold) * (1-tol) <= std::abs(x) && std::abs(x) <= std::abs(gold) * (1 + tol); 10 | } 11 | 12 | int main() 13 | { 14 | const int N0 { 2 }; 15 | const int N1 { 182 }; 16 | const float expected_value { N0*N1 }; 17 | float counter_N0{}; 18 | #pragma omp target data map(tofrom: counter_N0) 19 | { 20 | #pragma omp taskloop shared(counter_N0) 21 | for (int i0 = 0 ; i0 < N0 ; 
i0++ ) 22 | { 23 | #pragma omp target teams distribute parallel for map(tofrom: counter_N0) nowait 24 | for (int i1 = 0 ; i1 < N1 ; i1++ ) 25 | { 26 | #pragma omp atomic update 27 | counter_N0 = counter_N0 + 1. ; 28 | } 29 | } 30 | } 31 | 32 | if (!almost_equal(counter_N0, expected_value, 0.1)) { 33 | std::cerr << "Expected: " << expected_value << " Got: " << counter_N0 << std::endl; 34 | std::exit(112); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/tasks/implicit_shared.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void foo(int& x) 4 | { 5 | #pragma omp task if(0) 6 | { 7 | x++; 8 | std::cout << "inside " << x << std::endl; 9 | } 10 | } 11 | 12 | int main() 13 | { 14 | std::cout << "Test task in-place" << std::endl; 15 | int x = 0; 16 | #pragma omp parallel 17 | { 18 | #pragma omp single 19 | { 20 | #pragma omp task if(0) 21 | { 22 | x++; 23 | std::cout << "inside " << x << std::endl; 24 | } 25 | } 26 | } 27 | std::cout << "outside " << x << std::endl; 28 | 29 | std::cout << "Test task in functon" << std::endl; 30 | x = 0; 31 | #pragma omp parallel 32 | { 33 | #pragma omp single 34 | { 35 | foo(x); 36 | } 37 | } 38 | std::cout << "outside " << x << std::endl; 39 | } 40 | --------------------------------------------------------------------------------