├── .clang-format ├── .github └── workflows │ ├── clang-format.yml │ ├── ubuntu-20.04.yml │ ├── ubuntu-22.04.yml │ ├── windows-2019.yml │ └── windows-2022.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── benchmarks ├── CMakeLists.txt └── spmv │ ├── CMakeLists.txt │ ├── parameters.hxx │ └── work_oriented.cu ├── cmake ├── FetchCXXOpts.cmake ├── FetchColors.cmake ├── FetchGoogleTest.cmake ├── FetchModernGPU.cmake ├── FetchNVBench.cmake └── FetchThrustCUB.cmake ├── datasets ├── chesapeake │ └── chesapeake.mtx └── suitesparse.txt ├── docker ├── README.md └── dockerfile ├── docs ├── .gitkeep ├── _config.yml ├── abstraction.md ├── background.md └── loadbalancing_api.md ├── examples ├── CMakeLists.txt ├── range │ ├── CMakeLists.txt │ └── range.cu ├── saxpy │ ├── CMakeLists.txt │ └── saxpy.cu ├── spmm │ ├── CMakeLists.txt │ ├── helpers.hxx │ └── thread_mapped.cu └── spmv │ ├── CMakeLists.txt │ ├── group_mapped.cu │ ├── helpers.hxx │ ├── merge_path.cu │ ├── original.cu │ ├── thread_mapped.cu │ └── work_oriented.cu ├── include └── loops │ ├── algorithms │ ├── spmm │ │ └── thread_mapped.cuh │ └── spmv │ │ ├── group_mapped.cuh │ │ ├── merge_path_flat.cuh │ │ ├── original.cuh │ │ ├── thread_mapped.cuh │ │ └── work_oriented.cuh │ ├── container │ ├── container.hxx │ ├── coo.hxx │ ├── coordinate.hxx │ ├── csc.hxx │ ├── csr.hxx │ ├── detail │ │ ├── convert.hxx │ │ ├── mmio.cpp │ │ └── mmio.hxx │ ├── formats.hxx │ ├── market.hxx │ ├── matrix.cuh │ └── vector.hxx │ ├── error.hxx │ ├── memory.hxx │ ├── range.hxx │ ├── schedule.hxx │ ├── schedule │ ├── group_mapped.hxx │ ├── merge_path_flat.hxx │ ├── thread_mapped.hxx │ └── work_oriented.hxx │ ├── stride_ranges.hxx │ └── util │ ├── device.hxx │ ├── equal.hxx │ ├── filepath.hxx │ ├── generate.hxx │ ├── launch.hxx │ ├── math.hxx │ ├── sample.hxx │ ├── search.hxx │ └── timer.hxx ├── plots ├── data │ ├── cusparse.csv │ ├── group_mapped.csv │ ├── heuristics.csv │ ├── merge_path.csv │ ├── thread_mapped.csv │ └── versus_cub.csv ├── performance_evaluation.ipynb └── requirements.txt ├── scripts ├── README.md ├── format.sh └── run.sh └── unittests └── CMakeLists.txt /.clang-format: -------------------------------------------------------------------------------- 1 | # Defines the Chromium style for automatic reformatting. 2 | # http://clang.llvm.org/docs/ClangFormatStyleOptions.html 3 | BasedOnStyle: Chromium 4 | # This defaults to 'Auto'. Explicitly set it for a while, so that 5 | # 'vector >' in existing files gets formatted to 6 | # 'vector>'. ('Auto' means that clang-format will only use 7 | # 'int>>' if the file already contains at least one such instance.) 
8 | Standard: Cpp11 9 | SortIncludes: false -------------------------------------------------------------------------------- /.github/workflows/clang-format.yml: -------------------------------------------------------------------------------- 1 | name: clang-format 2 | 3 | # Controls when the workflow will run 4 | on: 5 | # Triggers the workflow on push or pull request events but only for the main branch 6 | push: 7 | branches: [ main, dev ] 8 | 9 | jobs: 10 | formatting-check: 11 | name: Format 12 | runs-on: ubuntu-22.04 13 | steps: 14 | - uses: actions/checkout@v3 15 | with: 16 | ref: ${{ github.event.pull_request.head.ref }} 17 | 18 | - name: Install clang-format 19 | run: | 20 | pip install clang-format \ 21 | && clang-format --version 22 | shell: bash 23 | 24 | - name: Format files 25 | run: ${{github.workspace}}/scripts/format.sh ${{github.workspace}} y 26 | shell: bash 27 | 28 | - name: Commit changes 29 | uses: EndBug/add-and-commit@v9 30 | with: 31 | committer_name: GitHub Actions 32 | committer_email: 41898282+github-actions[bot]@users.noreply.github.com 33 | message: ':octocat: Applied clang-format. [skip build]' 34 | add: '["*.cpp", "*.cu", "*.cuh", "*.hxx"]' -------------------------------------------------------------------------------- /.github/workflows/ubuntu-20.04.yml: -------------------------------------------------------------------------------- 1 | name: ubuntu-20.04 2 | 3 | # Controls when the workflow will run 4 | on: 5 | # Triggers the workflow on push or pull request events but only for the main branch 6 | push: 7 | branches: [ main, dev ] 8 | pull_request: 9 | branches: [ main, dev ] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | env: 15 | BUILD_TYPE: Release 16 | ARCHITECTURES: 75 17 | 18 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 19 | jobs: 20 | # This workflow contains a single job called "build" 21 | build: 22 | # https://github.blog/changelog/2021-02-08-github-actions-skip-pull-request-and-push-workflows-with-skip-ci/ 23 | if: "!contains(github.event.commits[0].message, '[skip build]')" 24 | runs-on: ubuntu-20.04 25 | 26 | # Steps represent a sequence of tasks that will be executed as part of the job 27 | steps: 28 | # Fetch CUDA toolkit using Jimver/cuda-toolkit@v0.2.7 29 | - name: Fetch CUDA toolkit 30 | uses: Jimver/cuda-toolkit@v0.2.7 31 | id: cuda-toolkit 32 | with: 33 | cuda: '11.7.0' 34 | linux-local-args: '["--toolkit"]' 35 | 36 | # Runs a single command using the runners shell 37 | - name: Check nvcc version 38 | run: nvcc -V 39 | 40 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 41 | - uses: actions/checkout@v3 42 | 43 | # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. 
44 | # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type 45 | - name: Configure cmake 46 | run: cmake -B ${{github.workspace}}/build -D LOOPS_BUILD_TESTS=ON -D CMAKE_CUDA_ARCHITECTURES=${{env.ARCHITECTURES}} -D CMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} 47 | 48 | # Build your program with the given configuration 49 | - name: Build all applications 50 | run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} 51 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu-22.04.yml: -------------------------------------------------------------------------------- 1 | name: ubuntu-22.04 2 | 3 | # Controls when the workflow will run 4 | on: 5 | # Triggers the workflow on push or pull request events but only for the main branch 6 | push: 7 | branches: [ main, dev ] 8 | pull_request: 9 | branches: [ main, dev ] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | env: 15 | BUILD_TYPE: Release 16 | ARCHITECTURES: 75 17 | 18 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 19 | jobs: 20 | # This workflow contains a single job called "build" 21 | build: 22 | # https://github.blog/changelog/2021-02-08-github-actions-skip-pull-request-and-push-workflows-with-skip-ci/ 23 | if: "!contains(github.event.commits[0].message, '[skip build]')" 24 | runs-on: ubuntu-22.04 25 | 26 | # Steps represent a sequence of tasks that will be executed as part of the job 27 | steps: 28 | # Fetch CUDA toolkit using Jimver/cuda-toolkit@v0.2.7 29 | - name: Fetch CUDA toolkit 30 | uses: Jimver/cuda-toolkit@v0.2.7 31 | id: cuda-toolkit 32 | with: 33 | cuda: '11.7.0' 34 | linux-local-args: '["--toolkit"]' 35 | 36 | # Runs a single command using the runners shell 37 | - name: Check nvcc version 38 | run: nvcc -V 39 | 40 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 41 | - uses: actions/checkout@v3 42 | 43 | # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. 
44 | # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type 45 | - name: Configure cmake 46 | run: cmake -B ${{github.workspace}}/build -D LOOPS_BUILD_TESTS=ON -D CMAKE_CUDA_ARCHITECTURES=${{env.ARCHITECTURES}} -D CMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} 47 | 48 | # Build your program with the given configuration 49 | - name: Build all applications 50 | run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} 51 | -------------------------------------------------------------------------------- /.github/workflows/windows-2019.yml: -------------------------------------------------------------------------------- 1 | name: windows-2019 2 | 3 | # Controls when the workflow will run 4 | on: 5 | # Triggers the workflow on push or pull request events but only for the main branch 6 | push: 7 | branches: [ main, dev ] 8 | pull_request: 9 | branches: [ main, dev ] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | env: 15 | BUILD_TYPE: Release 16 | ARCHITECTURES: 75 17 | 18 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 19 | jobs: 20 | # This workflow contains a single job called "build" 21 | build: 22 | # https://github.blog/changelog/2021-02-08-github-actions-skip-pull-request-and-push-workflows-with-skip-ci/ 23 | if: "!contains(github.event.commits[0].message, '[skip build]')" 24 | runs-on: windows-2019 25 | 26 | # Steps represent a sequence of tasks that will be executed as part of the job 27 | steps: 28 | # Fetch CUDA toolkit using Jimver/cuda-toolkit@v0.2.7 29 | - name: Fetch CUDA toolkit 30 | uses: Jimver/cuda-toolkit@v0.2.7 31 | id: cuda-toolkit 32 | with: 33 | cuda: '11.7.0' 34 | linux-local-args: '["--toolkit"]' 35 | 36 | # Runs a single command using the runners shell 37 | - name: Check nvcc version 38 | run: nvcc -V 39 | 40 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 41 | - uses: actions/checkout@v3 42 | 43 | # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. 44 | # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type 45 | - name: Configure cmake 46 | run: cmake -B ${{github.workspace}}/build -D LOOPS_BUILD_TESTS=ON -D CMAKE_CUDA_ARCHITECTURES=${{env.ARCHITECTURES}} -D CMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} 47 | 48 | # See issue: https://github.com/NVIDIA/thrust/issues/1328 49 | # And... 
https://github.com/gunrock/essentials/issues/92 50 | - name: Remove CUB symbolic link 51 | run: rm ${{github.workspace}}/externals/thrust-src/cub 52 | 53 | # Build your program with the given configuration 54 | - name: Build all applications 55 | run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} 56 | -------------------------------------------------------------------------------- /.github/workflows/windows-2022.yml: -------------------------------------------------------------------------------- 1 | name: windows-2022 2 | 3 | # Controls when the workflow will run 4 | on: 5 | # Triggers the workflow on push or pull request events but only for the main branch 6 | push: 7 | branches: [ main, dev ] 8 | pull_request: 9 | branches: [ main, dev ] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | env: 15 | BUILD_TYPE: Release 16 | ARCHITECTURES: 75 17 | 18 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 19 | jobs: 20 | # This workflow contains a single job called "build" 21 | build: 22 | # https://github.blog/changelog/2021-02-08-github-actions-skip-pull-request-and-push-workflows-with-skip-ci/ 23 | if: "!contains(github.event.commits[0].message, '[skip build]')" 24 | runs-on: windows-2022 25 | 26 | # Steps represent a sequence of tasks that will be executed as part of the job 27 | steps: 28 | # Fetch CUDA toolkit using Jimver/cuda-toolkit@v0.2.7 29 | - name: Fetch CUDA toolkit 30 | uses: Jimver/cuda-toolkit@v0.2.7 31 | id: cuda-toolkit 32 | with: 33 | cuda: '11.7.0' 34 | linux-local-args: '["--toolkit"]' 35 | 36 | # Runs a single command using the runners shell 37 | - name: Check nvcc version 38 | run: nvcc -V 39 | 40 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 41 | - uses: actions/checkout@v3 42 | 43 | # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. 44 | # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type 45 | - name: Configure cmake 46 | run: cmake -B ${{github.workspace}}/build -D LOOPS_BUILD_TESTS=ON -D CMAKE_CUDA_ARCHITECTURES=${{env.ARCHITECTURES}} -D CMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} 47 | 48 | # See issue: https://github.com/NVIDIA/thrust/issues/1328 49 | # And... 
https://github.com/gunrock/essentials/issues/92 50 | - name: Remove CUB symbolic link 51 | run: rm ${{github.workspace}}/externals/thrust-src/cub 52 | 53 | # Build your program with the given configuration 54 | - name: Build all applications 55 | run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore vscode 2 | *.vscode* 3 | 4 | # Ignore vim undo file 5 | *.un~ 6 | 7 | # Compiled Object files 8 | *.slo 9 | *.lo 10 | *.o 11 | 12 | # Compiled Dynamic libraries 13 | *.so 14 | *.dylib 15 | 16 | # Compiled Static libraries 17 | *.lai 18 | *.la 19 | *.a 20 | 21 | # Ignore graph dataset files 22 | *.graph 23 | *.mtx* 24 | 25 | # Ignore tarball 26 | *.tar.gz 27 | 28 | # Ignore bin directory 29 | bin/ 30 | 31 | # Ignore eval directory 32 | eval/ 33 | 34 | # Ignore vim temp files 35 | *.sw~ 36 | 37 | # Ignore build directory 38 | build 39 | 40 | # Ignore external fetched content 41 | externals/* 42 | 43 | # Ignore checkpoints ipynb 44 | */*.ipynb_checkpoints/* 45 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #################################################### 2 | ############ INSTALLING CORRECT CMAKE ############## 3 | #################################################### 4 | # Installing correct cmake version is easy! 5 | # 1) Find the respective version here; 6 | # https://github.com/Kitware/CMake/releases, 7 | # and 2) replace the [x.xx.x] in the following 8 | # commands with the version number (remove the 9 | # brackets). For example, if you are installing 10 | # CMake 3.22.1, replace [x.xx.x] with 3.22.1: 11 | 12 | # wget https://github.com/Kitware/CMake/releases/download/v[x.xx.x]/cmake-[x.xx.x]-linux-x86_64.sh 13 | # chmod +x ./cmake-[x.xx.x]-linux-x86_64.sh 14 | # ./cmake-[x.xx.x]-linux-x86_64.sh 15 | # sudo mv cmake-[x.xx.x]-linux-x86_64 /opt/cmake 16 | # sudo ln -s /opt/cmake/bin/* /usr/local/bin/ 17 | cmake_minimum_required(VERSION 3.24 FATAL_ERROR) 18 | 19 | # Select "Release" as the default build type. 20 | # This can be altered by setting -DCMAKE_BUILD_TYPE 21 | # in the command-line interface to Release or Debug. 22 | # No reason to set CMAKE_CONFIGURATION_TYPES if it's 23 | # not a multiconfig generator. Also no reason mess 24 | # with CMAKE_BUILD_TYPE if it's a multiconfig generator. 25 | # https://stackoverflow.com/a/31548693/5729690 26 | get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) 27 | if(isMultiConfig) 28 | set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE) 29 | else() 30 | if(NOT CMAKE_BUILD_TYPE) 31 | message(STATUS "Defaulting to Release build type") 32 | set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE) 33 | endif() 34 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY HELPSTRING "Choose the type of build") 35 | # set the valid options for cmake-gui drop-down list 36 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug;Release") 37 | endif() 38 | 39 | #################################################### 40 | ############### SET SM ARCHITECTURE ################ 41 | #################################################### 42 | 43 | ## Note: This applies to NVBench as well. 44 | ## Can be used for applications by extracting the 45 | ## CUDA_ARCHITECTURES property from loops project. 
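## For example, a downstream application target (hypothetical name `my_app`,
## shown only as a sketch) could reuse it:
##   get_target_property(LOOPS_CUDA_ARCHS loops CUDA_ARCHITECTURES)
##   set_target_properties(my_app PROPERTIES CUDA_ARCHITECTURES "${LOOPS_CUDA_ARCHS}")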
46 | ## see: get_target_properties() 47 | ## see: https://github.com/RadeonOpenCompute/rocminfo/blob/master/rocm_agent_enumerator#L12 48 | if(NOT CMAKE_CUDA_ARCHITECTURES) 49 | set(CMAKE_CUDA_ARCHITECTURES 70) 50 | message(STATUS "Using default GPU Architecture (CMAKE_CUDA_ARCHITECTURES): ${CMAKE_CUDA_ARCHITECTURES}") 51 | else() 52 | message(STATUS "GPU Architecture (CMAKE_CUDA_ARCHITECTURES): ${CMAKE_CUDA_ARCHITECTURES}") 53 | endif() 54 | 55 | project(loops 56 | LANGUAGES CXX C CUDA 57 | ) 58 | 59 | # begin /* Dependencies directory */ 60 | set(PROJECT_DEPS_DIR externals) 61 | # end /* Dependencies directory */ 62 | 63 | # begin /* Include cmake modules */ 64 | include(${PROJECT_SOURCE_DIR}/cmake/FetchColors.cmake) 65 | include(${PROJECT_SOURCE_DIR}/cmake/FetchThrustCUB.cmake) 66 | include(${PROJECT_SOURCE_DIR}/cmake/FetchCXXOpts.cmake) 67 | # end /* Include cmake modules */ 68 | 69 | ## Set the directory where the binaries will be stored 70 | set(EXECUTABLE_OUTPUT_PATH 71 | ${PROJECT_BINARY_DIR}/bin 72 | CACHE PATH 73 | "Directory where all executables will be stored") 74 | 75 | ## Set the directory where the libraries will be stored 76 | set(LIBRARY_OUTPUT_PATH 77 | ${PROJECT_BINARY_DIR}/lib 78 | CACHE PATH 79 | "Directory where all the libraries will be stored") 80 | 81 | ## Export compile commands 82 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 83 | set(CMAKE_VERBOSE_MAKEFILE OFF) 84 | 85 | ############ ADD LIBRARY: LOOPS (HEADER-ONLY) ############ 86 | add_library(loops INTERFACE) 87 | 88 | #################################################### 89 | ############### SET TARGET PROPERTIES ############## 90 | #################################################### 91 | set_target_properties(loops 92 | PROPERTIES 93 | CXX_STANDARD 17 94 | CXX_STANDARD_REQUIRED ON 95 | CXX_EXTENSIONS OFF # Should this be turned on for MSVC? 96 | CUDA_STANDARD 17 97 | CUDA_STANDARD_REQUIRED ON 98 | CUDA_EXTENSIONS OFF 99 | CUDA_RESOLVE_DEVICE_SYMBOLS ON 100 | CUDA_SEPARABLE_COMPILATION ON 101 | CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 102 | # CUDA_PTX_COMPILATION ON # Can only be applied to OBJ. 103 | ) 104 | 105 | #################################################### 106 | ############ TARGET COMPILER DEFINITIONS ########### 107 | #################################################### 108 | target_compile_definitions(loops 109 | INTERFACE 110 | SM_TARGET=${CMAKE_CUDA_ARCHITECTURES} 111 | LOOPS_VERSION=${LOOPS_VERSION} 112 | ) 113 | 114 | message(STATUS "Loops CUDA Architecture: ${CMAKE_CUDA_ARCHITECTURES}") 115 | 116 | #################################################### 117 | ############ TARGET COMPILE FEATURES ############### 118 | #################################################### 119 | # Turn C++ Standard 17 ON. 
120 | target_compile_features(loops INTERFACE cxx_std_17) 121 | # set(CMAKE_CXX_EXTENSIONS OFF) 122 | 123 | set(LOOPS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) 124 | 125 | #################################################### 126 | ############ TARGET INCLUDE DIRECTORIES ############ 127 | #################################################### 128 | target_include_directories(loops 129 | INTERFACE ${LOOPS_INCLUDE_DIR} 130 | INTERFACE ${CXXOPTS_INCLUDE_DIR} 131 | INTERFACE ${CUB_INCLUDE_DIR} 132 | INTERFACE ${THRUST_INCLUDE_DIR} 133 | INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 134 | ) 135 | 136 | #################################################### 137 | ############ TARGET LINK LIBRARIES ################# 138 | #################################################### 139 | target_link_libraries(loops 140 | INTERFACE curand 141 | INTERFACE cuda 142 | ) 143 | 144 | #################################################### 145 | ################# TARGET SOURCES ################### 146 | #################################################### 147 | target_sources(loops 148 | INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include/loops/container/detail/mmio.cpp" 149 | ) 150 | 151 | #################################################### 152 | ############## SET CXX & CUDA FLAGS ################ 153 | #################################################### 154 | set(CXX_FLAGS 155 | $<$: 156 | /W4 157 | > 158 | $<$: 159 | -Wall 160 | # -Wextra 161 | -Wno-unused-result 162 | -Wno-unused-local-typedefs 163 | -Wno-strict-aliasing 164 | -Wno-unused-function 165 | -Wno-format-security 166 | # -Werror 167 | # -vvv 168 | > 169 | ) 170 | 171 | set(CUDA_RELEASE_FLAGS 172 | --expt-extended-lambda 173 | --expt-relaxed-constexpr 174 | --use_fast_math 175 | ) 176 | 177 | set(CUDA_DEBUG_FLAGS 178 | --expt-extended-lambda 179 | --expt-relaxed-constexpr 180 | --ptxas-options -v 181 | --debug # Host debug 182 | --device-debug # Device debug 183 | ) 184 | 185 | #################################################### 186 | ############ TARGET COMPILE OPTIONS ################ 187 | #################################################### 188 | target_compile_options(loops INTERFACE 189 | $<$,$>:${CXX_FLAGS}> 190 | $<$,$>:${CUDA_DEBUG_FLAGS}> 191 | $<$,$>:${CUDA_RELEASE_FLAGS}> 192 | ) 193 | 194 | #################################################### 195 | ############ BUILD EXAMPLE APPLICATIONS ############ 196 | #################################################### 197 | option(LOOPS_BUILD_EXAMPLES 198 | "If on, builds the example applications." 199 | ON) 200 | 201 | # Subdirectories for examples, testing and documentation 202 | if(LOOPS_BUILD_EXAMPLES) 203 | add_subdirectory(examples) 204 | endif(LOOPS_BUILD_EXAMPLES) 205 | 206 | #################################################### 207 | ################ BUILD UNIT TESTS ################# 208 | #################################################### 209 | option(LOOPS_BUILD_TESTS 210 | "If on, builds the unit tests." 211 | OFF) 212 | 213 | # Subdirectories for examples, testing and documentation 214 | if(LOOPS_BUILD_TESTS) 215 | include(${PROJECT_SOURCE_DIR}/cmake/FetchGoogleTest.cmake) 216 | # add_subdirectory(unittests) 217 | endif(LOOPS_BUILD_TESTS) 218 | 219 | #################################################### 220 | ################ BUILD BENCHMARKS ################# 221 | #################################################### 222 | option(LOOPS_BUILD_BENCHMARKS 223 | "If on, builds loops with benchmarking support." 
224 | OFF) 225 | 226 | # Subdirectories for examples, testing and documentation 227 | if(LOOPS_BUILD_BENCHMARKS) 228 | # ... see https://github.com/NVIDIA/nvbench/issues/66 229 | set(NVBench_ENABLE_NVML OFF) 230 | include(${PROJECT_SOURCE_DIR}/cmake/FetchNVBench.cmake) 231 | add_subdirectory(benchmarks) 232 | endif(LOOPS_BUILD_BENCHMARKS) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Regents of the University of California 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🐧 `loops`: Expressing Parallel Irregular Computations 2 | 3 | [![ubuntu-20.04](https://github.com/gunrock/loops/actions/workflows/ubuntu-20.04.yml/badge.svg)](https://github.com/gunrock/loops/actions/workflows/ubuntu-20.04.yml) [![ubuntu-22.04](https://github.com/gunrock/loops/actions/workflows/ubuntu-22.04.yml/badge.svg)](https://github.com/gunrock/loops/actions/workflows/ubuntu-22.04.yml) [![windows-2019](https://github.com/gunrock/loops/actions/workflows/windows-2019.yml/badge.svg)](https://github.com/gunrock/loops/actions/workflows/windows-2019.yml) [![windows-2022](https://github.com/gunrock/loops/actions/workflows/windows-2022.yml/badge.svg)](https://github.com/gunrock/loops/actions/workflows/windows-2022.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7465053.svg)](https://doi.org/10.5281/zenodo.7465053) 4 | 5 | We propose an open-source GPU load-balancing framework for applications that exhibit irregular parallelism. The set of applications and algorithms we consider is fundamental to computing tasks ranging from sparse machine learning and large numerical simulations to graph analytics. The underlying data and data structures that drive these applications present access patterns that naturally do not map well to the GPU's architecture, which is designed with dense and regular patterns in mind. 6 | 7 | Prior to the work we present here, the only way to unleash the GPU's full power on these problems has been to balance workloads through tightly coupled load-balancing techniques. Our proposed load-balancing abstraction decouples load balancing from work processing and aims to support both static and dynamic schedules with a programmable interface to implement new load-balancing schedules in the future. 8 | 9 | With our open-source framework, we hope to not only improve programmers' productivity when developing irregular-parallel algorithms on the GPU but also improve the overall performance characteristics for such applications by allowing a quick path to experimentation with a variety of existing load-balancing techniques. Consequently, we also hope that by separating the concerns of load balancing from work processing within our abstraction, managing and extending existing code to future architectures becomes easier. 10 | 11 | ## Requirements 12 | - **OS:** Ubuntu 18.04, 20.04, Windows 13 | - **Hardware:** NVIDIA GPU (Volta or newer) 14 | - **Software:** CUDA 11.7 or above and cmake 3.24 or above (as required by the project's `CMakeLists.txt`). 15 | - **CUDA Architecture:** SM 70 or above (see [GPUs supported](https://en.wikipedia.org/wiki/CUDA#GPUs_supported)); this is specified using cmake's `-DCMAKE_CUDA_ARCHITECTURES=70` option. Alternatively, set the CUDA architecture version in the `CMakeLists.txt` file directly: [CMakeLists.txt#72](https://github.com/gunrock/loops/blob/main/CMakeLists.txt#L72). 16 | 17 | ## Getting Started 18 | Before building `loops`, make sure the CUDA Toolkit and cmake are installed on your system and exported in your system's `PATH`. Other external dependencies such as `NVIDIA/thrust`, `NVIDIA/cub`, etc. are automatically fetched using cmake. 19 | 20 | ```bash 21 | git clone https://github.com/gunrock/loops.git 22 | cd loops 23 | mkdir build && cd build 24 | cmake -DCMAKE_CUDA_ARCHITECTURES=70 .. # Volta = 70, Turing = 75, ...
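# Tip (assumes a recent NVIDIA driver): query the GPU's compute capability,
# e.g. "7.5" means SM 75, with:
#   nvidia-smi --query-gpu=compute_cap --format=csv,noheader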
25 | make -j$(nproc) 26 | bin/loops.spmv.merge_path -m ../datasets/chesapeake/chesapeake.mtx 27 | ``` 28 | 29 | ### Building Specific Algorithms 30 | 31 | ```bash 32 | make loops.spmv.<algorithm> 33 | ``` 34 | Replace the `<algorithm>` with one of the following algorithm names to build a specific SpMV algorithm instead of all of them: 35 | - `original` 36 | - `thread_mapped` 37 | - `group_mapped` 38 | - `work_oriented` 39 | - `merge_path` 40 | 41 | An example of the above: `make loops.spmv.merge_path`. 42 | 43 | ## Datasets 44 | 45 | To download the SuiteSparse Matrix Collection[^1], simply run the following command. We recommend using a `tmux` session, because downloading the entire collection can take a significant amount of time. Uncompress the dataset by running the following command in the dataset's directory: `find . -name '*.tar.gz' -execdir tar -xzvf '{}' \; 46 | ` The total downloaded size of the dataset is nontrivial: uncompressed + compressed = 887GB. 47 | ```bash 48 | wget --recursive --no-parent --force-directories -l inf -X RB,mat \ 49 | --accept "*.tar.gz" "https://suitesparse-collection-website.herokuapp.com/" 50 | ``` 51 | 52 | - `--recursive` download recursively 53 | - `--no-parent` prevent wget from fetching links in the parent of the website 54 | - `-l inf` keep recursing to an unlimited depth 55 | - `-X RB,mat` ignore the `RB` and `mat` subdirectories, since we only need the MatrixMarket (`MM`) format; you can choose to download the other formats or remove this flag entirely to download all of them 56 | - `--accept` accept only files with the given extension 57 | - `--force-directories` create a hierarchy of directories, even if one would not have been created otherwise 58 | 59 | [^1]: Timothy A. Davis and Yifan Hu. 2011. The University of Florida Sparse Matrix Collection. ACM Transactions on Mathematical Software 38, 1, Article 1 (December 2011), 25 pages. DOI: https://doi.org/10.1145/2049662.2049663 60 | 61 | ## Experimentation 62 | If CUDA and cmake are already set up, follow the [Getting Started](#getting-started) instructions. Alternatively, you may prefer to set up the entire project using docker; we provide a dockerfile and instructions on how to use it in the [/docker](https://github.com/gunrock/loops/tree/main/docker) directory. 63 | 64 | ### Sanity Check 65 | Run the following command in cmake's `build` folder: 66 | ```bash 67 | bin/loops.spmv.merge_path -m ../datasets/chesapeake/chesapeake.mtx \ 68 | --validate -v 69 | ``` 70 | You should see output approximately like the following: 71 | ```bash 72 | ~/loops/build$ bin/loops.spmv.merge_path \ 73 | -m ../datasets/chesapeake/chesapeake.mtx --validate -v 74 | Elapsed (ms): 0.063328 75 | Matrix: chesapeake.mtx 76 | Dimensions: 39 x 39 (340) 77 | Errors: 0 78 | ``` 79 | ## Reproducing Results 80 | > Find pre-generated results in the [plots/](https://github.com/gunrock/loops/blob/main/plots/) directory along with the `performance_evaluation.ipynb` notebook to recreate the plots (labeled figures) found in the paper. 81 | 82 | 1. In the run script, update `DATASET_DIR` to point to the path of the downloaded datasets (the directory containing the `MM` directory, which in turn contains subdirectories with `.mtx` files): [scripts/run.sh](https://github.com/gunrock/loops/blob/main/scripts/run.sh). Additionally, you may change `DATASET_FILES_NAME`, the file containing the list of all the datasets (default points to [datasets/suitesparse.txt](https://github.com/gunrock/loops/blob/main/datasets/suitesparse.txt)). 83 | 2.
Fire up the complete run using `run.sh` found in the `scripts` directory: `cd scripts && ./run.sh`. Note that one complete run can take up to 3 days (it goes over the entire SuiteSparse Matrix Collection four times with four different algorithms; the main bottleneck is loading files from disk). 84 | 3. **Warning!** Some runs on the matrices are expected to fail because they are not in proper MatrixMarket format although labeled as `.mtx`. These matrices, and the ones that do not fit on the GPU, will result in runtime exceptions or `offset_t` type overflow and can be safely ignored. 85 | 4. To run *N* datasets, simply adjust the stop condition here (default set to `10`): [scripts/run.sh#L22](https://github.com/gunrock/loops/blob/main/scripts/run.sh#L22), or remove this if-condition entirely to run on all available `.mtx` files: [scripts/run.sh#L22-L26](https://github.com/gunrock/loops/blob/main/scripts/run.sh#L22-L26). 86 | 87 | The expected output from the above runs is a set of `csv` files in the same directory as `run.sh`; these can replace the existing `csv` files within `plots/data`, and a [python jupyter notebook](https://jupyter.org/install) can then be fired up to evaluate the results. The notebook includes instructions on generating the plots. See sample output of one of the `csv` files below: 88 | 89 | ```csv 90 | kernel,dataset,rows,cols,nnzs,elapsed 91 | merge-path,144,144649,144649,2148786,0.0720215 92 | merge-path,08blocks,300,300,592,0.0170898 93 | merge-path,1138_bus,1138,1138,4054,0.0200195 94 | ``` 95 | 96 | ## How to Cite Loops 97 | Thank you for citing our work. 98 | 99 | ```bibtex 100 | @inproceedings{Osama:2023:APM, 101 | author = {Muhammad Osama and Serban D. Porumbescu and John D. Owens}, 102 | title = {A Programming Model for {GPU} Load Balancing}, 103 | booktitle = {Proceedings of the 28th ACM SIGPLAN Symposium on 104 | Principles and Practice of Parallel Programming}, 105 | series = {PPoPP 2023}, 106 | year = 2023, 107 | month = feb # "\slash " # mar, 108 | acceptance = {31 of 131 submissions, 23.7\%}, 109 | code = {https://github.com/gunrock/loops}, 110 | doi = {10.1145/3572848.3577434}, 111 | } 112 | ``` 113 | 114 | ```bibtex 115 | @software{Osama:2022:LAP:Code, 116 | author = {Muhammad Osama and Serban D. Porumbescu and John D.
Owens}, 117 | title = {Loops: A Programming Model for GPU Load Balancing}, 118 | month = dec, 119 | year = 2022, 120 | publisher = {Zenodo}, 121 | version = {v0.1.0-alpha}, 122 | doi = {10.5281/zenodo.7465053}, 123 | url = {https://doi.org/10.5281/zenodo.7465053} 124 | } 125 | ``` 126 | -------------------------------------------------------------------------------- /benchmarks/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # begin /* Add benchmarks' subdirectories */ 2 | add_subdirectory(spmv) 3 | # end /* Add benchmarks' subdirectories */ -------------------------------------------------------------------------------- /benchmarks/spmv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(BENCHMARK_SOURCES 2 | work_oriented.cu 3 | ) 4 | 5 | foreach(SOURCE IN LISTS BENCHMARK_SOURCES) 6 | get_filename_component(BENCHMARK_NAME ${SOURCE} NAME_WLE) 7 | string(PREPEND BENCHMARK_NAME "loops.bench.spmv.") 8 | add_executable(${BENCHMARK_NAME} ${SOURCE}) 9 | target_link_libraries(${BENCHMARK_NAME} 10 | PRIVATE loops 11 | # PRIVATE nvbench::main 12 | PRIVATE nvbench::nvbench 13 | ) 14 | 15 | set_target_properties(${BENCHMARK_NAME} 16 | PROPERTIES 17 | CXX_STANDARD 17 18 | CUDA_STANDARD 17 19 | CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 20 | ) 21 | 22 | message(STATUS "Benchmark Added: ${BENCHMARK_NAME}") 23 | endforeach() -------------------------------------------------------------------------------- /benchmarks/spmv/parameters.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file parameters.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief SpMV + NVBench parameters. 5 | * @version 0.1 6 | * @date 2022-07-18 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | std::string filename; 22 | 23 | struct parameters_t { 24 | /** 25 | * @brief Construct a new parameters object and parse command line arguments. 26 | * 27 | * @param argc Number of command line arguments. 28 | * @param argv Command line arguments. 29 | */ 30 | parameters_t(int argc, char** argv) 31 | : m_options(argv[0], "SPMV Benchmarking"), m_argc(argc) { 32 | m_options.allow_unrecognised_options(); 33 | // Add command line options 34 | m_options.add_options()("h,help", "Print help") // help 35 | ("m,market", "Matrix file", 36 | cxxopts::value()); // mtx 37 | 38 | // Parse command line arguments. 39 | auto result = m_options.parse(argc, argv); 40 | 41 | // Print help if requested 42 | if (result.count("help")) { 43 | m_help = true; 44 | std::cout << m_options.help({""}); 45 | std::cout << " [optional nvbench args]" << std::endl << std::endl; 46 | const char* argh[1] = {"-h"}; 47 | NVBENCH_MAIN_BODY(1, argh); 48 | } 49 | 50 | // Get matrix market file or error if not specified. 51 | else { 52 | if (result.count("market") == 1) { 53 | this->m_filename = result["market"].as(); 54 | filename = m_filename; 55 | if (!loops::is_market(m_filename)) { 56 | std::cout << m_options.help({""}); 57 | std::cout << " [optional nvbench args]" << std::endl << std::endl; 58 | std::exit(0); 59 | } 60 | 61 | // Remove loops parameters and pass the rest to nvbench. 
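// (When the flag matches, `i` is advanced once here and once more by the
// loop increment, so both "-m/--market" and its value are dropped;
// nvbench_argc() below subtracts 2 accordingly.)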
62 | for (int i = 0; i < argc; i++) { 63 | if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) { 64 | i++; 65 | continue; 66 | } 67 | m_args.push_back(argv[i]); 68 | } 69 | 70 | } else { 71 | std::cout << m_options.help({""}); 72 | std::cout << " [optional nvbench args]" << std::endl << std::endl; 73 | std::exit(0); 74 | } 75 | } 76 | } 77 | 78 | /// Helpers for NVBENCH_MAIN_BODY call. 79 | int nvbench_argc() { return m_argc - 2; } 80 | auto nvbench_argv() { return m_args.data(); } 81 | 82 | private: 83 | std::string m_filename; ///< Matrix market file. 84 | cxxopts::Options m_options; ///< Command line options. 85 | std::vector m_args; ///< Command line arguments. 86 | bool m_help = false; ///< Help flag. 87 | int m_argc; ///< Number of command line arguments. 88 | }; -------------------------------------------------------------------------------- /benchmarks/spmv/work_oriented.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * @file work_oriented.cu 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Benchmark for Sparse Matrix-Vector Multiplication. 5 | * @version 0.1 6 | * @date 2022-07-18 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #include "parameters.hxx" 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #define LOOPS_CUPTI_SUPPORTED 0 19 | 20 | using namespace loops; 21 | 22 | template 23 | void work_oriented_bench(nvbench::state& state, nvbench::type_list) { 24 | using index_t = int; 25 | using offset_t = int; 26 | using type_t = value_t; 27 | 28 | matrix_market_t mtx; 29 | csr_t csr(mtx.load(filename)); 30 | 31 | vector_t x(csr.rows); 32 | vector_t y(csr.rows); 33 | 34 | generate::random::uniform_distribution(x.begin(), x.end(), type_t(1.0), 35 | type_t(10.0)); 36 | 37 | #if LOOPS_CUPTI_SUPPORTED 38 | /// Add CUPTI metrics to collect for the state. 39 | state.collect_dram_throughput(); 40 | state.collect_l1_hit_rates(); 41 | state.collect_l2_hit_rates(); 42 | state.collect_loads_efficiency(); 43 | state.collect_stores_efficiency(); 44 | #endif 45 | 46 | /// Execute the benchmark. 
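/// Note: `nvbench::exec_tag::sync` tells NVBench that the lambda synchronizes
/// the device (the SpMV call presumably blocks), so NVBench disables its
/// batched measurements for this benchmark.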
47 | state.exec(nvbench::exec_tag::sync, [&csr, &x, &y](nvbench::launch& launch) { 48 | algorithms::spmv::work_oriented(csr, x, y); 49 | }); 50 | } 51 | 52 | // Define a type_list to use for the type axis: 53 | using value_types = nvbench::type_list; 54 | NVBENCH_BENCH_TYPES(work_oriented_bench, NVBENCH_TYPE_AXES(value_types)); 55 | 56 | int main(int argc, char** argv) { 57 | parameters_t params(argc, argv); 58 | NVBENCH_MAIN_BODY(params.nvbench_argc(), params.nvbench_argv()); 59 | } -------------------------------------------------------------------------------- /cmake/FetchCXXOpts.cmake: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | set(FETCHCONTENT_QUIET ON) 3 | 4 | message(STATUS "Cloning External Project: CXXOPTS") 5 | get_filename_component(FC_BASE "../externals" 6 | REALPATH BASE_DIR "${CMAKE_BINARY_DIR}") 7 | set(FETCHCONTENT_BASE_DIR ${FC_BASE}) 8 | 9 | FetchContent_Declare( 10 | cxxopts 11 | GIT_REPOSITORY https://github.com/jarro2783/cxxopts.git 12 | GIT_TAG v3.0.0 13 | ) 14 | 15 | FetchContent_GetProperties(cxxopts) 16 | if(NOT cxxopts_POPULATED) 17 | FetchContent_Populate( 18 | cxxopts 19 | ) 20 | endif() 21 | set(CXXOPTS_INCLUDE_DIR "${cxxopts_SOURCE_DIR}/include") -------------------------------------------------------------------------------- /cmake/FetchColors.cmake: -------------------------------------------------------------------------------- 1 | if(NOT WIN32) 2 | string(ASCII 27 Esc) 3 | set(ColourReset "${Esc}[m") 4 | set(ColourBold "${Esc}[1m") 5 | set(Red "${Esc}[31m") 6 | set(Green "${Esc}[32m") 7 | set(Yellow "${Esc}[33m") 8 | set(Blue "${Esc}[34m") 9 | set(Magenta "${Esc}[35m") 10 | set(Cyan "${Esc}[36m") 11 | set(White "${Esc}[37m") 12 | set(BoldRed "${Esc}[1;31m") 13 | set(BoldGreen "${Esc}[1;32m") 14 | set(BoldYellow "${Esc}[1;33m") 15 | set(BoldBlue "${Esc}[1;34m") 16 | set(BoldMagenta "${Esc}[1;35m") 17 | set(BoldCyan "${Esc}[1;36m") 18 | set(BoldWhite "${Esc}[1;37m") 19 | endif() -------------------------------------------------------------------------------- /cmake/FetchGoogleTest.cmake: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | set(FETCHCONTENT_QUIET ON) 3 | 4 | message(STATUS "Cloning External Project: GoogleTests") 5 | get_filename_component(FC_BASE "../externals" 6 | REALPATH BASE_DIR "${CMAKE_BINARY_DIR}") 7 | set(FETCHCONTENT_BASE_DIR ${FC_BASE}) 8 | 9 | FetchContent_Declare( 10 | googletest 11 | GIT_REPOSITORY https://github.com/google/googletest.git 12 | GIT_TAG release-1.11.0 13 | ) 14 | 15 | # For Windows: Prevent overriding the parent project's compiler/linker settings 16 | if (MSVC) 17 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 18 | endif() 19 | 20 | FetchContent_MakeAvailable(googletest) 21 | add_library(gtest::main ALIAS gtest_main) 22 | 23 | -------------------------------------------------------------------------------- /cmake/FetchModernGPU.cmake: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | set(FETCHCONTENT_QUIET ON) 3 | 4 | message(STATUS "Cloning External Project: ModernGPU") 5 | get_filename_component(FC_BASE "../externals" 6 | REALPATH BASE_DIR "${CMAKE_BINARY_DIR}") 7 | set(FETCHCONTENT_BASE_DIR ${FC_BASE}) 8 | 9 | FetchContent_Declare( 10 | moderngpu 11 | GIT_REPOSITORY https://github.com/moderngpu/moderngpu.git 12 | GIT_TAG master 13 | ) 14 | 15 | FetchContent_GetProperties(moderngpu) 16 | if(NOT moderngpu_POPULATED) 17 | 
FetchContent_Populate( 18 | moderngpu 19 | ) 20 | endif() 21 | set(MODERNGPU_INCLUDE_DIR "${moderngpu_SOURCE_DIR}/src") 22 | -------------------------------------------------------------------------------- /cmake/FetchNVBench.cmake: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | set(FETCHCONTENT_QUIET ON) 3 | 4 | message(STATUS "Cloning External Project: NVBench") 5 | get_filename_component(FC_BASE "../externals" 6 | REALPATH BASE_DIR "${CMAKE_BINARY_DIR}") 7 | set(FETCHCONTENT_BASE_DIR ${FC_BASE}) 8 | 9 | FetchContent_Declare( 10 | nvbench 11 | GIT_REPOSITORY https://github.com/NVIDIA/nvbench.git 12 | GIT_TAG main 13 | ) 14 | 15 | FetchContent_GetProperties(nvbench) 16 | if(NOT nvbench_POPULATED) 17 | FetchContent_Populate( 18 | nvbench 19 | ) 20 | endif() 21 | 22 | # Exposing nvbench's source and include directory 23 | set(NVBENCH_INCLUDE_DIR "${nvbench_SOURCE_DIR}") 24 | set(NVBENCH_BUILD_DIR "${nvbench_BINARY_DIR}") 25 | 26 | # Add subdirectory ::nvbench 27 | add_subdirectory(${NVBENCH_INCLUDE_DIR} ${NVBENCH_BUILD_DIR}) -------------------------------------------------------------------------------- /cmake/FetchThrustCUB.cmake: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | set(FETCHCONTENT_QUIET ON) 3 | 4 | message(STATUS "Cloning External Project: Thrust and CUB") 5 | get_filename_component(FC_BASE "../externals" 6 | REALPATH BASE_DIR "${CMAKE_BINARY_DIR}") 7 | set(FETCHCONTENT_BASE_DIR ${FC_BASE}) 8 | 9 | FetchContent_Declare( 10 | thrust 11 | GIT_REPOSITORY https://github.com/thrust/thrust.git 12 | GIT_TAG 1.17.2 13 | ) 14 | 15 | FetchContent_GetProperties(thrust) 16 | if(NOT thrust_POPULATED) 17 | FetchContent_Populate( 18 | thrust 19 | ) 20 | endif() 21 | set(THRUST_INCLUDE_DIR "${thrust_SOURCE_DIR}") 22 | # Windows doesn't support symblink, so make sure we link to the real library. 23 | set(CUB_INCLUDE_DIR "${thrust_SOURCE_DIR}/dependencies/cub") -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker Instructions 2 | ## Using the provided `dockerfile`, simply run the following commands. 3 | ```bash 4 | docker build . -t loops 5 | docker run -it loops:latest /bin/bash 6 | ``` 7 | 8 | ## Alternatively pull directly from [hub.docker.com](https://hub.docker.com/repository/docker/neoblizz/loops). 9 | 10 | ```bash 11 | docker pull neoblizz/loops:v0.1 12 | ``` 13 | 14 | ## Once within the docker... 
15 | ```bash 16 | cd loops/build 17 | bin/loops.spmv.merge_path -m ../datasets/chesapeake/chesapeake.mtx 18 | ``` -------------------------------------------------------------------------------- /docker/dockerfile: -------------------------------------------------------------------------------- 1 | # Or directly get from docker: 2 | # docker pull neoblizz/loops:v0.1 3 | FROM nvidia/cuda:11.7.0-devel-ubuntu20.04 4 | 5 | # Install dependencies 6 | RUN apt-get update 7 | RUN apt-get install -y build-essential 8 | RUN apt-get install -y wget vim git gcc g++ 9 | 10 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-linux-x86_64.sh 11 | RUN chmod +x ./cmake-3.25.0-linux-x86_64.sh 12 | RUN mkdir /opt/cmake-3.25.0/ 13 | RUN ./cmake-3.25.0-linux-x86_64.sh --skip-license --prefix=/opt/cmake-3.25.0 14 | RUN ln -s /opt/cmake-3.25.0/bin/* /usr/local/bin/ 15 | 16 | 17 | # Build and install 18 | RUN git clone https://github.com/gunrock/loops.git 19 | RUN cd loops && mkdir build && cd build && cmake .. && make -j$(nproc) -------------------------------------------------------------------------------- /docs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunrock/loops/6169cf64d06e17b24b7a687fe0baf7ba2347002b/docs/.gitkeep -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /docs/abstraction.md: -------------------------------------------------------------------------------- 1 | [README](/README.md) > **Abstraction** 2 | 3 | # Abstraction 4 | 5 | The simple idea behind our load-balancing abstraction is to represent sparse formats as atoms, tiles, and set functional abstraction elements, described in the "Function and Set Notation" section below. Once represented as such, we can develop load-balancing algorithms that create balanced ranges of atoms and tiles and map them to processor ids. This information can be abstracted to the user with a simple API (such as ranged-for-loops) to capture user-defined computations. Some benefits of this approach are: (1) the user-defined computation remains largely the same for many different static or dynamic load-balancing schedules, (2) these schedules can now be extended to other computations, and (3) code complexity is dramatically reduced. 6 | 7 | ## As function and set notation. 8 | 9 | A sparse-irregular problem $S$ is made of many subsets called tiles, $T$. $T_i$ is defined as a collection of atoms, where an atom is the smallest possible processing element (for example, a nonzero element within a sparse matrix). Using a scheduler, our abstraction's goal is to create a new set, $M$, which maps the processor ids (thread ids for a given kernel execution) $P_{id}$ to a group of subsets of $T$: $M = \{ P_{id}, T_i ... T_j \}$ is a map of processor ids to tiles, and the scheduler $L$ is responsible for creating these maps: $L(S) = \{ M_0, ..., M_m\}$. 10 | 11 | ## As three domains: data, schedule and computation.
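The illustration below separates these three domains. As a concrete reference point, the following minimal sketch (plain CUDA, hypothetical code rather than the `loops` API) marks where each domain lives in a conventional thread-mapped SpMV kernel; here the schedule is hard-coded into the loop structure, which is exactly the coupling our abstraction removes.

```cpp
#include <cstddef>

// A sketch only: data (CSR arrays), schedule (row-per-thread mapping), and
// computation (per-nonzero work) all live in a single kernel.
template <typename index_t, typename offset_t, typename type_t>
__global__ void spmv_thread_mapped(std::size_t rows,
                                   const offset_t* offsets,  // Data domain:
                                   const index_t* indices,   // CSR arrays and
                                   const type_t* values,     // dense vectors.
                                   const type_t* x,
                                   type_t* y) {
  // Schedule domain: one row (tile) per thread, grid-stride over all rows.
  for (std::size_t row = blockIdx.x * blockDim.x + threadIdx.x; row < rows;
       row += std::size_t(gridDim.x) * blockDim.x) {
    // Computation domain: user-defined work over the row's atoms (nonzeros).
    type_t sum = 0;
    for (offset_t nz = offsets[row]; nz < offsets[row + 1]; ++nz)
      sum += values[nz] * x[indices[nz]];
    y[row] = sum;
  }
}
```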
12 | 13 | ![illustration](https://user-images.githubusercontent.com/9790745/168728299-6b125b44-894a-49bb-92fd-ee85aaa80ae4.png) -------------------------------------------------------------------------------- /docs/background.md: -------------------------------------------------------------------------------- 1 | [README](/README.md) > **Background** 2 | 3 | # Background 4 | **DARPA** announced [**Software Defined Hardware (SDH)**](https://www.darpa.mil/program/software-defined-hardware)[^2], a program that aims "*to build runtime-reconfigurable hardware and software that enables near ASIC performance without sacrificing programmability for data-intensive algorithms.*" **NVIDIA** is leading the charge on the program, internally called [**Symphony**](https://blogs.nvidia.com/blog/2018/07/24/darpa-research-post-moores-law/). Our work is a small but important piece of this larger puzzle. The "data-intensive algorithms" part of the program includes domains like Machine Learning, Graph Processing, Sparse-Matrix-Vector algorithms, etc., where there is a large amount of data available to be processed. The problems being addressed are either already based on irregular data structures and workloads, or are trending towards them (such as sparse machine learning problems). For these irregular workload computations to be successful, we require efficient load-balancing schemes targeting specialized hardware such as GPUs or Symphony. 5 | 6 | [^2]: [DARPA Selects Teams to Unleash Power of Specialized, Reconfigurable Computing Hardware](https://www.darpa.mil/news-events/2018-07-24a) 7 | 8 | ## A small (and important) piece of a larger puzzle. 9 | The predominant approach today to addressing irregularity is to build application-dependent solutions. These are not portable between applications. This is a shame, because we believe the underlying techniques currently used to address irregularity have the potential to be expressed in a generic, portable, and powerful way. We build a generic open-source library for load balancing that exposes high-performance, intuitive load-balancing strategies to any irregular-parallel application. 10 | 11 | ## Load-balancing problem, and a silver lining. 12 | Today's GPUs follow a Single Instruction Multiple Data (SIMD) model, where each work component (for example, a node in a graph) is mapped to a single thread. Each thread runs a copy of the program, and threads run in parallel (this is a simplified explanation; there are other work units in NVIDIA's GPUs such as warps, cooperative thread arrays, and streaming multiprocessors). Let's take a graph problem as an example to understand load imbalance. One key operation in graph problems is traversal: given a set of vertices, a traversal operation visits all the neighboring vertices of the input. If we naïvely map each input vertex to a GPU thread, we can end up with a massive imbalance of work, as some threads within the GPU will get a lot more work than others, causing inefficient utilization of hardware resources. In our example, this could happen for a social-network graph where one input vertex may have millions of connections while other input vertices in the same traversal pass may only have tens of neighbors. 13 | 14 | The silver lining here is that there are more intelligent workload mappings that address this load-imbalance problem for various types of graphs and other irregular workloads. We extend these previously tightly-coupled scheduling algorithms to an abstraction. The short sketch below illustrates the naïve mapping described above.
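The following is a minimal sketch of that naïve vertex-per-thread mapping (illustrative plain CUDA, not code from this library; the kernel name and the CSR-style `offsets`/`indices` graph arrays are assumptions for this example). Each thread's running time is proportional to its vertex's degree, so one high-degree vertex turns its thread into a straggler while the rest of the warp and block sit idle.

```cpp
// Naive traversal: thread v visits every neighbor of vertex v.
// A vertex with millions of neighbors and a vertex with ten both get exactly
// one thread, which is precisely the load imbalance discussed above.
__global__ void naive_traversal(int num_vertices,
                                const int* offsets,  // CSR-style row offsets
                                const int* indices,  // concatenated neighbor lists
                                int* visited) {
  int v = blockIdx.x * blockDim.x + threadIdx.x;
  if (v >= num_vertices)
    return;
  // Work per thread = degree of v = offsets[v + 1] - offsets[v].
  for (int e = offsets[v]; e < offsets[v + 1]; ++e)
    visited[indices[e]] = 1;  // "Visit" the neighbor.
}
```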
15 | -------------------------------------------------------------------------------- /docs/loadbalancing_api.md: -------------------------------------------------------------------------------- 1 | [README](/README.md) > **Load-Balancing API** 2 | 3 | # Load-Balancing API 4 | 5 | SpMV problem-specific kernel parameters. 6 | 7 | ```cpp 8 | template 12 | __global__ void __launch_bounds__(threads_per_block, 2) 13 | spmv(std::size_t rows, 14 | std::size_t cols, 15 | std::size_t nnz, 16 | offset_t* offsets, 17 | index_t* indices, 18 | const type_t* values, 19 | const type_t* x, 20 | type_t* y) { 21 | ``` 22 | 23 | ### (1) Define and configure load-balancing schedule. 24 | Allocates any temporary memory required for load-balancing, as well as constructs a schedule per processors partition (defined using cooperative groups). 25 | ```cpp 26 | using setup_t = schedule::setup; 28 | 29 | /// Allocate temporary storage for the schedule. 30 | using storage_t = typename setup_t::storage_t; 31 | __shared__ storage_t temporary_storage; 32 | 33 | /// Construct the schedule. 34 | setup_t config(temporary_storage, offsets, rows, nnz); 35 | auto p = config.partition(); 36 | ``` 37 | 38 | ### (2) Load-balanced ranged loops. (also see; [C++ ranges](https://en.cppreference.com/w/cpp/header/ranges)) 39 | In this example, we define two iteration spaces; virtual and real. Virtual spaces allow us to balance atoms and tiles onto the processor ids and link directly to the real iteration space, which returns the exact atom or tile being processed. The code below loops over all balanced number of atoms fetches the tile corresponding to the atom being processed and allows user to define their computation. 40 | ```cpp 41 | for (auto virtual_atom : config.atom_accessor(p)) { 42 | auto virtual_tile = config.tile_accessor(virtual_atom, p); 43 | 44 | if (!(config.is_valid_accessor(virtual_tile, p))) 45 | continue; 46 | 47 | auto row = config.tile_id(virtual_tile, p); 48 | 49 | auto nz_idx = config.atom_id(virtual_atom, row, virtual_tile, p); 50 | ``` 51 | 52 | ### (3) User-defined computation. 53 | Once the user has access to the atom, tile, and the processor id, they implement the desired computation on the given tuple. In this example, we use a simple `atomicAdd` to perform SpMV (can be improved). 54 | ```cpp 55 | atomicAdd(&(y[row]), values[nz_idx] * x[indices[nz_idx]]); 56 | } 57 | } 58 | ``` 59 | 60 | [**work_oriented.cuh**](https://github.com/neoblizz/loops/blob/main/include/loops/algorithms/spmv/work_oriented.cuh) (another example) 61 | 62 | ```cpp 63 | #include 64 | 65 | template 69 | __global__ void __launch_bounds__(threads_per_block, 2) 70 | __work_oriented(std::size_t rows, 71 | std::size_t cols, 72 | std::size_t nnz, 73 | offset_t* offsets, 74 | index_t* indices, 75 | const type_t* values, 76 | const type_t* x, 77 | type_t* y) { 78 | using setup_t = 79 | schedule::setup; 81 | 82 | setup_t config(offsets, rows, nnz); 83 | auto map = config.init(); 84 | 85 | /// Accumulate the complete tiles. 86 | type_t sum = 0; 87 | for (auto row : config.tiles(map)) { 88 | for (auto nz : config.atoms(row, map)) { 89 | sum += values[nz] * x[indices[nz]]; 90 | } 91 | y[row] = sum; 92 | sum = 0; 93 | } 94 | 95 | /// Process remaining tiles. 96 | for (auto row : config.remainder_tiles(map)) { 97 | for (auto nz : config.remainder_atoms(map)) { 98 | sum += values[nz] * x[indices[nz]]; 99 | } 100 | /// Accumulate the remainder. 
101 | if (sum != 0) 102 | atomicAdd(&(y[row]), sum); 103 | } 104 | } 105 | ``` -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # begin /* Add examples' subdirectories */ 2 | add_subdirectory(range) 3 | add_subdirectory(saxpy) 4 | add_subdirectory(spmv) 5 | add_subdirectory(spmm) 6 | # end /* Add examples' subdirectories */ -------------------------------------------------------------------------------- /examples/range/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # begin /* Set the application name. */ 2 | set(APPLICATION_NAME range) 3 | # end /* Set the application name. */ 4 | 5 | # begin /* Add CUDA executables */ 6 | add_executable(${APPLICATION_NAME}) 7 | 8 | set(SOURCE_LIST 9 | ${APPLICATION_NAME}.cu 10 | ) 11 | 12 | target_sources(${APPLICATION_NAME} PRIVATE ${SOURCE_LIST}) 13 | target_link_libraries(${APPLICATION_NAME} 14 | PRIVATE loops 15 | # PRIVATE nvToolsExt 16 | ) 17 | 18 | set_target_properties(${APPLICATION_NAME} 19 | PROPERTIES 20 | CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 21 | ) # XXX: Find a better way to inherit loops properties. 22 | 23 | message(STATUS "Example Added: ${APPLICATION_NAME}") 24 | # end /* Add CUDA executables */ -------------------------------------------------------------------------------- /examples/range/range.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | using namespace loops; 7 | 8 | int main() { 9 | for (auto i : range(1, 5)) 10 | std::cout << i << std::endl; 11 | 12 | for (auto u : range(0u)) 13 | if (u == 3u) 14 | break; 15 | else 16 | std::cout << u << std::endl; 17 | 18 | for (auto c : range('a', 'd')) 19 | std::cout << c << std::endl; 20 | 21 | for (auto u : range(20u, 29u).step(2u)) 22 | std::cout << u << std::endl; 23 | 24 | for (auto i : range(100).step(-3)) 25 | if (i < 90) 26 | break; 27 | else 28 | std::cout << i << std::endl; 29 | 30 | std::vector x{1, 2, 3}; 31 | for (auto i : indices(x)) 32 | std::cout << i << std::endl; 33 | 34 | for (auto i : indices({"foo", "bar"})) 35 | std::cout << i << std::endl; 36 | 37 | for (auto i : indices("foobar").step(2)) 38 | std::cout << i << std::endl; 39 | } -------------------------------------------------------------------------------- /examples/saxpy/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # begin /* Set the application name. */ 2 | set(APPLICATION_NAME saxpy) 3 | # end /* Set the application name. */ 4 | 5 | # begin /* Add CUDA executables */ 6 | add_executable(${APPLICATION_NAME}) 7 | 8 | set(SOURCE_LIST 9 | ${APPLICATION_NAME}.cu 10 | ) 11 | 12 | target_sources(${APPLICATION_NAME} PRIVATE ${SOURCE_LIST}) 13 | target_link_libraries(${APPLICATION_NAME} 14 | PRIVATE loops 15 | # PRIVATE nvToolsExt 16 | ) 17 | 18 | set_target_properties(${APPLICATION_NAME} 19 | PROPERTIES 20 | CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 21 | ) # XXX: Find a better way to inherit loops properties. 
22 | 23 | message(STATUS "Example Added: ${APPLICATION_NAME}") 24 | # end /* Add CUDA executables */ -------------------------------------------------------------------------------- /examples/saxpy/saxpy.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * @file saxpy.cu 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Simple CUDA example of saxpy (Single-Precision A·X Plus Y) using 5 | * ranged loops. 6 | * @version 0.1 7 | * @date 2022-02-02 8 | * 9 | * @copyright Copyright (c) 2022 10 | * 11 | */ 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | template 19 | __global__ void saxpy(int n, type_t a, type_t* x, type_t* y) { 20 | /// Equivalent to: 21 | /// i = blockIdx.x * blockDim.x + threadIdx.x; (init) 22 | /// i < n; (boundary condition) 23 | /// i += gridDim.x * blockDim.x. (step) 24 | for (auto i : loops::grid_stride_range(0, n)) { 25 | y[i] += a * x[i]; 26 | } 27 | } 28 | 29 | int main() { 30 | using type_t = float; 31 | constexpr int N = 1 << 20; 32 | constexpr type_t alpha = 2.0f; 33 | 34 | // Create thrust device vectors. 35 | loops::vector_t x(N); 36 | loops::vector_t y(N); 37 | 38 | // Generate random numbers between [0, 1]. 39 | loops::generate::random::uniform_distribution(x.begin(), x.end(), 0, 1); 40 | loops::generate::random::uniform_distribution(y.begin(), y.end(), 0, 1); 41 | 42 | // Launch kernel with a given configuration. 43 | constexpr std::size_t threads_per_block = 256; 44 | std::size_t blocks_per_grid = (N + threads_per_block - 1) / threads_per_block; 45 | saxpy<<>>(N, alpha, x.data().get(), 46 | y.data().get()); 47 | 48 | // Print the x and y vectors. 49 | std::cout << "x = "; 50 | thrust::copy(x.begin(), (x.size() < 10) ? x.end() : x.begin() + 10, 51 | std::ostream_iterator(std::cout, " ")); 52 | std::cout << std::endl; 53 | 54 | std::cout << "y = "; 55 | thrust::copy(y.begin(), (y.size() < 10) ? y.end() : y.begin() + 10, 56 | std::ostream_iterator(std::cout, " ")); 57 | std::cout << std::endl; 58 | } -------------------------------------------------------------------------------- /examples/spmm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # begin /* Add application */ 2 | set(SOURCES 3 | thread_mapped.cu 4 | ) 5 | 6 | foreach(SOURCE IN LISTS SOURCES) 7 | get_filename_component(TEST_NAME ${SOURCE} NAME_WLE) 8 | add_executable(loops.spmm.${TEST_NAME} ${SOURCE}) 9 | target_link_libraries(loops.spmm.${TEST_NAME} PRIVATE loops) 10 | set_target_properties(loops.spmm.${TEST_NAME} 11 | PROPERTIES 12 | CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 13 | ) 14 | message(STATUS "Example Added: loops.spmm.${TEST_NAME}") 15 | endforeach() 16 | # end /* Add application */ -------------------------------------------------------------------------------- /examples/spmm/helpers.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file helpers.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Header file for SpMM. 
5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | #include 27 | 28 | struct parameters_t { 29 | std::string filename; 30 | bool validate; 31 | bool verbose; 32 | cxxopts::Options options; 33 | 34 | /** 35 | * @brief Construct a new parameters object and parse command line arguments. 36 | * 37 | * @param argc Number of command line arguments. 38 | * @param argv Command line arguments. 39 | */ 40 | parameters_t(int argc, char** argv) 41 | : options(argv[0], "Sparse Matrix-Matrix Multiplication") { 42 | // Add command line options 43 | options.add_options()("h,help", "Print help") // help 44 | ("m,market", "Matrix file", cxxopts::value()) // mtx 45 | ("validate", "CPU validation") // validate 46 | ("v,verbose", "Verbose output"); // verbose 47 | 48 | // Parse command line arguments 49 | auto result = options.parse(argc, argv); 50 | 51 | if (result.count("help") || (result.count("market") == 0)) { 52 | std::cout << options.help({""}) << std::endl; 53 | std::exit(0); 54 | } 55 | 56 | if (result.count("market") == 1) { 57 | filename = result["market"].as(); 58 | if (loops::is_market(filename)) { 59 | } else { 60 | std::cout << options.help({""}) << std::endl; 61 | std::exit(0); 62 | } 63 | } else { 64 | std::cout << options.help({""}) << std::endl; 65 | std::exit(0); 66 | } 67 | 68 | if (result.count("validate") == 1) { 69 | validate = true; 70 | } else { 71 | validate = false; 72 | } 73 | 74 | if (result.count("verbose") == 1) { 75 | verbose = true; 76 | } else { 77 | verbose = false; 78 | } 79 | } 80 | }; 81 | -------------------------------------------------------------------------------- /examples/spmm/thread_mapped.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * @file thread_mapped.cu 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Matrix Multiplication example. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #include "helpers.hxx" 13 | #include 14 | 15 | using namespace loops; 16 | 17 | int main(int argc, char** argv) { 18 | using index_t = int; 19 | using offset_t = int; 20 | using type_t = float; 21 | 22 | // ... I/O parameters, mtx, etc. 23 | parameters_t parameters(argc, argv); 24 | 25 | matrix_market_t mtx; 26 | csr_t csr(mtx.load(parameters.filename)); 27 | 28 | // Input and output matrices. 29 | std::size_t n = 10; 30 | matrix_t B(csr.cols, n); 31 | matrix_t C(csr.rows, n); 32 | 33 | // Generate random numbers between [0, 10]. 34 | generate::random::uniform_distribution(B.m_data.begin(), B.m_data.end(), 1, 35 | 10); 36 | 37 | // Run the benchmark. 
38 | util::timer_t timer; 39 | timer.start(); 40 | algorithms::spmm::thread_mapped(csr, B, C); 41 | timer.stop(); 42 | 43 | std::cout << "Elapsed (ms):\t" << timer.milliseconds() << std::endl; 44 | } -------------------------------------------------------------------------------- /examples/spmv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # begin /* Add application */ 2 | set(SOURCES 3 | original.cu 4 | thread_mapped.cu 5 | group_mapped.cu 6 | work_oriented.cu 7 | merge_path.cu 8 | ) 9 | 10 | foreach(SOURCE IN LISTS SOURCES) 11 | get_filename_component(TEST_NAME ${SOURCE} NAME_WLE) 12 | string(PREPEND TEST_NAME "loops.spmv.") 13 | add_executable(${TEST_NAME} ${SOURCE}) 14 | target_link_libraries(${TEST_NAME} 15 | PRIVATE loops 16 | ) 17 | 18 | set_target_properties(${TEST_NAME} 19 | PROPERTIES 20 | CXX_STANDARD 17 21 | CUDA_STANDARD 17 22 | CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 23 | ) 24 | 25 | message(STATUS "Example Added: ${TEST_NAME}") 26 | endforeach() 27 | # end /* Add application */ -------------------------------------------------------------------------------- /examples/spmv/group_mapped.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * @file group_mapped.cu 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication example. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #include "helpers.hxx" 13 | #include 14 | 15 | using namespace loops; 16 | 17 | int main(int argc, char** argv) { 18 | using index_t = int; 19 | using offset_t = int; 20 | using type_t = float; 21 | 22 | // ... I/O parameters, mtx, etc. 23 | parameters_t parameters(argc, argv); 24 | matrix_market_t mtx; 25 | csr_t csr(mtx.load(parameters.filename)); 26 | 27 | // Input and output vectors. 28 | vector_t x(csr.cols); 29 | vector_t y(csr.rows); 30 | 31 | // Generate random numbers between [0, 1]. 32 | generate::random::uniform_distribution(x.begin(), x.end(), 1, 10); 33 | 34 | // Run the benchmark. 35 | util::timer_t timer; 36 | timer.start(); 37 | algorithms::spmv::group_mapped(csr, x, y); 38 | timer.stop(); 39 | 40 | std::cout << "group_mapped," << mtx.dataset << "," << csr.rows << "," 41 | << csr.cols << "," << csr.nnzs << "," << timer.milliseconds() 42 | << std::endl; 43 | 44 | // Validation. 45 | if (parameters.validate) 46 | cpu::validate(parameters, csr, x, y); 47 | } -------------------------------------------------------------------------------- /examples/spmv/helpers.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file helpers.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Header file for SpMV. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | #include 27 | 28 | struct parameters_t { 29 | std::string filename; 30 | bool validate; 31 | bool verbose; 32 | cxxopts::Options options; 33 | 34 | /** 35 | * @brief Construct a new parameters object and parse command line arguments. 36 | * 37 | * @param argc Number of command line arguments. 38 | * @param argv Command line arguments. 
39 | */ 40 | parameters_t(int argc, char** argv) 41 | : options(argv[0], "Sparse Matrix-Vector Multiplication") { 42 | // Add command line options 43 | options.add_options()("h,help", "Print help") // help 44 | ("m,market", "Matrix file", cxxopts::value()) // mtx 45 | ("validate", "CPU validation") // validate 46 | ("v,verbose", "Verbose output"); // verbose 47 | 48 | // Parse command line arguments 49 | auto result = options.parse(argc, argv); 50 | 51 | if (result.count("help") || (result.count("market") == 0)) { 52 | std::cout << options.help({""}) << std::endl; 53 | std::exit(0); 54 | } 55 | 56 | if (result.count("market") == 1) { 57 | filename = result["market"].as(); 58 | if (loops::is_market(filename)) { 59 | } else { 60 | std::cout << options.help({""}) << std::endl; 61 | std::exit(0); 62 | } 63 | } else { 64 | std::cout << options.help({""}) << std::endl; 65 | std::exit(0); 66 | } 67 | 68 | if (result.count("validate") == 1) { 69 | validate = true; 70 | } else { 71 | validate = false; 72 | } 73 | 74 | if (result.count("verbose") == 1) { 75 | verbose = true; 76 | } else { 77 | verbose = false; 78 | } 79 | } 80 | }; 81 | 82 | namespace cpu { 83 | 84 | using namespace loops; 85 | using namespace loops::memory; 86 | 87 | /** 88 | * @brief CPU SpMV implementation. 89 | * 90 | * @tparam index_t 91 | * @tparam offset_t 92 | * @tparam type_t 93 | * @param csr device CSR matrix. 94 | * @param x device input vector. 95 | * @return loops::vector_t device output vector. 96 | */ 97 | template 98 | loops::vector_t reference( 99 | loops::csr_t& csr, 100 | loops::vector_t& x) { 101 | // Copy data to CPU. 102 | loops::csr_t csr_h(csr); 103 | loops::vector_t x_h(x); 104 | loops::vector_t y_h(x_h.size()); 105 | 106 | for (auto row = 0; row < csr_h.rows; ++row) { 107 | type_t sum = 0; 108 | for (auto nz = csr_h.offsets[row]; nz < csr_h.offsets[row + 1]; ++nz) { 109 | sum += csr_h.values[nz] * x_h[csr_h.indices[nz]]; 110 | } 111 | y_h[row] = sum; 112 | } 113 | 114 | return y_h; 115 | } 116 | 117 | /** 118 | * @brief Validation for SpMV. 119 | * 120 | * @tparam index_t Column indices type. 121 | * @tparam offset_t Row offset type. 122 | * @tparam type_t Value type. 123 | * @param parameters Parameters. 124 | * @param csr CSR matrix. 125 | * @param x Input vector. 126 | * @param y Output vector. 127 | */ 128 | template 129 | void validate(parameters_t& parameters, 130 | csr_t& csr, 131 | vector_t& x, 132 | vector_t& y) { 133 | // Validation code, can be safely ignored. 134 | auto h_y = reference(csr, x); 135 | 136 | std::size_t errors = util::equal( 137 | y.data().get(), h_y.data(), csr.rows, 138 | [](const type_t a, const type_t b) { return std::abs(a - b) > 1e-2; }, 139 | parameters.verbose); 140 | 141 | std::cout << "Matrix:\t\t" << extract_filename(parameters.filename) 142 | << std::endl; 143 | std::cout << "Dimensions:\t" << csr.rows << " x " << csr.cols << " (" 144 | << csr.nnzs << ")" << std::endl; 145 | std::cout << "Errors:\t\t" << errors << std::endl; 146 | } 147 | 148 | } // namespace cpu -------------------------------------------------------------------------------- /examples/spmv/merge_path.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * @file merge_path.cu 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication example. 
5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #include "helpers.hxx" 13 | #include 14 | 15 | using namespace loops; 16 | 17 | int main(int argc, char** argv) { 18 | using index_t = int; 19 | using offset_t = int; 20 | using type_t = float; 21 | 22 | // ... I/O parameters, mtx, etc. 23 | parameters_t parameters(argc, argv); 24 | 25 | matrix_market_t mtx; 26 | csr_t csr(mtx.load(parameters.filename)); 27 | 28 | // Input and output vectors. 29 | vector_t x(csr.cols); 30 | vector_t y(csr.rows); 31 | 32 | // Generate random numbers between [0, 1]. 33 | generate::random::uniform_distribution(x.begin(), x.end(), 1, 10); 34 | // thrust::fill(x.begin(), x.end(), 2); 35 | 36 | // Run the benchmark. 37 | float elapsed = 0.0f; 38 | int num_runs = 1; 39 | for (int i = 0; i < num_runs; i++) { 40 | auto timer = algorithms::spmv::merge_path_flat(csr, x, y); 41 | elapsed += timer.milliseconds(); 42 | } 43 | 44 | std::cout << "merge_path_flat," << mtx.dataset << "," << csr.rows << "," 45 | << csr.cols << "," << csr.nnzs << "," << elapsed << std::endl; 46 | 47 | // Validation. 48 | if (parameters.validate) 49 | cpu::validate(parameters, csr, x, y); 50 | } -------------------------------------------------------------------------------- /examples/spmv/original.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * @file original.cu 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication example. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #include "helpers.hxx" 13 | #include 14 | 15 | using namespace loops; 16 | 17 | int main(int argc, char** argv) { 18 | using index_t = int; 19 | using offset_t = int; 20 | using type_t = float; 21 | 22 | // ... I/O parameters, mtx, etc. 23 | parameters_t parameters(argc, argv); 24 | 25 | matrix_market_t mtx; 26 | csr_t csr(mtx.load(parameters.filename)); 27 | 28 | // Input and output vectors. 29 | vector_t x(csr.cols); 30 | vector_t y(csr.rows); 31 | 32 | // Generate random numbers between [0, 1]. 33 | generate::random::uniform_distribution(x.begin(), x.end(), 1, 10); 34 | 35 | // Run the benchmark. 36 | util::timer_t timer; 37 | timer.start(); 38 | algorithms::spmv::original(csr, x, y); 39 | timer.stop(); 40 | 41 | std::cout << "original," << mtx.dataset << "," << csr.rows << "," << csr.cols 42 | << "," << csr.nnzs << "," << timer.milliseconds() << std::endl; 43 | 44 | // Validation. 45 | if (parameters.validate) 46 | cpu::validate(parameters, csr, x, y); 47 | } -------------------------------------------------------------------------------- /examples/spmv/thread_mapped.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * @file thread_mapped.cu 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication example. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #include "helpers.hxx" 13 | #include 14 | 15 | using namespace loops; 16 | 17 | int main(int argc, char** argv) { 18 | using index_t = int; 19 | using offset_t = int; 20 | using type_t = float; 21 | 22 | // ... I/O parameters, mtx, etc. 23 | parameters_t parameters(argc, argv); 24 | 25 | matrix_market_t mtx; 26 | csr_t csr(mtx.load(parameters.filename)); 27 | 28 | // Input and output vectors. 
29 | vector_t x(csr.cols); 30 | vector_t y(csr.rows); 31 | 32 | // Generate random numbers between [0, 1]. 33 | generate::random::uniform_distribution(x.begin(), x.end(), 1, 10); 34 | 35 | // Run the benchmark. 36 | util::timer_t timer; 37 | timer.start(); 38 | algorithms::spmv::thread_mapped(csr, x, y); 39 | timer.stop(); 40 | 41 | std::cout << "thread_mapped," << mtx.dataset << "," << csr.rows << "," 42 | << csr.cols << "," << csr.nnzs << "," << timer.milliseconds() 43 | << std::endl; 44 | 45 | // Validation. 46 | if (parameters.validate) 47 | cpu::validate(parameters, csr, x, y); 48 | } -------------------------------------------------------------------------------- /examples/spmv/work_oriented.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * @file work_oriented.cu 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication example. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #include "helpers.hxx" 13 | #include 14 | 15 | using namespace loops; 16 | 17 | int main(int argc, char** argv) { 18 | using index_t = int; 19 | using offset_t = int; 20 | using type_t = float; 21 | 22 | // ... I/O parameters, mtx, etc. 23 | parameters_t parameters(argc, argv); 24 | 25 | matrix_market_t mtx; 26 | csr_t csr(mtx.load(parameters.filename)); 27 | 28 | // Input and output vectors. 29 | vector_t x(csr.cols); 30 | vector_t y(csr.rows); 31 | 32 | // Generate random numbers between [0, 1]. 33 | generate::random::uniform_distribution(x.begin(), x.end(), 1, 10); 34 | 35 | // Run the benchmark. 36 | util::timer_t timer; 37 | timer.start(); 38 | algorithms::spmv::work_oriented(csr, x, y); 39 | timer.stop(); 40 | 41 | std::cout << "work_oriented," << mtx.dataset << "," << csr.rows << "," 42 | << csr.cols << "," << csr.nnzs << "," << timer.milliseconds() 43 | << std::endl; 44 | 45 | // Validation. 46 | if (parameters.validate) 47 | cpu::validate(parameters, csr, x, y); 48 | } -------------------------------------------------------------------------------- /include/loops/algorithms/spmm/thread_mapped.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file thread_mapped.cuh 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Matrix Multiplication kernels. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace loops { 25 | namespace algorithms { 26 | namespace spmm { 27 | 28 | template 32 | __global__ void __thread_mapped(setup_t config, 33 | const std::size_t a_rows, 34 | const std::size_t a_cols, 35 | const std::size_t a_nnz, 36 | const offset_t* offsets, 37 | const index_t* indices, 38 | const type_t* values, 39 | const matrix_t B, 40 | matrix_t C) { 41 | for (auto row : config.tiles()) { 42 | for (auto col : 43 | custom_stride_range(std::size_t(0), B.cols, std::size_t(1))) { 44 | type_t sum = 0; 45 | for (auto nz : config.atoms(row)) { 46 | sum += values[nz] * B(indices[nz], col); 47 | } 48 | 49 | // Output 50 | C(row, col) = sum; 51 | } 52 | } 53 | } 54 | 55 | /** 56 | * @brief Sparse-Matrix Matrix Multiplication API. 57 | * 58 | * @tparam index_t Type of column indices. 59 | * @tparam offset_t Type of row offsets. 60 | * @tparam type_t Type of values. 
61 | * @param csr CSR matrix (GPU). 62 | * @param n Number of columns in the B-matrix. 63 | * @param B Input matrix B (GPU). 64 | * @param C Output matrix C (GPU). 65 | * @param stream CUDA stream. 66 | */ 67 | template 68 | void thread_mapped(csr_t& csr, 69 | matrix_t& B, 70 | matrix_t& C, 71 | cudaStream_t stream = 0) { 72 | // Create a schedule. 73 | constexpr std::size_t block_size = 128; 74 | 75 | /// Set-up kernel launch parameters and run the kernel. 76 | 77 | // Create a schedule. 78 | using setup_t = schedule::setup; 80 | setup_t config(csr.offsets.data().get(), csr.rows, csr.nnzs); 81 | 82 | std::size_t grid_size = (csr.rows + block_size - 1) / block_size; 83 | launch::non_cooperative( 84 | stream, __thread_mapped, grid_size, 85 | block_size, config, csr.rows, csr.cols, csr.nnzs, 86 | csr.offsets.data().get(), csr.indices.data().get(), 87 | csr.values.data().get(), B, C); 88 | 89 | cudaStreamSynchronize(stream); 90 | } 91 | 92 | } // namespace spmm 93 | } // namespace algorithms 94 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/algorithms/spmv/group_mapped.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file group_mapped.cuh 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication kernels. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace loops { 23 | namespace algorithms { 24 | namespace spmv { 25 | 26 | template 30 | __global__ void __launch_bounds__(threads_per_block, 2) 31 | __group_mapped(std::size_t rows, 32 | std::size_t cols, 33 | std::size_t nnz, 34 | offset_t* offsets, 35 | index_t* indices, 36 | const type_t* values, 37 | const type_t* x, 38 | type_t* y) { 39 | using setup_t = schedule::block_mapped; 40 | 41 | /// Allocate temporary storage for the schedule. 42 | using storage_t = typename setup_t::storage_t; 43 | __shared__ storage_t temporary_storage; 44 | 45 | /// Construct the schedule. 46 | setup_t config(temporary_storage, offsets, rows, nnz); 47 | auto p = config.partition(); 48 | 49 | for (auto virtual_atom : config.atom_accessor(p)) { 50 | auto virtual_tile = config.tile_accessor(virtual_atom, p); 51 | 52 | if (!(config.is_valid_accessor(virtual_tile, p))) 53 | continue; 54 | 55 | auto row = config.tile_id(virtual_tile, p); 56 | 57 | auto nz_idx = config.atom_id(virtual_atom, row, virtual_tile, p); 58 | atomicAdd(&(y[row]), values[nz_idx] * x[indices[nz_idx]]); 59 | } 60 | } 61 | 62 | /** 63 | * @brief Sparse-Matrix Vector Multiplication API. 64 | * 65 | * @tparam index_t Type of column indices. 66 | * @tparam offset_t Type of row offsets. 67 | * @tparam type_t Type of values. 68 | * @param csr CSR matrix (GPU). 69 | * @param x Input vector x (GPU). 70 | * @param y Output vector y (GPU). 71 | * @param stream CUDA stream. 72 | */ 73 | template 74 | void group_mapped(csr_t& csr, 75 | vector_t& x, 76 | vector_t& y, 77 | cudaStream_t stream = 0) { 78 | // Create a schedule. 79 | constexpr std::size_t block_size = 128; 80 | 81 | /// Set-up kernel launch parameters and run the kernel. 82 | 83 | /// Traditional kernel launch, this is nice for tile mapped scheduling, which 84 | /// will allow blocks to be scheduled in and out as needed. 
And will rely on 85 | /// NVIDIA's hardware schedule to schedule the blocks efficiently. 86 | std::size_t grid_size = (csr.rows + block_size - 1) / block_size; 87 | launch::non_cooperative( 88 | stream, __group_mapped, grid_size, 89 | block_size, csr.rows, csr.cols, csr.nnzs, csr.offsets.data().get(), 90 | csr.indices.data().get(), csr.values.data().get(), x.data().get(), 91 | y.data().get()); 92 | 93 | /// Cooperative kernel launch; requires a fixed number of blocks per grid to 94 | /// be launched, this number can be determined by using CUDA's occupancy API 95 | /// to figure out how many blocks will run concurrently at all times per SM. 96 | /// And then we simply loop over the entire work within the kernel. 97 | // launch::cooperative(stream, __group_mapped, 99 | // grid_size, block_size, csr.rows, csr.cols, csr.nnzs, 100 | // csr.offsets.data().get(), csr.indices.data().get(), 101 | // csr.values.data().get(), x.data().get(), 102 | // y.data().get()); 103 | 104 | cudaStreamSynchronize(stream); 105 | } 106 | 107 | } // namespace spmv 108 | } // namespace algorithms 109 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/algorithms/spmv/merge_path_flat.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file work_oriented.cuh 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication example. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | namespace loops { 26 | namespace algorithms { 27 | namespace spmv { 28 | 29 | /** 30 | * @brief Flat Merge-Path SpMV kernel. 31 | * 32 | * @tparam threads_per_block Number of threads per block. 33 | * @tparam items_per_thread Number of items per thread to process. 34 | * @tparam index_t Type of column indices. 35 | * @tparam offset_t Type of row offsets. 36 | * @tparam type_t Type of values. 37 | */ 38 | template 44 | __global__ void __launch_bounds__(int(threads_per_block)) 45 | __merge_path_flat(meta_t meta, 46 | std::size_t rows, 47 | std::size_t cols, 48 | std::size_t nnz, 49 | offset_t* offsets, 50 | index_t* indices, 51 | const type_t* values, 52 | const type_t* x, 53 | type_t* y) { 54 | using setup_t = schedule::setup; 57 | 58 | /// Allocate temporary storage for the schedule. 59 | using storage_t = typename setup_t::storage_t; 60 | __shared__ storage_t temporary_storage; 61 | 62 | /// Construct the schedule. 63 | setup_t config(meta, temporary_storage, offsets, rows, nnz); 64 | auto map = config.init(); 65 | 66 | if (!config.is_valid_accessor(map)) 67 | return; 68 | 69 | /// Flat Merge-Path loop from 0..items_per_thread. 70 | #pragma unroll 71 | for (auto item : config.virtual_idx()) { 72 | auto nz = config.atom_idx(item, map); 73 | auto row = config.tile_idx(map); 74 | type_t nonzero = values[nz] * x[indices[nz]]; 75 | if (config.atoms_counting_it[map.y] < 76 | temporary_storage.tile_end_offset[map.x]) { 77 | atomicAdd(&(y[row]), nonzero); 78 | map.y++; 79 | } else { 80 | map.x++; 81 | } 82 | } 83 | } 84 | 85 | /** 86 | * @brief Sparse-Matrix Vector Multiplication API. 87 | * 88 | * @tparam index_t Type of column indices. 89 | * @tparam offset_t Type of row offsets. 90 | * @tparam type_t Type of values. 91 | * @param csr CSR matrix (GPU). 
92 | * @param x Input vector x (GPU). 93 | * @param y Output vector y (GPU). 94 | * @param stream CUDA stream. 95 | */ 96 | template 97 | util::timer_t merge_path_flat(csr_t& csr, 98 | vector_t& x, 99 | vector_t& y, 100 | cudaStream_t stream = 0) { 101 | // Create a schedule. 102 | constexpr std::size_t block_size = sizeof(type_t) > 4 ? 64 : 128; 103 | constexpr std::size_t items_per_thread = sizeof(type_t) > 4 ? 3 : 5; 104 | 105 | using preprocessor_t = 106 | schedule::merge_path::preprocess_t; 108 | 109 | /// Light-weight preprocess that does not modify the data, just creates an 110 | /// array with pre-calculated per block coordinates to reduce the work in the 111 | /// actual kernel. 112 | preprocessor_t meta(csr.offsets.data().get(), csr.rows, csr.nnzs, stream); 113 | 114 | /// Set-up kernel launch parameters and run the kernel. 115 | int max_dim_x; 116 | int num_merge_tiles = 117 | math::ceil_div(csr.rows + csr.nnzs, block_size * items_per_thread); 118 | int device_ordinal = device::get(); 119 | cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal); 120 | 121 | util::timer_t timer; 122 | timer.start(); 123 | 124 | // Launch main kernel that uses merge-path schedule. 125 | int within_bounds = min(num_merge_tiles, max_dim_x); 126 | int overflow = math::ceil_div(num_merge_tiles, max_dim_x); 127 | dim3 grid_size(within_bounds, overflow, 1); 128 | launch::non_cooperative( 129 | stream, 130 | __merge_path_flat, 132 | grid_size, block_size, meta, csr.rows, csr.cols, csr.nnzs, 133 | csr.offsets.data().get(), csr.indices.data().get(), 134 | csr.values.data().get(), x.data().get(), y.data().get()); 135 | cudaStreamSynchronize(stream); 136 | timer.stop(); 137 | 138 | return timer; 139 | } 140 | 141 | } // namespace spmv 142 | } // namespace algorithms 143 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/algorithms/spmv/original.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file original.cuh 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication kernels. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace loops { 23 | namespace algorithms { 24 | namespace spmv { 25 | 26 | template 27 | __global__ void __original(const std::size_t rows, 28 | const std::size_t cols, 29 | const std::size_t nnz, 30 | const offset_t* offsets, 31 | const index_t* indices, 32 | const type_t* values, 33 | const type_t* x, 34 | type_t* y) { 35 | for (auto row = blockIdx.x * blockDim.x + threadIdx.x; 36 | row < rows; // boundary condition 37 | row += gridDim.x * blockDim.x // step 38 | ) { 39 | type_t sum = 0; 40 | for (offset_t nz = offsets[row]; nz < offsets[row + 1]; ++nz) 41 | sum += values[nz] * x[indices[nz]]; 42 | 43 | // Output 44 | y[row] = sum; 45 | } 46 | } 47 | 48 | /** 49 | * @brief Sparse-Matrix Vector Multiplication API. 50 | * 51 | * @tparam index_t Type of column indices. 52 | * @tparam offset_t Type of row offsets. 53 | * @tparam type_t Type of values. 54 | * @param csr CSR matrix (GPU). 55 | * @param x Input vector x (GPU). 56 | * @param y Output vector y (GPU). 57 | * @param stream CUDA stream. 
58 | */ 59 | template 60 | void original(csr_t& csr, 61 | vector_t& x, 62 | vector_t& y, 63 | cudaStream_t stream = 0) { 64 | // Create a schedule. 65 | constexpr std::size_t block_size = 128; 66 | 67 | /// Set-up kernel launch parameters and run the kernel. 68 | std::size_t grid_size = (csr.rows + block_size - 1) / block_size; 69 | launch::non_cooperative(stream, __original, 70 | grid_size, block_size, csr.rows, csr.cols, csr.nnzs, 71 | csr.offsets.data().get(), csr.indices.data().get(), 72 | csr.values.data().get(), x.data().get(), 73 | y.data().get()); 74 | 75 | cudaStreamSynchronize(stream); 76 | } 77 | 78 | } // namespace spmv 79 | } // namespace algorithms 80 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/algorithms/spmv/thread_mapped.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file thread_mapped.cuh 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication kernels. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace loops { 23 | namespace algorithms { 24 | namespace spmv { 25 | 26 | template 30 | __global__ void __thread_mapped(setup_t config, 31 | const std::size_t rows, 32 | const std::size_t cols, 33 | const std::size_t nnz, 34 | const offset_t* offsets, 35 | const index_t* indices, 36 | const type_t* values, 37 | const type_t* x, 38 | type_t* y) { 39 | /// Equivalent to: 40 | /// row = blockIdx.x * blockDim.x + threadIdx.x; (init) 41 | /// row < rows; (boundary condition) 42 | /// row += gridDim.x * blockDim.x. (step) 43 | for (auto row : config.tiles()) { 44 | type_t sum = 0; 45 | 46 | /// Equivalent to: 47 | /// for (offset_t nz = offset; nz < end; ++nz) 48 | for (auto nz : config.atoms(row)) { 49 | sum += values[nz] * x[indices[nz]]; 50 | } 51 | 52 | // Output 53 | y[row] = sum; 54 | } 55 | } 56 | 57 | /** 58 | * @brief Sparse-Matrix Vector Multiplication API. 59 | * 60 | * @tparam index_t Type of column indices. 61 | * @tparam offset_t Type of row offsets. 62 | * @tparam type_t Type of values. 63 | * @param csr CSR matrix (GPU). 64 | * @param x Input vector x (GPU). 65 | * @param y Output vector y (GPU). 66 | * @param stream CUDA stream. 67 | */ 68 | template 69 | void thread_mapped(csr_t& csr, 70 | vector_t& x, 71 | vector_t& y, 72 | cudaStream_t stream = 0) { 73 | // Create a schedule. 74 | constexpr std::size_t block_size = 128; 75 | 76 | /// Set-up kernel launch parameters and run the kernel. 77 | 78 | // Create a schedule. 
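// Note (illustrative): the setup object below only wraps the CSR row offsets
// and the row/nonzero counts; inside __thread_mapped above, config.tiles()
// then enumerates rows with a grid-stride loop and config.atoms(row)
// enumerates that row's nonzeros, mirroring the two nested loops of the
// original kernel.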
79 | using setup_t = schedule::setup; 81 | setup_t config(csr.offsets.data().get(), csr.rows, csr.nnzs); 82 | 83 | std::size_t grid_size = (csr.rows + block_size - 1) / block_size; 84 | launch::non_cooperative( 85 | stream, __thread_mapped, grid_size, 86 | block_size, config, csr.rows, csr.cols, csr.nnzs, 87 | csr.offsets.data().get(), csr.indices.data().get(), 88 | csr.values.data().get(), x.data().get(), y.data().get()); 89 | 90 | cudaStreamSynchronize(stream); 91 | } 92 | 93 | } // namespace spmv 94 | } // namespace algorithms 95 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/algorithms/spmv/work_oriented.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file work_oriented.cuh 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Sparse Matrix-Vector Multiplication example. 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace loops { 23 | namespace algorithms { 24 | namespace spmv { 25 | 26 | /** 27 | * @brief Work oriented SpMV kernel. 28 | * 29 | * @tparam threads_per_block Number of threads per block. 30 | * @tparam index_t Type of column indices. 31 | * @tparam offset_t Type of row offsets. 32 | * @tparam type_t Type of values. 33 | */ 34 | template 38 | __global__ void __launch_bounds__(threads_per_block, 2) 39 | __work_oriented(std::size_t rows, 40 | std::size_t cols, 41 | std::size_t nnz, 42 | offset_t* offsets, 43 | index_t* indices, 44 | const type_t* values, 45 | const type_t* x, 46 | type_t* y) { 47 | using setup_t = 48 | schedule::setup; 50 | 51 | setup_t config(offsets, rows, nnz); 52 | auto map = config.init(); 53 | 54 | /// Accumulate the complete tiles. 55 | type_t sum = 0; 56 | for (auto row : config.tiles(map)) { 57 | for (auto nz : config.atoms(row, map)) { 58 | sum += values[nz] * x[indices[nz]]; 59 | } 60 | y[row] = sum; 61 | sum = 0; 62 | } 63 | 64 | // Interesting use of syncthreads to ensure all remaining tiles get processed 65 | // at the same time, possibly causing less thread divergence among the threads 66 | // in the same warp. 67 | __syncthreads(); 68 | 69 | /// Process remaining tiles. 70 | for (auto row : config.remainder_tiles(map)) { 71 | for (auto nz : config.remainder_atoms(map)) { 72 | sum += values[nz] * x[indices[nz]]; 73 | } 74 | /// Accumulate the remainder. 75 | if (sum != 0) 76 | atomicAdd(&(y[row]), sum); 77 | } 78 | } 79 | 80 | /** 81 | * @brief Sparse-Matrix Vector Multiplication API. 82 | * 83 | * @tparam index_t Type of column indices. 84 | * @tparam offset_t Type of row offsets. 85 | * @tparam type_t Type of values. 86 | * @param csr CSR matrix (GPU). 87 | * @param x Input vector x (GPU). 88 | * @param y Output vector y (GPU). 89 | * @param stream CUDA stream. 90 | */ 91 | template 92 | void work_oriented(csr_t& csr, 93 | vector_t& x, 94 | vector_t& y, 95 | cudaStream_t stream = 0) { 96 | // Create a schedule. 97 | constexpr std::size_t block_size = 128; 98 | 99 | /// Set-up kernel launch parameters and run the kernel. 100 | 101 | /// Launch 2 x (SM Count) number of blocks. 102 | /// Weirdly enough, a really high number here might cause it to fail. 
103 | loops::device::properties_t props; 104 | std::size_t grid_size = 2 * props.multi_processor_count(); 105 | 106 | launch::non_cooperative( 107 | stream, __work_oriented, grid_size, 108 | block_size, csr.rows, csr.cols, csr.nnzs, csr.offsets.data().get(), 109 | csr.indices.data().get(), csr.values.data().get(), x.data().get(), 110 | y.data().get()); 111 | 112 | cudaStreamSynchronize(stream); 113 | } 114 | 115 | } // namespace spmv 116 | } // namespace algorithms 117 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/container/container.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file container.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Includes all the container headers. 5 | * @version 0.1 6 | * @date 2022-07-25 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #include 13 | #include 14 | #include 15 | #include -------------------------------------------------------------------------------- /include/loops/container/coo.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file coo.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Interface for Coordinate format. 5 | * @version 0.1 6 | * @date 2022-07-21 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace loops { 26 | 27 | using namespace memory; 28 | 29 | /** 30 | * @brief Coordinate (COO) format. 31 | * 32 | * @tparam index_t 33 | * @tparam value_t 34 | */ 35 | template 38 | struct coo_t { 39 | std::size_t rows; 40 | std::size_t cols; 41 | std::size_t nnzs; 42 | 43 | vector_t row_indices; /// I 44 | vector_t col_indices; /// J 45 | vector_t values; /// V 46 | 47 | /** 48 | * @brief Construct a new coo object with everything initialized to zero. 49 | * 50 | */ 51 | coo_t() : rows(0), cols(0), nnzs(0), row_indices(), col_indices(), values() {} 52 | 53 | /** 54 | * @brief Construct a new coo object with the given dimensions. 55 | * 56 | * @param r Number of rows. 57 | * @param c Number of columns. 58 | * @param nnz Number of non-zero elements. 59 | */ 60 | coo_t(std::size_t r, std::size_t c, std::size_t nnz) 61 | : rows(r), 62 | cols(c), 63 | nnzs(nnz), 64 | row_indices(nnz), 65 | col_indices(nnz), 66 | values(nnz) {} 67 | 68 | /** 69 | * @brief Construct a new coo from another coo object on host/device. 70 | * 71 | * @param rhs coo_t 72 | */ 73 | template 74 | coo_t(const coo_t& rhs) 75 | : rows(rhs.rows), 76 | cols(rhs.cols), 77 | nnzs(rhs.nnzs), 78 | row_indices(rhs.row_indices), 79 | col_indices(rhs.col_indices), 80 | values(rhs.values) {} 81 | 82 | /** 83 | * @brief Construct a new coo object from compressed sparse format (CSR). 84 | * 85 | * @param csr csr_t 86 | */ 87 | template 88 | coo_t(const csr_t& csr) 89 | : rows(csr.rows), 90 | cols(csr.cols), 91 | nnzs(csr.nnzs), 92 | row_indices(csr.nnzs), 93 | col_indices(csr.col_indices), 94 | values(csr.values) { 95 | /// TODO: Do not need this copy for all cases. 96 | vector_t _row_offsets = csr.offsets; 97 | detail::offsets_to_indices(_row_offsets, row_indices); 98 | } 99 | 100 | /** 101 | * @brief Sorts the coordinate matrix by row indices. 
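   * @par Example (illustrative)
   * Given row_indices = [2, 0, 1], col_indices = [1, 0, 2], values = [a, b, c],
   * sorting by row yields row_indices = [0, 1, 2], col_indices = [0, 2, 1],
   * values = [b, c, a]; the values are reordered along with the (row, column)
   * key pairs.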
102 | * 103 | */ 104 | void sort_by_row() { 105 | auto begin = thrust::make_zip_iterator( 106 | thrust::make_tuple(row_indices.begin(), col_indices.begin())); 107 | auto end = thrust::make_zip_iterator( 108 | thrust::make_tuple(row_indices.end(), col_indices.end())); 109 | sort(begin, end); 110 | } 111 | 112 | /** 113 | * @brief Sorts the coordinate matrix by column indices. 114 | * 115 | */ 116 | void sort_by_column() { 117 | auto begin = thrust::make_zip_iterator( 118 | thrust::make_tuple(col_indices.begin(), row_indices.begin())); 119 | auto end = thrust::make_zip_iterator( 120 | thrust::make_tuple(col_indices.end(), row_indices.end())); 121 | sort(begin, end); 122 | } 123 | 124 | /** 125 | * @brief Sorts, removes I,J pairs. 126 | * 127 | */ 128 | void remove_duplicates() { 129 | // Sort by row indices. 130 | this->sort_by_row(); 131 | auto begin = thrust::make_zip_iterator( 132 | thrust::make_tuple(row_indices.begin(), col_indices.begin())); 133 | auto end = thrust::make_zip_iterator( 134 | thrust::make_tuple(row_indices.end(), col_indices.end())); 135 | 136 | // Remove duplicates. 137 | auto new_it = thrust::unique_by_key(begin, end, values.begin()); 138 | auto first_it = thrust::get<1>(new_it); 139 | nnzs = thrust::distance(values.begin(), first_it); 140 | 141 | // Resize vectors to new size. 142 | row_indices.resize(nnzs); 143 | col_indices.resize(nnzs); 144 | values.resize(nnzs); 145 | } 146 | 147 | private: 148 | /** 149 | * @brief Sorting helper, uses zip iterator as begin and end. 150 | * 151 | * @par Example Zip iterator: 152 | * auto begin = thrust::make_zip_iterator( 153 | * thrust::make_tuple(col_indices.begin(), row_indices.begin())); 154 | * auto end = thrust::make_zip_iterator( 155 | * thrust::make_tuple(col_indices.end(), row_indices.end())); 156 | * 157 | * @tparam begin_it_t Begin iterator type. 158 | * @tparam end_it_t End iterator type. 159 | * @param begin Begin iterator (zip iterator of row and col indices). 160 | * @param end End iterator (zip iterator of row and col indices). 161 | */ 162 | template 163 | void sort(begin_it_t& begin, end_it_t& end) { 164 | thrust::sort_by_key(begin, end, values.begin()); 165 | } 166 | }; // struct coo_t 167 | 168 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/container/coordinate.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file coordinate.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Simple coordinate with x and y. 5 | * @version 0.1 6 | * @date 2022-11-12 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | namespace loops { 14 | template 15 | struct coordinate_t { 16 | index_t x; 17 | index_t y; 18 | }; 19 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/container/csc.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file csc.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Interface for Compressed Sparse-Column format. 5 | * @version 0.1 6 | * @date 2022-07-21 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | 17 | namespace loops { 18 | 19 | using namespace memory; 20 | 21 | /** 22 | * @brief Compressed Sparse Column (CSC) format. 
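 * @note Illustrative example: for the 2 x 3 matrix [[1, 0, 2], [0, 3, 0]],
 *       CSC stores column offsets = [0, 1, 2, 3], row indices = [0, 1, 0],
 *       and values = [1, 3, 2].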
23 | * 24 | * @tparam index_t 25 | * @tparam offset_t 26 | * @tparam value_t 27 | */ 28 | template 32 | struct csc_t { 33 | std::size_t rows; 34 | std::size_t cols; 35 | std::size_t nnzs; 36 | 37 | vector_t offsets; /// Aj 38 | vector_t indices; /// Ap 39 | vector_t values; /// Ax 40 | 41 | /** 42 | * @brief Construct a new csc object with everything initialized to zero. 43 | * 44 | */ 45 | csc_t() : rows(0), cols(0), nnzs(0), offsets(), indices(), values() {} 46 | 47 | /** 48 | * @brief Construct a new csc object with the given dimensions. 49 | * 50 | * @param r Number of rows. 51 | * @param c Number of columns. 52 | * @param nnz Number of non-zero elements. 53 | */ 54 | csc_t(std::size_t r, std::size_t c, std::size_t nnz) 55 | : rows(r), 56 | cols(c), 57 | nnzs(nnz), 58 | offsets(r + 1), 59 | indices(nnz), 60 | values(nnz) {} 61 | 62 | /** 63 | * @brief Construct a new csc from another csc object on host/device. 64 | * 65 | * @param rhs csc_t 66 | */ 67 | template 68 | csc_t(const csc_t& rhs) 69 | : rows(rhs.rows), 70 | cols(rhs.cols), 71 | nnzs(rhs.nnzs), 72 | offsets(rhs.offsets), 73 | indices(rhs.indices), 74 | values(rhs.values) {} 75 | 76 | /** 77 | * @brief Construct a new csc object from coordinate format (COO). 78 | * @note This constructor creates a copy of the input COO matrix. 79 | * 80 | * @param coo coo_t 81 | */ 82 | template 83 | csc_t(const coo_t& coo) 84 | : rows(coo.rows), cols(coo.cols), nnzs(coo.nnzs), offsets(coo.cols + 1) { 85 | coo_t __(coo); 86 | __.sort_by_column(); 87 | indices = std::move(__.row_indices); 88 | values = std::move(__.values); 89 | detail::indices_to_offsets(__.col_indices, offsets); 90 | } 91 | 92 | }; // struct csc_t 93 | 94 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/container/csr.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file csr.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Interface for Compressed Sparse-Row format. 5 | * @version 0.1 6 | * @date 2022-07-21 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | namespace loops { 22 | 23 | using namespace memory; 24 | 25 | /** 26 | * @brief Compressed Sparse Row (CSR) format. 27 | * 28 | * @tparam index_t Type of the nonzero elements indices. 29 | * @tparam offset_t Type of the row offsets. 30 | * @tparam value_t Type of the nonzero elements values. 31 | */ 32 | template 36 | struct csr_t { 37 | std::size_t rows; 38 | std::size_t cols; 39 | std::size_t nnzs; 40 | 41 | vector_t offsets; /// Ap 42 | vector_t indices; /// Aj 43 | vector_t values; /// Ax 44 | 45 | /** 46 | * @brief Construct a new csr object with everything initialized to zero. 47 | * 48 | */ 49 | csr_t() : rows(0), cols(0), nnzs(0), offsets(), indices(), values() {} 50 | 51 | /** 52 | * @brief Construct a new csr object with the given dimensions. 53 | * 54 | * @param r Number of rows. 55 | * @param c Number of columns. 56 | * @param nnz Number of non-zero elements. 57 | */ 58 | csr_t(std::size_t r, std::size_t c, std::size_t nnz) 59 | : rows(r), 60 | cols(c), 61 | nnzs(nnz), 62 | offsets(r + 1), 63 | indices(nnz), 64 | values(nnz) {} 65 | 66 | /** 67 | * @brief Construct a new csr from another csr object on host/device. 
68 | * 69 | * @param rhs csr_t 70 | */ 71 | template 72 | csr_t(const csr_t& rhs) 73 | : rows(rhs.rows), 74 | cols(rhs.cols), 75 | nnzs(rhs.nnzs), 76 | offsets(rhs.offsets), 77 | indices(rhs.indices), 78 | values(rhs.values) {} 79 | 80 | /** 81 | * @brief Construct a new csr object from coordinate format (COO). 82 | * @note This constructor creates a copy of the input COO matrix. 83 | * 84 | * @param coo coo_t 85 | */ 86 | template 87 | csr_t(const coo_t& coo) 88 | : rows(coo.rows), cols(coo.cols), nnzs(coo.nnzs), offsets(coo.rows + 1) { 89 | coo_t __(coo); 90 | __.sort_by_row(); 91 | indices = std::move(__.col_indices); 92 | values = std::move(__.values); 93 | detail::indices_to_offsets(__.row_indices, offsets); 94 | } 95 | }; // struct csr_t 96 | 97 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/container/detail/convert.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file convert.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Conversion functions for containers. 5 | * @version 0.1 6 | * @date 2022-07-19 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace loops { 25 | namespace detail { 26 | using namespace memory; 27 | 28 | /** 29 | * @brief Convert offsets to indices. 30 | * 31 | * @tparam index_v_t The type of vector indices. 32 | * @tparam offset_v_t The type of vector offsets. 33 | * @param offsets The offsets. 34 | * @param indices The indices. 35 | */ 36 | template 37 | void offsets_to_indices(const offset_v_t& offsets, index_v_t& indices) { 38 | using offset_t = typename offset_v_t::value_type; 39 | using index_t = typename index_v_t::value_type; 40 | 41 | // Convert compressed offsets into uncompressed indices. 42 | thrust::fill(indices.begin(), indices.end(), offset_t(0)); 43 | 44 | thrust::scatter_if( 45 | thrust::counting_iterator(0), // begin iterator 46 | thrust::counting_iterator(offsets.size() - 1), // end iterator 47 | offsets.begin(), // where to scatter 48 | thrust::make_transform_iterator( 49 | thrust::make_zip_iterator( 50 | thrust::make_tuple(offsets.begin(), offsets.begin() + 1)), 51 | [=] __host__ __device__(const thrust::tuple& t) { 52 | thrust::not_equal_to comp; 53 | return comp(thrust::get<0>(t), thrust::get<1>(t)); 54 | }), 55 | indices.begin()); 56 | 57 | thrust::inclusive_scan(indices.begin(), indices.end(), indices.begin(), 58 | thrust::maximum()); 59 | } 60 | 61 | /** 62 | * @brief Converts "indices"-based array to "offsets"-based array. 63 | * 64 | * @tparam index_v_t The type of vector indices. 65 | * @tparam offset_v_t The type of vector offsets. 66 | * @param indices The indices. 67 | * @param offsets The offsets. 68 | */ 69 | template 70 | void indices_to_offsets(const index_v_t& indices, offset_v_t& offsets) { 71 | using offset_t = typename offset_v_t::value_type; 72 | using index_t = typename index_v_t::value_type; 73 | 74 | // Convert uncompressed indices into compressed offsets. 
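  // Illustrative example: with 3 rows and sorted indices [0, 0, 1, 2, 2, 2],
  // offsets has 4 entries, and the lower bound of each of 0, 1, 2, 3 within
  // the indices yields offsets = [0, 2, 3, 6].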
75 | thrust::lower_bound( 76 | indices.begin(), indices.end(), thrust::counting_iterator(0), 77 | thrust::counting_iterator(offsets.size()), offsets.begin()); 78 | } 79 | 80 | } // namespace detail 81 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/container/detail/mmio.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file mmio.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Matrix-Market file format header file, see mmio.cpp for implementation 5 | * details. 6 | * @version 0.1 7 | * @date 2020-10-12 8 | * 9 | * @copyright Copyright (c) 2020 10 | * 11 | */ 12 | 13 | #pragma once 14 | 15 | #if defined(__cplusplus) 16 | extern "C" { 17 | #endif 18 | 19 | /* 20 | * Matrix Market I/O library for ANSI C 21 | * 22 | * See http://math.nist.gov/MatrixMarket for details. 23 | * 24 | * 25 | */ 26 | 27 | #ifndef MM_IO_H 28 | #define MM_IO_H 29 | 30 | #define MM_MAX_LINE_LENGTH 1025 31 | #define MatrixMarketBanner "%%MatrixMarket" 32 | #define MM_MAX_TOKEN_LENGTH 64 33 | 34 | typedef char MM_typecode[4]; 35 | 36 | char* mm_typecode_to_str(MM_typecode matcode); 37 | 38 | int mm_read_banner(FILE* f, MM_typecode* matcode); 39 | int mm_read_mtx_crd_size(FILE* f, 40 | std::size_t* M, 41 | std::size_t* N, 42 | std::size_t* nz); 43 | 44 | int mm_read_mtx_array_size(FILE* f, std::size_t* M, std::size_t* N); 45 | 46 | int mm_write_banner(FILE* f, MM_typecode matcode); 47 | int mm_write_mtx_crd_size(FILE* f, 48 | std::size_t M, 49 | std::size_t N, 50 | std::size_t nz); 51 | int mm_write_mtx_array_size(FILE* f, std::size_t M, std::size_t N); 52 | 53 | /********************* MM_typecode query fucntions ***************************/ 54 | 55 | #define mm_is_matrix(typecode) ((typecode)[0] == 'M') 56 | 57 | #define mm_is_sparse(typecode) ((typecode)[1] == 'C') 58 | #define mm_is_coordinate(typecode) ((typecode)[1] == 'C') 59 | #define mm_is_dense(typecode) ((typecode)[1] == 'A') 60 | #define mm_is_array(typecode) ((typecode)[1] == 'A') 61 | 62 | #define mm_is_complex(typecode) ((typecode)[2] == 'C') 63 | #define mm_is_real(typecode) ((typecode)[2] == 'R') 64 | #define mm_is_pattern(typecode) ((typecode)[2] == 'P') 65 | #define mm_is_integer(typecode) ((typecode)[2] == 'I') 66 | 67 | #define mm_is_symmetric(typecode) ((typecode)[3] == 'S') 68 | #define mm_is_general(typecode) ((typecode)[3] == 'G') 69 | #define mm_is_skew(typecode) ((typecode)[3] == 'K') 70 | #define mm_is_hermitian(typecode) ((typecode)[3] == 'H') 71 | 72 | int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ 73 | 74 | /********************* MM_typecode modify fucntions ***************************/ 75 | 76 | #define mm_set_matrix(typecode) ((*typecode)[0] = 'M') 77 | #define mm_set_coordinate(typecode) ((*typecode)[1] = 'C') 78 | #define mm_set_array(typecode) ((*typecode)[1] = 'A') 79 | #define mm_set_dense(typecode) mm_set_array(typecode) 80 | #define mm_set_sparse(typecode) mm_set_coordinate(typecode) 81 | 82 | #define mm_set_complex(typecode) ((*typecode)[2] = 'C') 83 | #define mm_set_real(typecode) ((*typecode)[2] = 'R') 84 | #define mm_set_pattern(typecode) ((*typecode)[2] = 'P') 85 | #define mm_set_integer(typecode) ((*typecode)[2] = 'I') 86 | 87 | #define mm_set_symmetric(typecode) ((*typecode)[3] = 'S') 88 | #define mm_set_general(typecode) ((*typecode)[3] = 'G') 89 | #define mm_set_skew(typecode) ((*typecode)[3] = 'K') 90 | #define mm_set_hermitian(typecode) ((*typecode)[3] = 'H') 91 | 
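/* Usage sketch (illustrative only): after mm_read_banner() fills in a
 * MM_typecode, the query macros above classify the file. For example:
 *
 *   MM_typecode code;
 *   FILE* f = fopen("matrix.mtx", "r");
 *   if (mm_read_banner(f, &code) == 0 && mm_is_coordinate(code) &&
 *       mm_is_real(code) && mm_is_symmetric(code)) {
 *     // coordinate (sparse), real-valued, symmetric matrix
 *   }
 */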
92 | #define mm_clear_typecode(typecode) \ 93 | ((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G') 94 | 95 | #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode) 96 | 97 | /********************* Matrix Market error codes ***************************/ 98 | 99 | #define MM_COULD_NOT_READ_FILE 11 100 | #define MM_PREMATURE_EOF 12 101 | #define MM_NOT_MTX 13 102 | #define MM_NO_HEADER 14 103 | #define MM_UNSUPPORTED_TYPE 15 104 | #define MM_LINE_TOO_LONG 16 105 | #define MM_COULD_NOT_WRITE_FILE 17 106 | 107 | /** 108 | * @brief Matrix Market internal definitions. 109 | * 110 | * MM_matrix_typecode: 4-character sequence 111 | * 112 | * | | ojbect | sparse/dense | data type | storage scheme | 113 | * |-----------------|----------|--------------|-----------|----------------| 114 | * | string position | [0] | [1] | [2] | [3] | 115 | * | Matrix typecode | M(atrix) | C(oord) | R(eal) | G(eneral) | 116 | * | | | A(rray) | C(omplex) | H(ermitian) | 117 | * | | | | P(attern) | S(ymmetric) | 118 | * | | | | I(nteger) | K(kew) | 119 | * 120 | */ 121 | #define MM_MTX_STR "matrix" 122 | #define MM_ARRAY_STR "array" 123 | #define MM_DENSE_STR "array" 124 | #define MM_COORDINATE_STR "coordinate" 125 | #define MM_SPARSE_STR "coordinate" 126 | #define MM_COMPLEX_STR "complex" 127 | #define MM_REAL_STR "real" 128 | #define MM_INT_STR "integer" 129 | #define MM_GENERAL_STR "general" 130 | #define MM_SYMM_STR "symmetric" 131 | #define MM_HERM_STR "hermitian" 132 | #define MM_SKEW_STR "skew-symmetric" 133 | #define MM_PATTERN_STR "pattern" 134 | 135 | /* high level routines */ 136 | 137 | int mm_write_mtx_crd(char fname[], 138 | std::size_t M, 139 | std::size_t N, 140 | std::size_t nz, 141 | std::size_t I[], 142 | std::size_t J[], 143 | double val[], 144 | MM_typecode matcode); 145 | 146 | int mm_read_mtx_crd_data(FILE* f, 147 | std::size_t M, 148 | std::size_t N, 149 | std::size_t nz, 150 | std::size_t I[], 151 | std::size_t J[], 152 | double val[], 153 | MM_typecode matcode); 154 | 155 | int mm_read_mtx_crd_entry(FILE* f, 156 | std::size_t* I, 157 | std::size_t* J, 158 | double* real, 159 | double* img, 160 | MM_typecode matcode); 161 | 162 | int mm_read_unsymmetric_sparse(const char* fname, 163 | std::size_t* M_, 164 | std::size_t* N_, 165 | std::size_t* nz_, 166 | double** val_, 167 | std::size_t** I_, 168 | std::size_t** J_); 169 | 170 | #endif 171 | 172 | #if defined(__cplusplus) 173 | } 174 | #endif -------------------------------------------------------------------------------- /include/loops/container/formats.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file formats.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief 5 | * @version 0.1 6 | * @date 2020-10-05 7 | * 8 | * @copyright Copyright (c) 2020 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | 16 | namespace loops { 17 | 18 | using namespace memory; 19 | 20 | // Forward decleration 21 | template 22 | struct coo_t; 23 | 24 | template 28 | struct csr_t; 29 | 30 | template 34 | struct csc_t; 35 | 36 | } // namespace loops 37 | 38 | #include 39 | #include 40 | #include -------------------------------------------------------------------------------- /include/loops/container/market.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matrix_market.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Matrix Market format reader. 
5 | * @see http://math.nist.gov/MatrixMarket/ 6 | * @version 0.1 7 | * @date 2020-10-09 8 | * 9 | * @copyright Copyright (c) 2020 10 | * 11 | */ 12 | 13 | #pragma once 14 | 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace loops { 26 | 27 | using namespace memory; 28 | 29 | /** 30 | * @brief Matrix Market format supports two kind of formats, a sparse coordinate 31 | * format and a dense array format. 32 | * 33 | */ 34 | enum matrix_market_format_t { coordinate, array }; 35 | 36 | /** 37 | * @brief Data type defines the type of data presented in the file, things like, 38 | * are they real numbers, complex (real and imaginary), pattern (do not have 39 | * weights/nonzero-values), etc. 40 | * 41 | */ 42 | enum matrix_market_data_t { real, complex, pattern, integer }; 43 | 44 | /** 45 | * @brief Storage scheme defines the storage structure, symmetric matrix for 46 | * example will be symmetric over the diagonal. Skew is skew symmetric. Etc. 47 | * 48 | */ 49 | enum matrix_market_storage_scheme_t { general, hermitian, symmetric, skew }; 50 | 51 | /** 52 | * @brief Reads a MARKET graph from an input-stream 53 | * into a specified sparse format 54 | * 55 | * Here is an example of the matrix market format 56 | * +----------------------------------------------+ 57 | * |%%MatrixMarket matrix coordinate real general | <--- header line 58 | * |% | <--+ 59 | * |% comments | |-- 0 or more comments 60 | * |% | <--+ 61 | * | M N L | <--- rows, columns, entries 62 | * | I1 J1 A(I1, J1) | <--+ 63 | * | I2 J2 A(I2, J2) | | 64 | * | I3 J3 A(I3, J3) | |-- L lines 65 | * | . . . | | 66 | * | IL JL A(IL, JL) | <--+ 67 | * +----------------------------------------------+ 68 | * 69 | * Indices are 1-based i.2. A(1,1) is the first element. 70 | */ 71 | template 72 | struct matrix_market_t { 73 | // typedef FILE* file_t; 74 | // typedef MM_typecode matrix_market_code_t; 75 | 76 | using file_t = FILE*; 77 | using matrix_market_code_t = MM_typecode; 78 | 79 | std::string filename; 80 | std::string dataset; 81 | 82 | // Dataset characteristics 83 | matrix_market_code_t code; // (ALL INFORMATION) 84 | matrix_market_format_t format; // Sparse coordinate or dense array 85 | matrix_market_data_t data; // Data type 86 | matrix_market_storage_scheme_t scheme; // Storage scheme 87 | 88 | matrix_market_t() {} 89 | ~matrix_market_t() {} 90 | 91 | /** 92 | * @brief Loads the given .mtx file into a coordinate format, and returns the 93 | * coordinate array. This needs to be further extended to support dense 94 | * arrays, those are the only two formats mtx are written in. 
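 *
 * @par Example
 * A minimal sketch (the template arguments and the file path are
 * placeholders; the listing above dropped the contents of angle brackets):
 * \code{.cpp}
 * loops::matrix_market_t<int, int, float> mtx;
 * auto coo = mtx.load("path/to/matrix.mtx");  // coordinate (COO) container
 * loops::csr_t<int, int, float> csr(coo);     // convert COO -> CSR
 * \endcode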
95 | * 96 | * @param _filename input file name (.mtx) 97 | * @return coordinate sparse format 98 | */ 99 | auto load(std::string _filename) { 100 | filename = _filename; 101 | dataset = extract_dataset(extract_filename(filename)); 102 | 103 | file_t file; 104 | 105 | // Load MTX information 106 | if ((file = fopen(filename.c_str(), "r")) == NULL) { 107 | std::cerr << "File could not be opened: " << filename << std::endl; 108 | exit(1); 109 | } 110 | 111 | if (mm_read_banner(file, &code) != 0) { 112 | std::cerr << "Could not process Matrix Market banner" << std::endl; 113 | exit(1); 114 | } 115 | 116 | std::size_t num_rows, num_columns, num_nonzeros; 117 | if ((mm_read_mtx_crd_size(file, &num_rows, &num_columns, &num_nonzeros)) != 118 | 0) { 119 | std::cerr << "Could not read file info (M, N, NNZ)" << std::endl; 120 | exit(1); 121 | } 122 | 123 | error::throw_if_exception( 124 | num_rows >= std::numeric_limits::max() || 125 | num_columns >= std::numeric_limits::max(), 126 | "index_t overflow"); 127 | error::throw_if_exception( 128 | num_nonzeros >= std::numeric_limits::max(), 129 | "offset_t overflow"); 130 | 131 | // mtx are generally written as coordinate format 132 | coo_t coo( 133 | (index_t)num_rows, (index_t)num_columns, (offset_t)num_nonzeros); 134 | 135 | if (mm_is_coordinate(code)) 136 | format = matrix_market_format_t::coordinate; 137 | else 138 | format = matrix_market_format_t::array; 139 | 140 | if (mm_is_pattern(code)) { 141 | data = matrix_market_data_t::pattern; 142 | 143 | // pattern matrix defines sparsity pattern, but not values 144 | for (index_t i = 0; i < num_nonzeros; ++i) { 145 | std::size_t row_index{0}, col_index{0}; 146 | auto num_assigned = fscanf(file, " %zu %zu \n", &row_index, &col_index); 147 | error::throw_if_exception(num_assigned != 2, 148 | "Could not read edge from market file"); 149 | error::throw_if_exception(row_index == 0, 150 | "Market file is zero-indexed"); 151 | error::throw_if_exception(col_index == 0, 152 | "Market file is zero-indexed"); 153 | // set and adjust from 1-based to 0-based indexing 154 | coo.row_indices[i] = (index_t)row_index - 1; 155 | coo.col_indices[i] = (index_t)col_index - 1; 156 | coo.values[i] = (type_t)1.0; // use value 1.0 for all nonzero entries 157 | } 158 | } else if (mm_is_real(code) || mm_is_integer(code)) { 159 | if (mm_is_real(code)) 160 | data = matrix_market_data_t::real; 161 | else 162 | data = matrix_market_data_t::integer; 163 | 164 | for (index_t i = 0; i < coo.nnzs; ++i) { 165 | std::size_t row_index{0}, col_index{0}; 166 | double weight{0.0}; 167 | 168 | auto num_assigned = 169 | fscanf(file, " %zu %zu %lf \n", &row_index, &col_index, &weight); 170 | 171 | error::throw_if_exception( 172 | num_assigned != 3, "Could not read weighted edge from market file"); 173 | error::throw_if_exception(row_index == 0, 174 | "Market file is zero-indexed"); 175 | error::throw_if_exception(col_index == 0, 176 | "Market file is zero-indexed"); 177 | 178 | coo.row_indices[i] = (index_t)row_index - 1; 179 | coo.col_indices[i] = (index_t)col_index - 1; 180 | coo.values[i] = (type_t)weight; 181 | } 182 | } else { 183 | std::cerr << "Unrecognized matrix market format type" << std::endl; 184 | exit(1); 185 | } 186 | 187 | if (mm_is_symmetric(code)) { // duplicate off diagonal entries 188 | scheme = matrix_market_storage_scheme_t::symmetric; 189 | index_t off_diagonals = 0; 190 | for (index_t i = 0; i < coo.nnzs; ++i) { 191 | if (coo.row_indices[i] != coo.col_indices[i]) 192 | ++off_diagonals; 193 | } 194 | 195 | index_t _nonzeros = 
2 * off_diagonals + (coo.nnzs - off_diagonals); 196 | 197 | vector_t _I(_nonzeros); 198 | vector_t _J(_nonzeros); 199 | vector_t _V(_nonzeros); 200 | 201 | index_t ptr = 0; 202 | for (index_t i = 0; i < coo.nnzs; ++i) { 203 | if (coo.row_indices[i] != coo.col_indices[i]) { 204 | _I[ptr] = coo.row_indices[i]; 205 | _J[ptr] = coo.col_indices[i]; 206 | _V[ptr] = coo.values[i]; 207 | ++ptr; 208 | _J[ptr] = coo.row_indices[i]; 209 | _I[ptr] = coo.col_indices[i]; 210 | _V[ptr] = coo.values[i]; 211 | ++ptr; 212 | } else { 213 | _I[ptr] = coo.row_indices[i]; 214 | _J[ptr] = coo.col_indices[i]; 215 | _V[ptr] = coo.values[i]; 216 | ++ptr; 217 | } 218 | } 219 | coo.row_indices = _I; 220 | coo.col_indices = _J; 221 | coo.values = _V; 222 | coo.nnzs = _nonzeros; 223 | } // end symmetric case 224 | 225 | fclose(file); 226 | return coo; 227 | } 228 | }; 229 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/container/matrix.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace loops { 7 | 8 | template 9 | struct matrix_t { 10 | std::size_t rows; 11 | std::size_t cols; 12 | 13 | vector_t m_data; 14 | value_t* m_data_ptr; 15 | 16 | matrix_t() : rows(0), cols(0), m_data(), m_data_ptr(nullptr) {} 17 | 18 | matrix_t(std::size_t r, std::size_t c) 19 | : rows(r), 20 | cols(c), 21 | m_data(r * c), 22 | m_data_ptr(memory::raw_pointer_cast(m_data.data())) {} 23 | 24 | __host__ __device__ matrix_t(const matrix_t& other) 25 | : rows(other.rows), cols(other.cols), m_data_ptr(other.m_data_ptr) {} 26 | 27 | __host__ __device__ __forceinline__ value_t operator()(int r, int c) const { 28 | std::size_t idx = (cols * r) + c; 29 | return m_data_ptr[idx]; 30 | } 31 | 32 | __host__ __device__ __forceinline__ value_t& operator()(int r, int c) { 33 | std::size_t idx = (cols * r) + c; 34 | return m_data_ptr[idx]; 35 | } 36 | 37 | __host__ __device__ __forceinline__ value_t 38 | operator[](std::size_t index) const { 39 | std::size_t r = index / cols; 40 | std::size_t c = index % cols; 41 | std::size_t idx = (cols * r) + c; 42 | return m_data_ptr[idx]; 43 | } 44 | 45 | __host__ __device__ __forceinline__ value_t& operator[](std::size_t index) { 46 | std::size_t r = index / cols; 47 | std::size_t c = index % cols; 48 | std::size_t idx = (cols * r) + c; 49 | return m_data_ptr[idx]; 50 | } 51 | }; 52 | 53 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/container/vector.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file vector.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief support for stl based vectors on gpu. Relies on thrust::host and 5 | * device vectors. 6 | * 7 | * @version 0.1 8 | * @date 2022-02-03 9 | * 10 | * @copyright Copyright (c) 2022 11 | * 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | 18 | // includes: thrust 19 | #include 20 | #include 21 | 22 | namespace loops { 23 | using namespace memory; 24 | 25 | /** 26 | * @brief vector container for GPU and CPU. 
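 *
 * @par Example
 * A small sketch of the host/device selection (assuming the stripped
 * template parameters are the element type and a memory_space_t):
 * \code{.cpp}
 * loops::vector_t<float, loops::memory_space_t::device> d_x(1024, 1.0f);
 * loops::vector_t<float, loops::memory_space_t::host> h_x = d_x;  // device -> host copy
 * \endcode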
27 | * 28 | * @tparam type_t data type of the vector 29 | * @tparam space (@see loops::memory_space_t) 30 | */ 31 | template 33 | using vector_t = 34 | std::conditional_t, // host_type 36 | thrust::device_vector // device_type 37 | >; 38 | 39 | template 40 | using host_vector_t = thrust::host_vector; 41 | template 42 | using device_vector_t = thrust::device_vector; 43 | 44 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/error.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace loops { 8 | 9 | /** 10 | * @namespace error 11 | * Error utilities for exception handling within device and host code. 12 | */ 13 | namespace error { 14 | 15 | typedef cudaError_t error_t; 16 | 17 | /** 18 | * @brief Exception class for errors in device code. 19 | * 20 | */ 21 | struct exception_t : std::exception { 22 | std::string report; 23 | 24 | exception_t(error_t _status, std::string _message = "") { 25 | report = cudaGetErrorString(_status) + std::string("\t: ") + _message; 26 | } 27 | 28 | exception_t(std::string _message = "") { report = _message; } 29 | virtual const char* what() const noexcept { return report.c_str(); } 30 | }; 31 | 32 | /** 33 | * @brief Throw an exception if the given error code is not cudaSuccess. 34 | * 35 | * @param status error_t error code (equivalent to cudaError_t). 36 | * @param message custom message to be appended to the error message. 37 | */ 38 | inline void throw_if_exception(error_t status, std::string message = "") { 39 | if (status != cudaSuccess) 40 | throw exception_t(status, message); 41 | } 42 | 43 | inline void throw_if_exception(bool is_exception, std::string message = "") { 44 | if (is_exception) 45 | throw exception_t(message); 46 | } 47 | 48 | } // namespace error 49 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/memory.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file memory.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief 5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | 17 | #include 18 | 19 | namespace loops { 20 | namespace memory { 21 | 22 | /** 23 | * @brief memory space; cuda (device) or host. 24 | * Can be extended to support uvm and multi-gpu. 25 | * 26 | * @todo change this enum to support cudaMemoryType 27 | * (see ref; std::underlying_type::type) 28 | * instead of some random enums, we can rely 29 | * on cudaMemoryTypeHost/Device/Unregistered/Managed 30 | * for this. 31 | * 32 | */ 33 | enum memory_space_t { device, host, managed }; 34 | 35 | /** 36 | * @brief Wrapper around thrust::raw_pointer_cast() to accept .data() or raw 37 | * pointer and return a raw pointer. Useful when we would like to return a raw 38 | * pointer of either a thrust device vector or a host vector. Because thrust 39 | * device vector's raw pointer is accessed by `.data().get()`, whereas thrust 40 | * host vector's raw pointer is simply `data()`. So, when calling these 41 | * functions on `.data()`, it can cast either a host or device vector. 
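 *
 * @par Example
 * Illustrative sketch of the two overloads:
 * \code{.cpp}
 * thrust::device_vector<float> d_x(16);
 * thrust::host_vector<float> h_x(16);
 * float* d_ptr = loops::memory::raw_pointer_cast(d_x.data());  // thrust::device_ptr overload
 * float* h_ptr = loops::memory::raw_pointer_cast(h_x.data());  // plain pointer overload
 * \endcode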
42 | * 43 | * @tparam type_t 44 | * @param pointer 45 | * @return type_t* 46 | */ 47 | template 48 | inline type_t* raw_pointer_cast(thrust::device_ptr pointer) { 49 | return thrust::raw_pointer_cast(pointer); 50 | } 51 | 52 | /** 53 | * @brief Wrapper around thrust::raw_pointer_cast() to accept .data() or raw 54 | * pointer and return a raw pointer. Useful when we would like to return a raw 55 | * pointer of either a thrust device vector or a host vector. Because thrust 56 | * device vector's raw pointer is accessed by `.data().get()`, whereas thrust 57 | * host vector's raw pointer is simply `data()`. So, when calling these 58 | * functions on `.data()`, it can cast either a host or device vector. 59 | * 60 | * @tparam type_t 61 | * @param pointer 62 | * @return type_t* 63 | */ 64 | template 65 | __host__ __device__ inline type_t* raw_pointer_cast(type_t* pointer) { 66 | return thrust::raw_pointer_cast(pointer); 67 | } 68 | 69 | } // namespace memory 70 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/range.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file range.hxx 3 | * @brief Code is based on Mark Harris' work: 4 | * https://github.com/harrism/cpp11-range/blob/master/range.hpp 5 | * @version 0.1 6 | * @date 2022-02-02 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | #pragma once 12 | 13 | #include 14 | #include 15 | 16 | namespace loops { 17 | namespace detail { 18 | 19 | template 20 | struct range_iter_base : std::iterator { 21 | __host__ __device__ range_iter_base(type_t current) : current(current) {} 22 | 23 | __host__ __device__ type_t operator*() const { return current; } 24 | 25 | __host__ __device__ type_t const* operator->() const { return ¤t; } 26 | 27 | __host__ __device__ range_iter_base& operator++() { 28 | ++current; 29 | return *this; 30 | } 31 | 32 | __host__ __device__ range_iter_base operator++(int) { 33 | auto copy = *this; 34 | ++*this; 35 | return copy; 36 | } 37 | 38 | __host__ __device__ bool operator==(range_iter_base const& other) const { 39 | return current == other.current; 40 | } 41 | 42 | __host__ __device__ bool operator!=(range_iter_base const& other) const { 43 | return !(*this == other); 44 | } 45 | 46 | protected: 47 | type_t current; 48 | }; 49 | 50 | } // namespace detail 51 | 52 | template 53 | struct range_proxy { 54 | struct iter : detail::range_iter_base { 55 | __host__ __device__ iter(type_t current) 56 | : detail::range_iter_base(current) {} 57 | }; 58 | 59 | struct step_range_proxy { 60 | struct iter : detail::range_iter_base { 61 | __host__ __device__ iter(type_t current, type_t step) 62 | : detail::range_iter_base(current), step(step) {} 63 | 64 | using detail::range_iter_base::current; 65 | 66 | __host__ __device__ iter& operator++() { 67 | current += step; 68 | return *this; 69 | } 70 | 71 | __host__ __device__ iter operator++(int) { 72 | auto copy = *this; 73 | ++*this; 74 | return copy; 75 | } 76 | 77 | // Loses commutativity. Iterator-based ranges are simply broken. 78 | __host__ __device__ bool operator==(iter const& other) const { 79 | return step > 0 ? 
current >= other.current : current < other.current; 80 | } 81 | 82 | __host__ __device__ bool operator!=(iter const& other) const { 83 | return !(*this == other); 84 | } 85 | 86 | private: 87 | type_t step; 88 | }; 89 | 90 | __host__ __device__ step_range_proxy(type_t begin, type_t end, type_t step) 91 | : begin_(begin, step), end_(end, step) {} 92 | 93 | __host__ __device__ iter begin() const { return begin_; } 94 | 95 | __host__ __device__ iter end() const { return end_; } 96 | 97 | private: 98 | iter begin_; 99 | iter end_; 100 | }; 101 | 102 | __host__ __device__ range_proxy(type_t begin, type_t end) 103 | : begin_(begin), end_(end) {} 104 | 105 | __host__ __device__ step_range_proxy step(type_t step) { 106 | return {*begin_, *end_, step}; 107 | } 108 | 109 | __host__ __device__ iter begin() const { return begin_; } 110 | 111 | __host__ __device__ iter end() const { return end_; } 112 | 113 | private: 114 | iter begin_; 115 | iter end_; 116 | }; 117 | 118 | template 119 | struct infinite_range_proxy { 120 | struct iter : detail::range_iter_base { 121 | __host__ __device__ iter(type_t current = type_t()) 122 | : detail::range_iter_base(current) {} 123 | 124 | __host__ __device__ bool operator==(iter const&) const { return false; } 125 | 126 | __host__ __device__ bool operator!=(iter const&) const { return true; } 127 | }; 128 | 129 | struct step_range_proxy { 130 | struct iter : detail::range_iter_base { 131 | __host__ __device__ iter(type_t current = type_t(), 132 | type_t step = type_t()) 133 | : detail::range_iter_base(current), step(step) {} 134 | 135 | using detail::range_iter_base::current; 136 | 137 | __host__ __device__ iter& operator++() { 138 | current += step; 139 | return *this; 140 | } 141 | 142 | __host__ __device__ iter operator++(int) { 143 | auto copy = *this; 144 | ++*this; 145 | return copy; 146 | } 147 | 148 | __host__ __device__ bool operator==(iter const&) const { return false; } 149 | 150 | __host__ __device__ bool operator!=(iter const&) const { return true; } 151 | 152 | private: 153 | type_t step; 154 | }; 155 | 156 | __host__ __device__ step_range_proxy(type_t begin, type_t step) 157 | : begin_(begin, step) {} 158 | 159 | __host__ __device__ iter begin() const { return begin_; } 160 | 161 | __host__ __device__ iter end() const { return iter(); } 162 | 163 | private: 164 | iter begin_; 165 | }; 166 | 167 | __host__ __device__ infinite_range_proxy(type_t begin) : begin_(begin) {} 168 | 169 | __host__ __device__ step_range_proxy step(type_t step) { 170 | return step_range_proxy(*begin_, step); 171 | } 172 | 173 | __host__ __device__ iter begin() const { return begin_; } 174 | 175 | __host__ __device__ iter end() const { return iter(); } 176 | 177 | private: 178 | iter begin_; 179 | }; 180 | 181 | template 182 | __host__ __device__ range_proxy range(type_t begin, type_t end) { 183 | return {begin, end}; 184 | } 185 | 186 | template 187 | __host__ __device__ infinite_range_proxy range(type_t begin) { 188 | return {begin}; 189 | } 190 | 191 | namespace traits { 192 | 193 | template 194 | struct has_size { 195 | template 196 | static constexpr auto check(type_t*) -> typename std::is_integral< 197 | decltype(std::declval().size())>::type; 198 | 199 | template 200 | static constexpr auto check(...) 
-> std::false_type; 201 | 202 | using type = decltype(check(0)); 203 | static constexpr bool value = type::value; 204 | }; 205 | 206 | } // namespace traits 207 | 208 | template ::value>> 210 | __host__ __device__ auto indices(C const& cont) 211 | -> range_proxy { 212 | return {0, cont.size()}; 213 | } 214 | 215 | template 216 | __host__ __device__ range_proxy indices(type_t (&)[N]) { 217 | return {0, N}; 218 | } 219 | 220 | template 221 | range_proxy::size_type> 222 | __host__ __device__ indices(std::initializer_list&& cont) { 223 | return {0, cont.size()}; 224 | } 225 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/schedule.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file schedule.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Header file for the schedule class. 5 | * @version 0.1 6 | * @date 2022-02-04 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | 16 | namespace loops { 17 | namespace schedule { 18 | 19 | /** 20 | * @brief Load balancing algorithms. 21 | * 22 | */ 23 | enum algorithms_t { 24 | merge_path_flat, /// < Merge-path flat scheduling algorithm. 25 | work_oriented, /// < Work oriented scheduling algorithm. 26 | thread_mapped, /// < Thread mapped scheduling algorithm. 27 | group_mapped, /// < Group mapped scheduling algorithm. 28 | bucketing, /// < Bucketing scheduling algorithm. 29 | }; 30 | 31 | template 32 | class atom_traits; 33 | 34 | template 35 | class tile_traits; 36 | 37 | /** 38 | * @brief Schedule's setup interface. 39 | * 40 | * @tparam scheme The scheduling algorithm. 41 | * @tparam threads_per_block Number of threads per block. 42 | * @tparam threads_per_tile Number of threads per tile. 43 | * @tparam tiles_t Type of the tiles. 44 | * @tparam atoms_t Type of the atoms. 45 | * @tparam tile_size_t Type of the tile size (default: std::size_t). 46 | * @tparam atom_size_t Type of the atom size (default: std::size_t). 47 | */ 48 | template 55 | class setup; 56 | 57 | } // namespace schedule 58 | } // namespace loops 59 | 60 | #include 61 | #include 62 | #include 63 | #include -------------------------------------------------------------------------------- /include/loops/schedule/group_mapped.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file group_mapped.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Group-mapped schedule (map work to tiles, process using individual 5 | * threads within the tile.) 
6 | * @version 0.1 7 | * @date 2022-02-04 8 | * 9 | * @copyright Copyright (c) 2022 10 | * 11 | */ 12 | 13 | #pragma once 14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | #ifndef _CG_ABI_EXPERIMENTAL 22 | #define _CG_ABI_EXPERIMENTAL 23 | #endif 24 | 25 | #include 26 | #include 27 | 28 | namespace loops { 29 | namespace schedule { 30 | 31 | namespace cg = cooperative_groups; 32 | #if CUDART_VERSION >= 12000 33 | namespace cg_x = cg; 34 | #else 35 | namespace cg_x = cooperative_groups::experimental; 36 | #endif 37 | 38 | template 39 | class atom_traits { 40 | public: 41 | using atoms_t = atoms_type; 42 | using atoms_iterator_t = atoms_t*; 43 | using atom_size_t = atom_size_type; 44 | 45 | __host__ __device__ atom_traits() : size_(0), atoms_(nullptr) {} 46 | __host__ __device__ atom_traits(atom_size_t size) 47 | : size_(size), atoms_(nullptr) {} 48 | __host__ __device__ atom_traits(atom_size_t size, atoms_iterator_t atoms) 49 | : size_(size), atoms_(atoms) {} 50 | 51 | __host__ __device__ atom_size_t size() const { return size_; } 52 | __host__ __device__ atoms_iterator_t begin() { return atoms_; }; 53 | __host__ __device__ atoms_iterator_t end() { return atoms_ + size_; }; 54 | 55 | private: 56 | atom_size_t size_; 57 | atoms_iterator_t atoms_; 58 | }; 59 | 60 | template 61 | class tile_traits { 62 | public: 63 | using tiles_t = tiles_type; 64 | using tiles_iterator_t = tiles_t*; 65 | using tile_size_t = tile_size_type; 66 | 67 | __host__ __device__ tile_traits() : size_(0), tiles_(nullptr) {} 68 | __host__ __device__ tile_traits(tile_size_t size, tiles_iterator_t tiles) 69 | : size_(size), tiles_(tiles) {} 70 | 71 | __host__ __device__ tile_size_t size() const { return size_; } 72 | __host__ __device__ tiles_iterator_t begin() { return tiles_; }; 73 | __host__ __device__ tiles_iterator_t end() { return tiles_ + size_; }; 74 | 75 | private: 76 | tile_size_t size_; 77 | tiles_iterator_t tiles_; 78 | }; 79 | 80 | template 86 | class setup : public tile_traits, 95 | public atom_traits { 98 | public: 99 | using tiles_t = tiles_type; 100 | using atoms_t = atoms_type; 101 | using tiles_iterator_t = tiles_t*; 102 | using atoms_iterator_t = atoms_t*; 103 | using tile_size_t = tile_size_type; 104 | using atom_size_t = atom_size_type; 105 | 106 | using tile_traits_t = 107 | tile_traits; 108 | using atom_traits_t = 109 | atom_traits; 110 | 111 | enum : unsigned int { 112 | threads_per_block = THREADS_PER_BLOCK, 113 | threads_per_tile = THREADS_PER_TILE, 114 | tiles_per_block = THREADS_PER_BLOCK / THREADS_PER_TILE, 115 | }; 116 | 117 | /// Temporary storage buffer for schedule algorithm. 118 | struct __align__(32) storage_t { 119 | #if CUDART_VERSION >= 12000 120 | cg_x::block_tile_memory groups; 121 | #else 122 | cg_x::block_tile_memory<4, threads_per_block> groups; 123 | #endif 124 | atoms_t tile_aggregates[threads_per_block / threads_per_tile]; 125 | atoms_t atoms_offsets[threads_per_block]; 126 | tiles_t tiles_indices[threads_per_block]; 127 | }; 128 | 129 | storage_t& buffer; 130 | 131 | /** 132 | * @brief Construct a setup object for load balance schedule. 133 | * 134 | * @param tiles Tiles iterator. 135 | * @param num_tiles Number of tiles. 136 | * @param num_atoms Number of atoms. 
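 *
 * @par Usage sketch
 * Rough shape of a kernel body built on this schedule. The member functions
 * are the ones defined in this header; the kernel-side variable names and
 * the template arguments (lost in the listing above) are assumptions.
 * \code{.cpp}
 * using setup_t =
 *     loops::schedule::setup<loops::schedule::algorithms_t::group_mapped,
 *                            256, 32, offset_t, index_t>;
 * __shared__ setup_t::storage_t temporary_storage;
 * setup_t config(temporary_storage, row_offsets, num_rows, num_nonzeros);
 *
 * auto p = config.partition();
 * for (auto v_atom : config.atom_accessor(p)) {
 *   auto v_tile = config.tile_accessor(v_atom, p);
 *   if (!config.is_valid_accessor(v_tile, p))
 *     continue;
 *   auto row = config.tile_id(v_tile, p);
 *   auto nz = config.atom_id(v_atom, row, v_tile, p);
 *   // process nonzero nz belonging to row.
 * }
 * \endcode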
137 | */ 138 | __device__ __forceinline__ setup(storage_t& _buffer, 139 | tiles_iterator_t _tiles, 140 | tile_size_t _num_tiles, 141 | atom_size_t _num_atoms) 142 | : buffer(_buffer), 143 | tile_traits_t(_num_tiles, _tiles), 144 | atom_traits_t(_num_atoms) {} 145 | 146 | __device__ __forceinline__ auto partition() { 147 | auto g = cg::this_grid(); 148 | auto b = cg_x::this_thread_block(buffer.groups); 149 | auto p = cg_x::tiled_partition(b); 150 | 151 | auto index = g.thread_rank(); 152 | auto tile_index = p.thread_rank() + (p.meta_group_rank() * p.size()); 153 | if (index < tile_traits_t::size()) { 154 | buffer.tiles_indices[tile_index] = index; 155 | } else { 156 | buffer.tiles_indices[tile_index] = -1; 157 | } 158 | return p; 159 | } 160 | 161 | template 162 | __device__ step_range_t atom_accessor(partition_t& p) { 163 | atoms_t* p_st = 164 | buffer.atoms_offsets + (p.meta_group_rank() * threads_per_tile); 165 | auto g = cg::this_grid(); 166 | auto index = g.thread_rank(); 167 | atoms_t num_atoms = 0; 168 | if (index < tile_traits_t::size()) { 169 | num_atoms = 170 | tile_traits_t::begin()[index + 1] - tile_traits_t::begin()[index]; 171 | } 172 | 173 | p_st[p.thread_rank()] = cg::exclusive_scan(p, num_atoms); 174 | p.sync(); 175 | 176 | if (p.thread_rank() == p.size() - 1) { 177 | // Accumulate tiled aggregates. 178 | buffer.tile_aggregates[p.meta_group_rank()] = 179 | p_st[p.thread_rank()] + num_atoms; 180 | } 181 | 182 | p.sync(); 183 | atoms_t aggregate_atoms = buffer.tile_aggregates[p.meta_group_rank()]; 184 | return custom_stride_range(atoms_t(p.thread_rank()), aggregate_atoms, 185 | atoms_t(p.size())); 186 | } 187 | 188 | template 189 | __device__ __forceinline__ int get_length(partition_t& p) { 190 | auto g = cg::this_grid(); 191 | 192 | auto thread_id = g.thread_rank(); 193 | auto local_id = p.thread_rank(); 194 | 195 | int length = thread_id - local_id + p.size(); 196 | if (tile_traits_t::size() < length) 197 | length = tile_traits_t::size(); 198 | 199 | length -= thread_id - local_id; 200 | return length; 201 | } 202 | 203 | template 204 | __device__ __forceinline__ tiles_t tile_accessor(atoms_t& virtual_atom, 205 | partition_t& p) { 206 | int length = get_length(p); 207 | atoms_t* p_st = 208 | buffer.atoms_offsets + (p.meta_group_rank() * threads_per_tile); 209 | auto it = 210 | thrust::upper_bound(thrust::seq, p_st, p_st + length, virtual_atom); 211 | auto x = thrust::distance(p_st, it) - 1; 212 | return x; 213 | } 214 | 215 | template 216 | __device__ __forceinline__ bool is_valid_accessor(tiles_t& tile_id, 217 | partition_t& p) { 218 | return tile_id < get_length(p); 219 | } 220 | 221 | template 222 | __device__ __forceinline__ tiles_t tile_id(tiles_t& v_tile_id, 223 | partition_t& p) { 224 | return buffer.tiles_indices[v_tile_id + (p.meta_group_rank() * p.size())]; 225 | } 226 | 227 | template 228 | __device__ __forceinline__ atoms_t atom_id(atoms_t& v_atom, 229 | tiles_t& tile_id, 230 | tiles_t& v_tile_id, 231 | partition_t& p) { 232 | atoms_t* p_st = 233 | buffer.atoms_offsets + (p.meta_group_rank() * threads_per_tile); 234 | return tile_traits_t::begin()[tile_id] + v_atom - p_st[v_tile_id]; 235 | } 236 | 237 | }; // namespace schedule 238 | 239 | template 240 | using warp_mapped = 241 | setup; 242 | 243 | template 244 | using block_mapped = setup; 249 | 250 | } // namespace schedule 251 | } // namespace loops 252 | -------------------------------------------------------------------------------- /include/loops/schedule/thread_mapped.hxx: 
-------------------------------------------------------------------------------- 1 | /** 2 | * @file thread_mapped.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief 5 | * @version 0.1 6 | * @date 2022-02-04 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | 17 | namespace loops { 18 | namespace schedule { 19 | 20 | /** 21 | * @brief Traits for Atom. 22 | * 23 | * @todo Implement an atom iterator, right now it is based on CSR only. Can be 24 | * abstracted very simply by allowing UDF iterators. 25 | * 26 | * @tparam atoms_type Type of the atoms. 27 | * @tparam atom_size_type Type of the atom size. 28 | */ 29 | template 30 | class atom_traits { 31 | public: 32 | using atoms_t = atoms_type; 33 | using atoms_iterator_t = atoms_t*; 34 | using atom_size_t = atom_size_type; 35 | 36 | __host__ __device__ atom_traits() : size_(0), atoms_(nullptr) {} 37 | __host__ __device__ atom_traits(atom_size_t size) 38 | : size_(size), atoms_(nullptr) {} 39 | __host__ __device__ atom_traits(atom_size_t size, atoms_iterator_t atoms) 40 | : size_(size), atoms_(atoms) {} 41 | 42 | __host__ __device__ atom_size_t size() const { return size_; } 43 | __host__ __device__ atoms_iterator_t begin() { return atoms_; }; 44 | __host__ __device__ atoms_iterator_t end() { return atoms_ + size_; }; 45 | 46 | private: 47 | atom_size_t size_; 48 | atoms_iterator_t atoms_; 49 | }; 50 | 51 | /** 52 | * @brief Traits for Tile. 53 | * 54 | * @todo Implement an tile iterator, right now it is based on CSR only. Can be 55 | * abstracted very simply by allowing UDF iterators. 56 | * 57 | * @tparam tiles_type Type of the tiles. 58 | * @tparam tile_size_type Type of the tile size (default: std::size_t). 59 | */ 60 | template 61 | class tile_traits { 62 | public: 63 | using tiles_t = tiles_type; 64 | using tiles_iterator_t = tiles_t*; 65 | using tile_size_t = tile_size_type; 66 | 67 | __host__ __device__ tile_traits() : size_(0), tiles_(nullptr) {} 68 | __host__ __device__ tile_traits(tile_size_t size, tiles_iterator_t tiles) 69 | : size_(size), tiles_(tiles) {} 70 | 71 | __host__ __device__ tile_size_t size() const { return size_; } 72 | __host__ __device__ tiles_iterator_t begin() { return tiles_; }; 73 | __host__ __device__ tiles_iterator_t end() { return tiles_ + size_; }; 74 | 75 | private: 76 | tile_size_t size_; 77 | tiles_iterator_t tiles_; 78 | }; 79 | 80 | /** 81 | * @brief Thread-mapped schedule's setup interface. 82 | * 83 | * @tparam tiles_type Type of the tiles. 84 | * @tparam atoms_type Type of the atoms. 85 | * @tparam tile_size_type Type of the tile size. 86 | * @tparam atom_size_type Type of the atom size. 87 | */ 88 | template 92 | class setup : public tile_traits, 101 | public atom_traits { 104 | public: 105 | using tiles_t = tiles_type; /// Tile Type 106 | using atoms_t = atoms_type; /// Atom Type 107 | using tiles_iterator_t = tiles_t*; /// Tile Iterator Type 108 | using atoms_iterator_t = tiles_t*; /// Atom Iterator Type 109 | using tile_size_t = tile_size_type; /// Tile Size Type 110 | using atom_size_t = atom_size_type; /// Atom Size Type 111 | 112 | using tile_traits_t = 113 | tile_traits; 114 | using atom_traits_t = 115 | atom_traits; 116 | 117 | /** 118 | * @brief Default constructor. 119 | * 120 | */ 121 | __host__ __device__ setup() : tile_traits_t(), atom_traits_t() {} 122 | 123 | /** 124 | * @brief Construct a setup object for load balance schedule. 125 | * 126 | * @param tiles Tiles iterator. 
127 | * @param num_tiles Number of tiles. 128 | * @param num_atoms Number of atoms. 129 | */ 130 | __host__ __device__ setup(tiles_t* tiles, 131 | tile_size_t num_tiles, 132 | atom_size_t num_atoms) 133 | : tile_traits_t(num_tiles, tiles), atom_traits_t(num_atoms) {} 134 | 135 | /** 136 | * @brief Creates a range of tiles to process within a for loop. 137 | * 138 | * @example The following code snippet shows how to use this function. 139 | * \code{.cpp} 140 | * for (auto t : config.tiles()) { 141 | * // Process tile t. 142 | * } 143 | * \endcode 144 | * 145 | * @return grid_stride_range Range of tiles to process. 146 | */ 147 | __device__ step_range_t tiles() const { 148 | return grid_stride_range(tile_size_t(0), tile_traits_t::size()); 149 | } 150 | 151 | /** 152 | * @brief Creates a range of atoms to process within a for loop. 153 | * 154 | * @param tid Tile ID for which to create the atom range for. 155 | * @return Range of atoms to process. 156 | */ 157 | __device__ auto atoms(const tile_size_t& tile) { 158 | return loops::range(tile_traits_t::begin()[tile], 159 | tile_traits_t::begin()[tile + 1]); 160 | } 161 | 162 | template 163 | __device__ auto atoms(const tile_size_t& tile, iterator_t count_entries) { 164 | return loops::range(count_entries(tile), tile_traits_t::begin()[tile + 1]); 165 | } 166 | }; 167 | 168 | } // namespace schedule 169 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/schedule/work_oriented.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file work_oriented.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Work-oriented scheduling algorithm (map even-share of work to 5 | * threads.) 6 | * @version 0.1 7 | * @date 2022-03-07 8 | * 9 | * @copyright Copyright (c) 2022 10 | * 11 | */ 12 | 13 | #pragma once 14 | 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace loops { 25 | namespace schedule { 26 | 27 | /** 28 | * @brief Traits for Atom. 29 | * 30 | * @todo Implement an atom iterator, right now it is based on CSR only. Can be 31 | * abstracted very simply by allowing UDF iterators. 32 | * 33 | * @tparam atoms_type Type of the atoms. 34 | * @tparam atom_size_type Type of the atom size. 35 | */ 36 | template 37 | class atom_traits { 38 | public: 39 | using atoms_t = atoms_type; 40 | using atoms_iterator_t = atoms_t*; 41 | using atom_size_t = atom_size_type; 42 | 43 | __host__ __device__ atom_traits() : size_(0), atoms_(nullptr) {} 44 | __host__ __device__ atom_traits(atom_size_t size) 45 | : size_(size), atoms_(nullptr) {} 46 | __host__ __device__ atom_traits(atom_size_t size, atoms_iterator_t atoms) 47 | : size_(size), atoms_(atoms) {} 48 | 49 | __host__ __device__ atom_size_t size() const { return size_; } 50 | __host__ __device__ atoms_iterator_t begin() { return atoms_; }; 51 | __host__ __device__ atoms_iterator_t end() { return atoms_ + size_; }; 52 | 53 | private: 54 | atom_size_t size_; 55 | atoms_iterator_t atoms_; 56 | }; 57 | 58 | /** 59 | * @brief Traits for Tile. 60 | * 61 | * @todo Implement an tile iterator, right now it is based on CSR only. Can be 62 | * abstracted very simply by allowing UDF iterators. 63 | * 64 | * @tparam tiles_type Type of the tiles. 65 | * @tparam tile_size_type Type of the tile size (default: std::size_t). 
66 | */ 67 | template 68 | class tile_traits { 69 | public: 70 | using tiles_t = tiles_type; 71 | using tiles_iterator_t = tiles_t*; 72 | using tile_size_t = tile_size_type; 73 | 74 | __host__ __device__ tile_traits() : size_(0), tiles_(nullptr) {} 75 | __host__ __device__ tile_traits(tile_size_t size, tiles_iterator_t tiles) 76 | : size_(size), tiles_(tiles) {} 77 | 78 | __host__ __device__ tile_size_t size() const { return size_; } 79 | __host__ __device__ tiles_iterator_t begin() { return tiles_; }; 80 | __host__ __device__ tiles_iterator_t end() { return tiles_ + size_; }; 81 | 82 | private: 83 | tile_size_t size_; 84 | tiles_iterator_t tiles_; 85 | }; 86 | 87 | /** 88 | * @brief Work-oriented schedule's setup interface. 89 | * 90 | * @tparam THREADS_PER_BLOCK Threads per block. 91 | * @tparam ITEMS_PER_THREAD Number of Items per thread to process. 92 | * @tparam tiles_type Type of the tiles. 93 | * @tparam atoms_type Type of the atoms. 94 | * @tparam tile_size_type Type of the tile size. 95 | * @tparam atom_size_type Type of the atom size. 96 | */ 97 | template 103 | class setup : public tile_traits, 112 | public atom_traits { 115 | public: 116 | using tiles_t = tiles_type; /// Tile Type 117 | using atoms_t = atoms_type; /// Atom Type 118 | using tiles_iterator_t = tiles_t*; /// Tile Iterator Type 119 | using atoms_iterator_t = tiles_t*; /// Atom Iterator Type 120 | using tile_size_t = tile_size_type; /// Tile Size Type 121 | using atom_size_t = atom_size_type; /// Atom Size Type 122 | 123 | using tile_traits_t = 124 | tile_traits; 125 | using atom_traits_t = 126 | atom_traits; 127 | 128 | enum : unsigned int { 129 | threads_per_block = THREADS_PER_BLOCK, 130 | items_per_thread = ITEMS_PER_THREAD, 131 | items_per_tile = threads_per_block * items_per_thread, 132 | }; 133 | 134 | /** 135 | * @brief Construct a setup object for load balance schedule. 136 | * 137 | * @param tiles Tiles iterator. 138 | * @param num_tiles Number of tiles. 139 | * @param num_atoms Number of atoms. 140 | */ 141 | __device__ __forceinline__ setup(tiles_iterator_t _tiles, 142 | tile_size_t _num_tiles, 143 | atom_size_t _num_atoms) 144 | : tile_traits_t(_num_tiles, _tiles), 145 | atom_traits_t(_num_atoms), 146 | total_work(_num_tiles + _num_atoms), 147 | num_threads(gridDim.x * threads_per_block), 148 | work_per_thread(math::ceil_div(total_work, num_threads)) {} 149 | 150 | __device__ __forceinline__ auto init() { 151 | thrust::counting_iterator atoms_indices(0); 152 | 153 | /// Calculate the diagonals. 154 | atom_size_t tid = threadIdx.x + blockIdx.x * blockDim.x; 155 | atom_size_t upper = 156 | min(atom_size_t(work_per_thread * tid), atom_size_t(total_work)); 157 | atom_size_t lower = 158 | min(atom_size_t(upper + work_per_thread), atom_size_t(total_work)); 159 | 160 | /// Search across the diagonals to find coordinates to process. 
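    /// Conceptually, the merge path is the path a sequential merge of the
    /// row-end offsets (tiles) and the nonzero indices (atoms) would follow;
    /// its length is total_work = num_tiles + num_atoms. Every thread owns an
    /// equal slice of that path, and the two binary searches below locate the
    /// (tile, atom) coordinates where the slice begins and ends.
    ///
    /// Illustrative example: with 2 tiles, 6 atoms and 4 threads,
    /// work_per_thread = ceil(8 / 4) = 2, so thread 0 covers diagonals
    /// [0, 2), thread 1 covers [2, 4), and so on.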
161 | auto st = search(upper, tile_traits_t::begin() + 1, atoms_indices, 162 | tile_traits_t::size(), atom_traits_t::size()); 163 | auto en = search(lower, tile_traits_t::begin() + 1, atoms_indices, 164 | tile_traits_t::size(), atom_traits_t::size()); 165 | 166 | return thrust::make_pair(st, en); 167 | } 168 | 169 | template 170 | __device__ __forceinline__ step_range_t tiles(map_t& m) const { 171 | return custom_stride_range(tiles_t(m.first.first), tiles_t(m.second.first), 172 | tiles_t(1)); 173 | } 174 | 175 | template 176 | __device__ __forceinline__ step_range_t atoms(tiles_t t, map_t& m) { 177 | atoms_t nz_next_start = tile_traits_t::begin()[(t + 1)]; 178 | atoms_t nz_start = m.first.second; 179 | m.first.second += (nz_next_start - nz_start); 180 | return custom_stride_range(nz_start, nz_next_start, atoms_t(1)); 181 | } 182 | 183 | template 184 | __device__ __forceinline__ step_range_t remainder_tiles( 185 | map_t& m) const { 186 | return custom_stride_range(tiles_t(m.second.first), 187 | tiles_t(m.second.first + 1), tiles_t(1)); 188 | } 189 | 190 | template 191 | __device__ __forceinline__ step_range_t remainder_atoms( 192 | map_t& m) const { 193 | return custom_stride_range(atoms_t(m.first.second), 194 | (atoms_t)(m.second.second), atoms_t(1)); 195 | } 196 | 197 | private: 198 | /** 199 | * @brief Thrust based 2D binary-search for merge-path algorithm. 200 | * 201 | * @param diagonal Diagonal of the search. 202 | * @param a First iterator. 203 | * @param b Second iterator. 204 | * @param a_len Length of the first iterator. 205 | * @param b_len Length of the second iterator. 206 | * @return A coordinate. 207 | */ 208 | template 209 | __device__ __forceinline__ auto search(const offset_t& diagonal, 210 | const xit_t a, 211 | const yit_t b, 212 | const offset_t& a_len, 213 | const offset_t& b_len) { 214 | /// Diagonal search range (in x-coordinate space) 215 | /// Note that the subtraction can result into a negative number, in which 216 | /// case the max would result as 0. But if we use offset_t here, and it is 217 | /// an unsigned type, we would get strange behavior, possible an unwanted 218 | /// sign conversion that we do not want. 219 | int x_min = max(int(diagonal) - int(b_len), int(0)); 220 | int x_max = min(int(diagonal), int(a_len)); 221 | 222 | auto it = thrust::lower_bound( 223 | thrust::seq, // Sequential impl 224 | thrust::counting_iterator(x_min), // Start iterator @x_min 225 | thrust::counting_iterator(x_max), // End iterator @x_max 226 | diagonal, // ... 227 | [=] __device__(const offset_t& idx, const offset_t& diagonal) { 228 | return a[idx] <= b[diagonal - idx - 1]; 229 | }); 230 | 231 | return thrust::make_pair(min(*it, a_len), (diagonal - *it)); 232 | } 233 | 234 | std::size_t total_work; 235 | std::size_t num_threads; 236 | std::size_t work_per_thread; 237 | }; 238 | 239 | } // namespace schedule 240 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/stride_ranges.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file stride_ranges.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Stride ranges related functions. 
5 | * @version 0.1 6 | * @date 2022-02-02 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | #include 14 | 15 | namespace loops { 16 | template 17 | using step_range_t = typename range_proxy::step_range_proxy; 18 | 19 | /** 20 | * @brief Gride stride range defines a range that steps gridDim.x * blockDim.x 21 | * per step. 22 | * 23 | * @tparam T The type of the range. 24 | * @param begin The beginning of the range (0) 25 | * @param end End of the range, can be num_elements. 26 | * @return range_proxy::step_range_proxy range returned. 27 | */ 28 | template 29 | __device__ __forceinline__ step_range_t grid_stride_range(T begin, T end) { 30 | begin += blockDim.x * blockIdx.x + threadIdx.x; 31 | return range(begin, end).step(gridDim.x * blockDim.x); 32 | } 33 | 34 | /** 35 | * @brief Block stride range defines as a range that steps blockDim.x per step. 36 | * 37 | * @tparam T The type of the range. 38 | * @param begin The beginning of the range (e.g. 0) 39 | * @param end End of the range, can be num_elements. 40 | * @return step_range_t range returned. 41 | */ 42 | template 43 | __device__ __forceinline__ step_range_t block_stride_range(T begin, T end) { 44 | return range(begin, end).step(blockDim.x); 45 | } 46 | 47 | /** 48 | * @brief Custom stride range defines as a range that steps with a custom 49 | * stride. 50 | * 51 | * @tparam T The type of the range. 52 | * @param begin The beginning of the range (e.g. 0) 53 | * @param end End of the range, can be num_elements. 54 | * @param stride The stride of the range. 55 | * @return step_range_t range returned. 56 | */ 57 | template 58 | __device__ __forceinline__ step_range_t custom_stride_range(T begin, 59 | T end, 60 | T stride) { 61 | return range(begin, end).step(stride); 62 | } 63 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/util/device.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file device.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Device related functions. 5 | * @version 0.1 6 | * @date 2022-07-10 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | namespace loops { 15 | namespace device { 16 | typedef int device_id_t; 17 | 18 | /** 19 | * @brief Set the device to use. 20 | * 21 | * @param ordinal device id. 22 | */ 23 | void set(device_id_t ordinal) { 24 | cudaSetDevice(ordinal); 25 | } 26 | 27 | /** 28 | * @brief Get the device id. 29 | * 30 | * @return device_id_t device id. 31 | */ 32 | device_id_t get() { 33 | device_id_t ordinal; 34 | cudaGetDevice(&ordinal); 35 | return ordinal; 36 | } 37 | 38 | /** 39 | * @brief Properties of the device. 
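 *
 * @par Example
 * Illustrative sketch of sizing a grid from the device properties (the
 * blocks-per-SM factor is an arbitrary placeholder):
 * \code{.cpp}
 * loops::device::properties_t props;
 * int num_blocks = props.multi_processor_count() * 4;
 * \endcode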
40 | * 41 | */ 42 | struct properties_t { 43 | typedef cudaDeviceProp device_properties_t; 44 | device_properties_t properties; 45 | device_id_t ordinal; 46 | 47 | properties_t() : ordinal(device::get()) { 48 | cudaGetDeviceProperties(&properties, ordinal); 49 | } 50 | 51 | int multi_processor_count() { return properties.multiProcessorCount; } 52 | }; 53 | 54 | } // namespace device 55 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/util/equal.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file compare.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief 5 | * @version 0.1 6 | * @date 2022-02-16 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | namespace loops { 20 | namespace util { 21 | 22 | namespace detail { 23 | auto default_comparator = [](auto& a, auto& b) -> bool { return a != b; }; 24 | } 25 | 26 | /** 27 | * @brief Compares values between a device pointer and a host pointer of same 28 | * length, and returns number of errors/mismatches found. 29 | * 30 | * @tparam type_t Type of the values to be compared. 31 | * @tparam comp_t Type of the error comparator function. 32 | * @param d_ptr device pointer 33 | * @param h_ptr host pointer 34 | * @param n number of elements to compare 35 | * @param error_op lambda function to compare two value that result in a 36 | * mismatch count to increment (default a != b). 37 | * @param verbose if true, prints out the mismatches 38 | * @return std::size_t number of mismatches found 39 | */ 40 | template 42 | std::size_t equal(const type_t* d_ptr, 43 | const type_t* h_ptr, 44 | const std::size_t n, 45 | comp_t error_op = detail::default_comparator, 46 | const bool verbose = false) { 47 | thrust::host_vector d_vec(n); 48 | cudaMemcpy(d_vec.data(), d_ptr, n * sizeof(type_t), cudaMemcpyDeviceToHost); 49 | 50 | std::size_t error_count = 0; 51 | for (std::size_t i = 0; i < n; ++i) { 52 | if (error_op(d_vec[i], h_ptr[i])) { 53 | if (verbose) 54 | std::cout << "Error[" << i << "]: " << std::setw(10) << std::fixed 55 | << std::setprecision(std::numeric_limits::digits10) 56 | << d_vec[i] << " != " << std::setw(10) << h_ptr[i] 57 | << std::endl; 58 | ++error_count; 59 | } 60 | } 61 | return error_count; 62 | } 63 | 64 | } // namespace util 65 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/util/filepath.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file filepath.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Utilities for filepaths. 
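 *
 * Example (illustrative): for the path "datasets/chesapeake.mtx",
 * extract_filename() returns "chesapeake.mtx", extract_dataset() of that
 * result returns "chesapeake", and is_market() returns true.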
5 | * @version 0.1 6 | * @date 2022-02-03 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | 16 | namespace loops { 17 | 18 | std::string extract_filename(std::string path, std::string delim = "/") { 19 | size_t lastSlashIndex = path.find_last_of("/"); 20 | return path.substr(lastSlashIndex + 1); 21 | } 22 | 23 | std::string extract_dataset(std::string filename) { 24 | size_t lastindex = filename.find_last_of("."); 25 | return filename.substr(0, lastindex); 26 | } 27 | 28 | bool is_market(std::string filename) { 29 | return ((filename.substr(filename.size() - 4) == ".mtx") || 30 | (filename.substr(filename.size() - 5) == ".mmio")); 31 | } 32 | 33 | bool is_binary_csr(std::string filename) { 34 | return filename.substr(filename.size() - 4) == ".csr"; 35 | } 36 | 37 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/util/generate.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file generate.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief 5 | * @version 0.1 6 | * @date 2022-02-02 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | namespace loops { 24 | namespace generate { 25 | namespace random { 26 | 27 | /** 28 | * @brief Hash function for random numbers. 29 | * 30 | * @param a The number to hash. 31 | * @return unsigned int The hashed number. 32 | */ 33 | __forceinline__ __host__ __device__ unsigned int hash(unsigned int a) { 34 | a = (a + 0x7ed55d16) + (a << 12); 35 | a = (a ^ 0xc761c23c) ^ (a >> 19); 36 | a = (a + 0x165667b1) + (a << 5); 37 | a = (a + 0xd3a2646c) ^ (a << 9); 38 | a = (a + 0xfd7046c5) + (a << 3); 39 | a = (a ^ 0xb55a4f09) ^ (a >> 16); 40 | return a; 41 | } 42 | 43 | /** 44 | * @brief Generate a vector with uniform distribution of random numbers between 45 | * [begin, end]. 46 | * 47 | * @tparam vector_t vector type. 48 | * @tparam vector_t::value_type value type. 49 | * @param input thrust host or device vector. 50 | * @param begin range begin value. 51 | * @param end range end value. 52 | * @param seed random seed. 53 | */ 54 | template 55 | void uniform_distribution(iterator_t begin_it, 56 | iterator_t end_it, 57 | type_t begin, 58 | type_t end, 59 | unsigned int useed = std::chrono::system_clock::now() 60 | .time_since_epoch() 61 | .count()) { 62 | int size = thrust::distance(begin_it, end_it); 63 | auto generate_random = [=] __host__ __device__(std::size_t i) -> type_t { 64 | unsigned int seed = hash(i) * useed; 65 | if (std::is_floating_point_v) { 66 | thrust::default_random_engine rng(seed); 67 | thrust::uniform_real_distribution uniform(begin, end); 68 | return uniform(rng); 69 | } else { 70 | thrust::default_random_engine rng(seed); 71 | thrust::uniform_int_distribution uniform(begin, end); 72 | return uniform(rng); 73 | } 74 | }; 75 | 76 | thrust::transform(thrust::make_counting_iterator(0), 77 | thrust::make_counting_iterator(size), begin_it, 78 | generate_random); 79 | } 80 | 81 | using namespace memory; 82 | 83 | /** 84 | * @brief Generates a random Compressed Sparse Row (CSR) matrix. 85 | * 86 | * @tparam index_t index type. 87 | * @tparam offset_t offset type. 88 | * @tparam value_t value type. 89 | * @param rows number of rows. 90 | * @param columns number of columns. 
91 | * @param sparsity sparsity ratio of the matrix. 92 | * @param matrix output CSR matrix. 93 | */ 94 | template <typename index_t, typename offset_t, typename value_t> 95 | void csr(std::size_t rows, 96 | std::size_t cols, 97 | float sparsity, 98 | csr_t& matrix) { 99 | std::size_t nnzs = sparsity * (rows * cols); 100 | coo_t coo(rows, cols, nnzs); 101 | 102 | // Generate random indices and values. 103 | uniform_distribution(coo.row_indices.begin(), coo.row_indices.end(), index_t(0), 104 | index_t(rows)); 105 | uniform_distribution(coo.col_indices.begin(), coo.col_indices.end(), 106 | index_t(0), index_t(cols)); 107 | uniform_distribution(coo.values.begin(), coo.values.end(), value_t(0.0), 108 | value_t(1.0)); 109 | 110 | // Remove duplicates. 111 | coo.remove_duplicates(); 112 | matrix = coo; 113 | } 114 | 115 | } // namespace random 116 | } // namespace generate 117 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/util/launch.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file launch.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Kernel launch C++ related functions. 5 | * @version 0.1 6 | * @date 2022-07-10 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | #include <cuda_runtime.h> 14 | #include <utility> 15 | 16 | namespace loops { 17 | namespace launch { 18 | namespace detail { 19 | inline void for_each_argument_address(void**) {} 20 | 21 | template <typename arg_t, typename... args_t> 22 | inline void for_each_argument_address(void** collected_addresses, 23 | arg_t&& arg, 24 | args_t&&... args) { 25 | collected_addresses[0] = const_cast<void*>(static_cast<const void*>(&arg)); 26 | for_each_argument_address(collected_addresses + 1, 27 | ::std::forward<args_t>(args)...); 28 | } 29 | } // namespace detail 30 | 31 | template <typename func_t, typename... args_t> 32 | void cooperative(cudaStream_t stream, 33 | const func_t& kernel, 34 | std::size_t number_of_blocks, 35 | std::size_t threads_per_block, 36 | args_t&&... args) { 37 | constexpr const auto non_zero_num_params = 38 | sizeof...(args_t) == 0 ? 1 : sizeof...(args_t); 39 | void* argument_ptrs[non_zero_num_params]; 40 | detail::for_each_argument_address(argument_ptrs, 41 | ::std::forward<args_t>(args)...); 42 | cudaLaunchCooperativeKernel( 43 | &kernel, number_of_blocks, threads_per_block, argument_ptrs, 0, stream); 44 | } 45 | 46 | /** 47 | * @brief Launch a kernel. 48 | * 49 | * @par Overview 50 | * This function is modeled after `std::apply`, and allows 51 | * launching CUDA kernels with an explicit stream, grid and block 52 | * configuration passed alongside the kernel arguments. It follows the "possible implementation" of 53 | * `std::apply` in the C++ reference: 54 | * https://en.cppreference.com/w/cpp/utility/apply. 55 | * 56 | * @tparam func_t The type of the kernel function being passed in. 57 | * @tparam args_t The types of the arguments being 58 | * passed in. 59 | * @param kernel Kernel function to call. 60 | * @param args Pack of arguments to be forwarded as the arguments of 61 | * the kernel function. 62 | * @param stream CUDA stream used to launch the kernel; grid and block 63 | * dimensions are given by number_of_blocks and threads_per_block. 64 | * \return void 65 | */ 66 | template <typename func_t, typename... args_t> 67 | void non_cooperative(cudaStream_t stream, 68 | const func_t& kernel, 69 | dim3 number_of_blocks, 70 | dim3 threads_per_block, 71 | args_t&&...
args) { 72 | kernel<<<number_of_blocks, threads_per_block, 0, stream>>>( 73 | std::forward<args_t>(args)...); 74 | } 75 | } // namespace launch 76 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/util/math.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file math.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Math related utilities. 5 | * @version 0.1 6 | * @date 2022-02-16 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | #pragma once 12 | 13 | namespace loops { 14 | namespace math { 15 | 16 | /** 17 | * @brief Simple safe ceiling division: computes n / d rounded up, written as 18 | * n / d + (n % d != 0) to avoid the overflow of the usual (n + d - 1) / d form. 19 | * 20 | * @tparam numerator_t Type of the dividend. 21 | * @tparam denominator_t Type of the divisor. 22 | * @param n Dividend. 23 | * @param d Divisor. 24 | * @return The quotient. 25 | */ 26 | template <typename numerator_t, typename denominator_t> 27 | __host__ __device__ __forceinline__ constexpr numerator_t ceil_div( 28 | numerator_t const& n, 29 | denominator_t const& d) { 30 | return static_cast<numerator_t>(n / d + (n % d != 0 ? 1 : 0)); 31 | } 32 | 33 | } // namespace math 34 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/util/sample.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file sample.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Small sample matrices for testing. 5 | * @version 0.1 6 | * @date 2022-07-19 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include <loops/container/csr.hxx> 15 | #include <loops/memory.hxx> 16 | 17 | namespace loops { 18 | namespace sample { 19 | 20 | using namespace memory; 21 | 22 | /** 23 | * @brief Returns a small sample CSR matrix of size 4x4 with 4 nonzeros. 24 | * 25 | * @par Overview 26 | * 27 | * **Logical Matrix Representation** 28 | * \code 29 | * r/c 0 1 2 3 30 | * 0 [ 0 0 0 0 ] 31 | * 1 [ 5 8 0 0 ] 32 | * 2 [ 0 0 3 0 ] 33 | * 3 [ 0 6 0 0 ] 34 | * \endcode 35 | * 36 | * **Logical Graph Representation** 37 | * \code 38 | * (i, j) [w] 39 | * (1, 0) [5] 40 | * (1, 1) [8] 41 | * (2, 2) [3] 42 | * (3, 1) [6] 43 | * \endcode 44 | * 45 | * **CSR Matrix Representation** 46 | * \code 47 | * VALUES = [ 5 8 3 6 ] 48 | * COLUMN_INDEX = [ 0 1 2 1 ] 49 | * ROW_OFFSETS = [ 0 0 2 3 4 ] 50 | * \endcode 51 | * 52 | * @tparam space Memory space of the CSR matrix. 53 | * @tparam index_t Type of vertex. 54 | * @tparam offset_t Type of edge. 55 | * @tparam value_t Type of weight. 56 | * @return csr_t CSR matrix. 57 | */ 58 | template <memory_space_t space, 59 | typename index_t, 60 | typename offset_t, 61 | typename value_t> 62 | csr_t csr() { 63 | csr_t matrix(4, 4, 4); 64 | 65 | // Row Offsets 66 | matrix.offsets[0] = 0; 67 | matrix.offsets[1] = 0; 68 | matrix.offsets[2] = 2; 69 | matrix.offsets[3] = 3; 70 | matrix.offsets[4] = 4; 71 | 72 | // Column Indices 73 | matrix.indices[0] = 0; 74 | matrix.indices[1] = 1; 75 | matrix.indices[2] = 2; 76 | matrix.indices[3] = 1; 77 | 78 | // Non-zero values 79 | matrix.values[0] = 5; 80 | matrix.values[1] = 8; 81 | matrix.values[2] = 3; 82 | matrix.values[3] = 6; 83 | 84 | if (space == memory_space_t::host) { 85 | return matrix; 86 | } else { 87 | csr_t d_matrix(matrix); 88 | return d_matrix; 89 | } 90 | } 91 | 92 | } // namespace sample 93 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/util/search.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file search.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Simple search functionality.
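 *
 * @par Merge-path intuition
 * An illustrative sketch (values are made up) of what the 2D binary search
 * in this file computes:
 * \code
 * // a = { 1, 3, 5 }, b = { 2, 4 }, diagonal = 3.
 * // The merged order is 1, 2, 3, 4, 5; its first 3 items consume 2 elements
 * // of a and 1 of b, so the search returns the coordinate (x, y) = (2, 1),
 * // where x counts items taken from a and y = diagonal - x items from b.
 * \endcode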
5 | * @version 0.1 6 | * @date 2022-11-12 7 | * 8 | * @copyright Copyright (c) 2022 9 | * 10 | */ 11 | 12 | #include <thrust/binary_search.h> 13 | #include <thrust/distance.h> 14 | #include <thrust/execution_policy.h> 15 | #include <thrust/functional.h> 16 | #include <thrust/iterator/counting_iterator.h> 17 | 18 | #include <loops/container/coordinate.hxx> 19 | 20 | #pragma once 21 | 22 | namespace loops { 23 | namespace search { 24 | /** 25 | * @brief Thrust based 2D binary-search for merge-path algorithm. 26 | * 27 | * @param diagonal Diagonal of the search. 28 | * @param a First iterator. 29 | * @param b Second iterator. 30 | * @param a_len Length of the first iterator. 31 | * @param b_len Length of the second iterator. 32 | * @return The coordinate at which the diagonal intersects the merge path. 33 | */ 34 | template <typename offset_t, typename xit_t, typename yit_t> 35 | __device__ __forceinline__ auto _binary_search(const offset_t& diagonal, 36 | const xit_t a, 37 | const yit_t b, 38 | const offset_t& a_len, 39 | const offset_t& b_len) { 40 | using coord_idx_t = unsigned int; 41 | /// Diagonal search range (in x-coordinate space) 42 | /// Note that the subtraction can result in a negative number, in which 43 | /// case the max clamps it to 0. If we used offset_t here and it happened 44 | /// to be an unsigned type, the negative intermediate would wrap around 45 | /// and produce an incorrect search range. 46 | int x_min = max(int(diagonal) - int(b_len), int(0)); 47 | int x_max = min(int(diagonal), int(a_len)); 48 | 49 | auto it = thrust::lower_bound( 50 | thrust::seq, // Sequential impl 51 | thrust::counting_iterator<offset_t>(x_min), // Start iterator @x_min 52 | thrust::counting_iterator<offset_t>(x_max), // End iterator @x_max 53 | diagonal, // Value to search for. 54 | [=] __device__(const offset_t& idx, const offset_t& diagonal) { 55 | return a[idx] <= b[diagonal - idx - 1]; 56 | }); 57 | 58 | return coordinate_t{static_cast<coord_idx_t>(min(*it, a_len)), 59 | static_cast<coord_idx_t>(diagonal - *it)}; 60 | } 61 | } // namespace search 62 | } // namespace loops -------------------------------------------------------------------------------- /include/loops/util/timer.hxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file timer.hxx 3 | * @author Muhammad Osama (mosama@ucdavis.edu) 4 | * @brief Simple timer utility for device side code. 5 | * @version 0.1 6 | * @date 2020-10-05 7 | * 8 | * @copyright Copyright (c) 2020 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include <cuda_runtime.h> 15 | 16 | namespace loops { 17 | namespace util { 18 | 19 | struct timer_t { 20 | float time; 21 | 22 | timer_t() { 23 | cudaEventCreate(&start_); 24 | cudaEventCreate(&stop_); 25 | cudaEventRecord(start_); 26 | } 27 | 28 | ~timer_t() { 29 | cudaEventDestroy(start_); 30 | cudaEventDestroy(stop_); 31 | } 32 | 33 | // Aliases of each other; start the timer. 34 | void begin() { cudaEventRecord(start_); } 35 | void start() { this->begin(); } 36 | 37 | // Aliases of each other; stop the timer.
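  // end() records the stop event, synchronizes on it, and caches the elapsed
  // time in milliseconds. A typical usage sketch (illustrative only):
  //   loops::util::timer_t t;
  //   t.start();
  //   // ... launch kernel(s), e.g. my_kernel<<<grid, block>>>(...); ...
  //   float ms = t.stop();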
38 | float end() { 39 | cudaEventRecord(stop_); 40 | cudaEventSynchronize(stop_); 41 | cudaEventElapsedTime(&time, start_, stop_); 42 | 43 | return milliseconds(); 44 | } 45 | float stop() { return this->end(); } 46 | 47 | float seconds() { return time * 1e-3; } 48 | float milliseconds() { return time; } 49 | 50 | private: 51 | cudaEvent_t start_, stop_; 52 | }; 53 | 54 | } // namespace util 55 | } // namespace loops -------------------------------------------------------------------------------- /plots/performance_evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4a4cbe1e", 6 | "metadata": {}, 7 | "source": [ 8 | "# Performance Evaluation (Visualization)\n", 9 | "\n", 10 | "This python notebook attempts to recreate all the graphs found in our paper submission. Along with the notebook, we provide a `requirements.txt` file to install all the required python/visualization tools. Simply run `pip install -r requirements.txt` to install them, and then run the following kernels." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "77e1801e", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import altair as alt\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "from vega_datasets import data\n", 24 | "from altair_saver import save\n", 25 | "import os\n", 26 | "import sys" 27 | ] 28 | }, 29 | { 30 | "attachments": {}, 31 | "cell_type": "markdown", 32 | "id": "d348dd26", 33 | "metadata": {}, 34 | "source": [ 35 | "## Path to generated data\n", 36 | "Currently points to: `/plots/data` with all the generated results in csv files." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "7d77185a", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "data_dir = os.path.abspath(\"data\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "bc9433f3", 52 | "metadata": {}, 53 | "source": [ 54 | "## Figure 2. SpMV runtime comparison versus CUB\n", 55 | "Recreates the overhead analysis of SpMV merge-path versus CUB's SpMV implementation." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "673c482d", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "versus_cub = os.path.join(data_dir, \"versus_cub.csv\")\n", 66 | "df_overhead = pd.read_csv(versus_cub)\n", 67 | "\n", 68 | "df_overhead[\"nnzs\"] = df_overhead[\"nnzs\"].astype(int)\n", 69 | "df_overhead[\"elapsed\"] = df_overhead[\"elapsed\"].astype(float)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "b990f3c5", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "alt.data_transformers.disable_max_rows()\n", 80 | "\n", 81 | "overhead = alt.Chart(df_overhead).mark_point(\n", 82 | " opacity=0.6\n", 83 | ").encode(\n", 84 | " x=alt.X(\n", 85 | " \"nnzs:Q\", \n", 86 | " scale=alt.Scale(type=\"log\",domain=(1,1000000000)),\n", 87 | "# axis=alt.Axis(tickCount=10),\n", 88 | " title=\"Number of Nonzeros\"\n", 89 | " ),\n", 90 | " y=alt.Y(\n", 91 | " \"elapsed:Q\", \n", 92 | " scale=alt.Scale(type=\"log\",domain=(0.001,100)),\n", 93 | " axis=alt.Axis(grid=True), \n", 94 | " title=\"Runtime (ms)\"\n", 95 | " ),\n", 96 | " color=alt.Color(\n", 97 | " \"kernel\",\n", 98 | " scale=alt.Scale(range=['#76b900','#DC143C']), # scheme='tableau10'),\n", 99 | " legend=alt.Legend(title=\"Kernel\",orient='top-left',fillColor='#FFFFFF', strokeColor='#E0DCDC', padding=10)\n", 100 | " ),\n", 101 | "# facet=alt.Facet('kernel:O', title=None, columns=2),\n", 102 | " shape=alt.Shape(\n", 103 | " \"kernel\",\n", 104 | " scale=alt.Scale(range=['cross', 'triangle-right']),\n", 105 | " legend=alt.Legend(title=\"Kernel\",orient='top-left')\n", 106 | " ),\n", 107 | ").properties(\n", 108 | " width=700,\n", 109 | " height=500\n", 110 | ").configure_point(\n", 111 | " size=6\n", 112 | ").interactive()\n", 113 | "\n", 114 | "overhead\n", 115 | "# overhead.save(os.path.join(data_dir, \"merge_vs_cub.pdf\"))" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "dd94d5bd", 121 | "metadata": {}, 122 | "source": [ 123 | "## Figure 3. Complete performance landscape\n", 124 | "Recreates the complete performance landscape of our SpMV implementation versus cuSparse." 
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "d09e010d", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "cusparse = pd.read_csv(os.path.join(data_dir, \"cusparse.csv\"))\n", 135 | "merge_path = pd.read_csv(os.path.join(data_dir, \"merge_path.csv\"))\n", 136 | "group_mapped = pd.read_csv(os.path.join(data_dir, \"group_mapped.csv\"))\n", 137 | "thread_mapped = pd.read_csv(os.path.join(data_dir, \"thread_mapped.csv\"))\n", 138 | "\n", 139 | "df_landscape = cusparse.append(merge_path.append(thread_mapped.append(group_mapped)))\n", 140 | "df_landscape[\"nnzs\"] = df_landscape[\"nnzs\"].astype(int)\n", 141 | "df_landscape[\"elapsed\"] = df_landscape[\"elapsed\"].astype(float)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "804f1553", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "alt.data_transformers.disable_max_rows()\n", 152 | "\n", 153 | "landscape = alt.Chart(df_landscape).mark_point(\n", 154 | " opacity=0.7\n", 155 | ").encode(\n", 156 | " x=alt.X(\n", 157 | " \"nnzs:Q\", \n", 158 | " scale=alt.Scale(type=\"log\",domain=(1,1000000000)),\n", 159 | "# axis=alt.Axis(tickCount=10),\n", 160 | " title=\"Number of Nonzeros\"\n", 161 | " ),\n", 162 | " y=alt.Y(\n", 163 | " \"elapsed:Q\", \n", 164 | " scale=alt.Scale(type=\"log\",domain=(0.001,1000)),\n", 165 | " axis=alt.Axis(grid=True), \n", 166 | " title=\"Runtime (ms)\"\n", 167 | " ),\n", 168 | " color=alt.Color(\n", 169 | " \"kernel:O\",\n", 170 | " scale=alt.Scale(range=['#76b900','#0047AB','#DC143C','#B983FF']), #scheme='tableau10'),\n", 171 | " legend=alt.Legend(title=\"Kernel\") #,orient='top-left')\n", 172 | " ),\n", 173 | " shape=alt.Shape(\n", 174 | " \"kernel:O\",\n", 175 | " legend=alt.Legend(title=\"Kernel\",orient='top-left',fillColor='#FFFFFF', strokeColor='#E0DCDC', padding=10)\n", 176 | " ),\n", 177 | " facet=alt.Facet('kernel:N', title=None, spacing={'row':0}, header=alt.Header(labelExpr=\"''\", title=None), columns=2),\n", 178 | ").properties(\n", 179 | " width=400,\n", 180 | " height=400\n", 181 | ").configure_point(\n", 182 | " size=7\n", 183 | ").interactive()\n", 184 | "\n", 185 | "landscape\n", 186 | "# landscape.save(os.path.join(data_dir, \"performance_landscape.pdf\"))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "cdd4cf5a", 192 | "metadata": {}, 193 | "source": [ 194 | "## Figure 4. Heuristics Selection (Speedup)\n", 195 | "Shows our heuristics (presented in the paper) result and also the oracle heuristics." 
196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "021bbb08", 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "df_speed_up = pd.read_csv(os.path.join(data_dir, \"heuristics.csv\"))" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "de208c19", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "speed_up = alt.Chart(df_speed_up).mark_point(\n", 216 | " opacity=0.7\n", 217 | ").encode(\n", 218 | " x=alt.X(\n", 219 | " \"nnzs:Q\", \n", 220 | " scale=alt.Scale(type=\"log\",domain=(1,1000000000)),\n", 221 | "# axis=alt.Axis(tickCount=10),\n", 222 | " title=\"Number of Nonzeros\"\n", 223 | " ),\n", 224 | " y=alt.Y(\n", 225 | " \"speed-up:Q\", \n", 226 | " scale=alt.Scale(type=\"log\",domain=(0.1,100)),\n", 227 | " axis=alt.Axis(grid=True), \n", 228 | " title=\"Speedup w.r.t cuSparse\"\n", 229 | " ),\n", 230 | " color=alt.Color(\n", 231 | " \"kernel\",\n", 232 | " scale=alt.Scale(range=['#0047AB','#DC143C','#B983FF']),\n", 233 | " legend=alt.Legend(title=\"Kernel\",orient='top-left',fillColor='#FFFFFF', strokeColor='#E0DCDC', padding=10)\n", 234 | " ),\n", 235 | " shape=alt.Shape(\n", 236 | " \"kernel\",\n", 237 | " scale=alt.Scale(range=['square', 'triangle', 'cross']),\n", 238 | " legend=alt.Legend(title=\"Kernel\") #,orient='top-left')\n", 239 | " ),\n", 240 | ").properties(\n", 241 | " width=600,\n", 242 | " height=400\n", 243 | ")\n", 244 | "\n", 245 | "speed_up_oracle = alt.Chart(df_speed_up).mark_point(\n", 246 | " opacity=0.7\n", 247 | ").encode(\n", 248 | " x=alt.X(\n", 249 | " \"nnzs:Q\", \n", 250 | " scale=alt.Scale(type=\"log\",domain=(1,1000000000)),\n", 251 | "# axis=alt.Axis(tickCount=10),\n", 252 | " title=\"Number of Nonzeros\"\n", 253 | " ),\n", 254 | " y=alt.Y(\n", 255 | " \"oracle-speed-up:Q\", \n", 256 | " scale=alt.Scale(type=\"log\",domain=(0.1,100)),\n", 257 | " axis=alt.Axis(grid=True), \n", 258 | " title=\"Oracle Speedup w.r.t cuSparse\"\n", 259 | " ),\n", 260 | " color=alt.Color(\n", 261 | " \"oracle-speed-up-kernel\",\n", 262 | " scale=alt.Scale(range=['#0047AB','#DC143C','#B983FF']),\n", 263 | " legend=alt.Legend(title=\"Kernel\",orient='top-left',fillColor='#FFFFFF', strokeColor='#E0DCDC', padding=10)\n", 264 | " ),\n", 265 | " shape=alt.Shape(\n", 266 | " \"oracle-speed-up-kernel\",\n", 267 | " scale=alt.Scale(range=['square', 'triangle', 'cross']),\n", 268 | " legend=alt.Legend(title=\"Kernel\") #,orient='top-left')\n", 269 | " ),\n", 270 | ").properties(\n", 271 | " width=600,\n", 272 | " height=400\n", 273 | ")\n", 274 | "\n", 275 | "\n", 276 | "\n", 277 | "# speed_up\n", 278 | "speed_up | speed_up_oracle\n", 279 | "# speed_up.save(os.path.join(data_dir, \"speed_up.pdf\"))\n", 280 | "# speed_up_oracle.save(os.path.join(data_dir, \"speed_up_oracle.pdf\"))" 281 | ] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3 (ipykernel)", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.7.15" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 5 305 | } 306 | -------------------------------------------------------------------------------- /plots/requirements.txt: 
-------------------------------------------------------------------------------- 1 | altair 2 | numpy 3 | pandas 4 | vega_datasets 5 | altair_saver 6 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Script for producing results. 2 | 3 | >⚡Warning! A complete run can take multiple **days** and requires a system with an NVIDIA GPU. 4 | 5 | Once the datasets are downloaded, modify the `run.sh` script to point to the correct paths and simply execute it using `./run.sh`. Pre-collected results for the algorithms within this library are included under the `docs` directory. Note that `cusparse` is *not* part of our framework. -------------------------------------------------------------------------------- /scripts/format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # By Gabriel Staples 4 | # For documentation on this file, see my answer here: 5 | # [my answer] How to call clang-format over a cpp project folder?: 6 | # https://stackoverflow.com/a/65988393/4561887 7 | 8 | # See my ans: https://stackoverflow.com/a/60157372/4561887 9 | INPUT_PATH_TO_FILES=${1:-"./"} 10 | FULL_PATH_TO_SCRIPT="$(realpath "$INPUT_PATH_TO_FILES")" 11 | SCRIPT_DIRECTORY="$(dirname "$FULL_PATH_TO_SCRIPT")" 12 | 13 | RETURN_CODE_SUCCESS=0 14 | RETURN_CODE_ERROR=1 15 | 16 | # Find all files in SCRIPT_DIRECTORY with one of these extensions 17 | FILE_LIST="$(find "$SCRIPT_DIRECTORY" | grep -E ".*\.(hxx|cxx|cuh|cu|ino|cpp|c|h|hpp|hh)$")" 18 | # echo "\"$FILE_LIST\"" # debugging 19 | # split into an array; see my ans: https://stackoverflow.com/a/71575442/4561887 20 | # mapfile -t FILE_LIST_ARRAY <<< "$FILE_LIST" 21 | IFS=$'\n' read -r -d '' -a FILE_LIST_ARRAY <<< "$FILE_LIST" 22 | 23 | num_files="${#FILE_LIST_ARRAY[@]}" 24 | echo -e "$num_files files found to format:" 25 | if [ "$num_files" -eq 0 ]; then 26 | echo "Nothing to do." 27 | exit $RETURN_CODE_SUCCESS 28 | fi 29 | 30 | # print the list of all files 31 | for i in "${!FILE_LIST_ARRAY[@]}"; do 32 | file="${FILE_LIST_ARRAY["$i"]}" 33 | printf " %2i: %s\n" $((i + 1)) "$file" 34 | done 35 | echo "" 36 | 37 | format_files="false" 38 | # See: https://stackoverflow.com/a/226724/4561887 39 | case "$2" in 40 | [Yy]* ) format_files="true" 41 | ;; 42 | *) 43 | read -p "Do you wish to auto-format all of these files [y/N] " user_response 44 | case "$user_response" in 45 | [Yy]* ) format_files="true" 46 | esac 47 | esac 48 | 49 | if [ "$format_files" = "false" ]; then 50 | echo "Aborting." 51 | exit $RETURN_CODE_SUCCESS 52 | fi 53 | 54 | # Format each file. 55 | clang-format --verbose -i --style=file "${FILE_LIST_ARRAY[@]}" -------------------------------------------------------------------------------- /scripts/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Warning, a complete run can take days. 3 | DATASET_DIR="$HOME/suitesparse" 4 | DATASET_FILES_NAMES="../datasets/suitesparse.txt" 5 | BINARY="../build/bin" 6 | 7 | SPMV[0]="work_oriented" 8 | SPMV[1]="group_mapped" 9 | SPMV[2]="thread_mapped" 10 | SPMV[3]="original" 11 | SPMV[4]="merge_path" 12 | 13 | EXE_PREFIX="loops.spmv" 14 | 15 | for i in {0..4} 16 | do 17 | echo "kernel,dataset,rows,cols,nnzs,elapsed" >> spmv_log.${SPMV[$i]}.csv 18 | count=0 19 | while read -r DATA 20 | do 21 | # Run only 10 datasets (EDIT this to run more.)
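# Each pass of the inner loop runs the selected SpMV binary on one matrix and
# appends its CSV output, e.g. (illustrative):
#   ../build/bin/loops.spmv.work_oriented -m $HOME/suitesparse/<matrix>.mtx >> spmv_log.work_oriented.csv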
22 | if [[ $count -eq 10 ]] 23 | then 24 | break 25 | fi 26 | ((count++)) 27 | echo $BINARY/$EXE_PREFIX.${SPMV[$i]} -m $DATASET_DIR/$DATA 28 | timeout 60 $BINARY/$EXE_PREFIX.${SPMV[$i]} -m $DATASET_DIR/$DATA >> spmv_log.${SPMV[$i]}.csv 29 | done < "$DATASET_FILES_NAMES" 30 | done 31 | -------------------------------------------------------------------------------- /unittests/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunrock/loops/6169cf64d06e17b24b7a687fe0baf7ba2347002b/unittests/CMakeLists.txt --------------------------------------------------------------------------------