├── .codedocs
├── .github
│   └── FUNDING.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmake
│   └── Modules
│       ├── CompileWithWarnings.cmake
│       ├── DocTest.cmake
│       └── Findcuda-nvtx.cmake
├── docs
│   └── cppreference-doxygen-web.tag.xml
├── doxygen.cfg
├── external
│   └── doctest
│       └── doctest.h
├── src
│   ├── cuda-kat.cuh
│   └── kat
│       ├── common.hpp
│       ├── containers
│       │   ├── array.hpp
│       │   ├── detail
│       │   │   └── normal_iterator.hpp
│       │   └── span.hpp
│       ├── detail
│       │   ├── constexpr_by_cpp_version.hpp
│       │   ├── execution_space_specifiers.hpp
│       │   ├── integer_sequence.hpp
│       │   ├── pointers.cuh
│       │   └── range_access.hpp
│       ├── on_device
│       │   ├── atomics.cuh
│       │   ├── builtins.cuh
│       │   ├── c_standard_library
│       │   │   └── string.cuh
│       │   ├── collaboration
│       │   │   ├── block.cuh
│       │   │   ├── grid.cuh
│       │   │   └── warp.cuh
│       │   ├── common.cuh
│       │   ├── constexpr_math.cuh
│       │   ├── detail
│       │   │   ├── atomics.cuh
│       │   │   ├── atomics
│       │   │   │   └── missing_in_cuda.cuh
│       │   │   ├── builtins.cuh
│       │   │   ├── itoa.cuh
│       │   │   └── shuffle.cuh
│       │   ├── grid_info.cuh
│       │   ├── math.cuh
│       │   ├── miscellany.cuh
│       │   ├── non-builtins.cuh
│       │   ├── ptx.cuh
│       │   ├── ptx
│       │   │   ├── detail
│       │   │   │   ├── define_macros.cuh
│       │   │   │   └── undefine_macros.cuh
│       │   │   ├── miscellany.cuh
│       │   │   ├── special_registers.cuh
│       │   │   └── video_instructions.cuh
│       │   ├── sequence_ops
│       │   │   ├── block.cuh
│       │   │   ├── common.cuh
│       │   │   ├── grid.cuh
│       │   │   └── warp.cuh
│       │   ├── shared_memory.cuh
│       │   ├── shared_memory
│       │   │   ├── basic.cuh
│       │   │   └── operations.cuh
│       │   ├── shuffle.cuh
│       │   ├── streams
│       │   │   ├── prefix_generators.cuh
│       │   │   ├── printfing_ostream.cuh
│       │   │   └── stringstream.cuh
│       │   └── time.cuh
│       ├── reference_wrapper.hpp
│       ├── tuple.hpp
│       └── utility.hpp
└── tests
    ├── CMakeLists.txt
    ├── array.cu
    ├── atomics.cu
    ├── block_collaboration.cu
    ├── builtins.cu
    ├── c_string.cu
    ├── common.cuh
    ├── constexpr_math.cu
    ├── grid_collaboration.cu
    ├── math.cu
    ├── miscellany.cu
    ├── printing.cu
    ├── sequence_ops.cu
    ├── shared_memory.cu
    ├── shuffle.cu
    ├── span.cu
    ├── time.cu
    ├── tuple.cu
    ├── util
    │   ├── cpu_builtin_equivalents.hpp
    │   ├── macro.h
    │   ├── miscellany.cuh
    │   ├── poor_mans_constexpr_string.hpp
    │   ├── prettyprint.hpp
    │   ├── printing.hpp
    │   ├── random.cu
    │   ├── random.hpp
    │   ├── type_name.hpp
    │   ├── woodruff_int128_t.hpp
    │   └── woodruff_uint128_t.hpp
    └── warp_collaboration.cu

/.codedocs:
--------------------------------------------------------------------------------
1 | # CodeDocs.xyz Configuration File
2 | #
3 | # Rename this example to '.codedocs' and put it in the root directory of your
4 | # repository. This file is optional, documentation will still be generated
5 | # without it using sensible defaults.
6 | 
7 | #---------------------------------------------------------------------------
8 | # CodeDocs Configuration
9 | #---------------------------------------------------------------------------
10 | 
11 | # Include the Doxygen configuration from another file.
12 | # The file must be a relative path with respect to the root of the repository.
13 | # If any of the options in this doxyfile include a path (ie, INPUT), these
14 | # paths will be considered relative to the root of the repository, not the
15 | # location of the DOXYFILE.
16 | 
17 | DOXYFILE = doxygen.cfg
18 | 
19 | # Specify external repository to link documentation with.
20 | # This is similar to Doxygen's TAGFILES option, but will automatically link to
21 | # tags of other repositories already using CodeDocs. List each repository to
22 | # link with by giving its location in the form of owner/repository.
23 | # For example:
24 | #   TAGLINKS = doxygen/doxygen CodeDocs/osg
25 | # Note: these repositories must already be built on CodeDocs.
26 | 
27 | TAGLINKS = 
28 | 
29 | #---------------------------------------------------------------------------
30 | # Doxygen Configuration
31 | #---------------------------------------------------------------------------
32 | 
33 | # Doxygen configuration may also be placed in this file.
34 | # Currently, the following Doxygen configuration options are available. Refer
35 | # to http://doxygen.org/manual/config.html for detailed explanation of the
36 | # options. To request support for more options, contact support@codedocs.xyz.
37 | #
38 | # ABBREVIATE_BRIEF =
39 | # ALIASES =
40 | # ALLEXTERNALS =
41 | # ALLOW_UNICODE_NAMES =
42 | # ALPHABETICAL_INDEX =
43 | # ALWAYS_DETAILED_SEC =
44 | # AUTOLINK_SUPPORT =
45 | # BRIEF_MEMBER_DESC =
46 | # BUILTIN_STL_SUPPORT =
47 | # CALLER_GRAPH =
48 | # CALL_GRAPH =
49 | # CASE_SENSE_NAMES =
50 | # CITE_BIB_FILES =
51 | # CLASS_DIAGRAMS =
52 | # CLASS_GRAPH =
53 | # COLLABORATION_GRAPH =
54 | # COLS_IN_ALPHA_INDEX =
55 | # CPP_CLI_SUPPORT =
56 | # DIAFILE_DIRS =
57 | # DIRECTORY_GRAPH =
58 | # DISABLE_INDEX =
59 | # DISTRIBUTE_GROUP_DOC =
60 | # DOTFILE_DIRS =
61 | # DOT_FONTNAME =
62 | # DOT_FONTSIZE =
63 | # DOT_GRAPH_MAX_NODES =
64 | # DOT_IMAGE_FORMAT =
65 | # DOT_TRANSPARENT =
66 | # DOXYFILE_ENCODING =
67 | # ENABLED_SECTIONS =
68 | # ENABLE_PREPROCESSING =
69 | # ENUM_VALUES_PER_LINE =
70 | # EXAMPLE_PATH =
71 | # EXAMPLE_PATTERNS =
72 | # EXAMPLE_RECURSIVE =
73 | # EXCLUDE =
74 | # EXCLUDE_PATTERNS =
75 | # EXCLUDE_SYMBOLS =
76 | # EXPAND_AS_DEFINED =
77 | # EXPAND_ONLY_PREDEF =
78 | # EXTENSION_MAPPING =
79 | # EXTERNAL_GROUPS =
80 | # EXTERNAL_PAGES =
81 | # EXTRACT_ALL =
82 | # EXTRACT_ANON_NSPACES =
83 | # EXTRACT_LOCAL_CLASSES =
84 | # EXTRACT_LOCAL_METHODS =
85 | # EXTRACT_PACKAGE =
86 | # EXTRACT_PRIVATE =
87 | # EXTRACT_STATIC =
88 | # EXT_LINKS_IN_WINDOW =
89 | # FILE_PATTERNS =
90 | # FORCE_LOCAL_INCLUDES =
91 | # FORMULA_FONTSIZE =
92 | # FORMULA_TRANSPARENT =
93 | # FULL_PATH_NAMES =
94 | # GENERATE_BUGLIST =
95 | # GENERATE_DEPRECATEDLIST =
96 | # GENERATE_LEGEND =
97 | # GENERATE_TESTLIST =
98 | # GENERATE_TODOLIST =
99 | # GENERATE_TREEVIEW =
100 | # GRAPHICAL_HIERARCHY =
101 | # GROUP_GRAPHS =
102 | # GROUP_NESTED_COMPOUNDS =
103 | # HIDE_COMPOUND_REFERENCE= =
104 | # HIDE_FRIEND_COMPOUNDS =
105 | # HIDE_IN_BODY_DOCS =
106 | # HIDE_SCOPE_NAMES =
107 | # HIDE_UNDOC_CLASSES =
108 | # HIDE_UNDOC_MEMBERS =
109 | # HIDE_UNDOC_RELATIONS =
110 | # HTML_COLORSTYLE_GAMMA =
111 | # HTML_COLORSTYLE_HUE =
112 | # HTML_COLORSTYLE_SAT =
113 | # HTML_DYNAMIC_SECTIONS =
114 | # HTML_EXTRA_FILES =
115 | # HTML_EXTRA_STYLESHEET =
116 | # HTML_FOOTER =
117 | # HTML_HEADER =
118 | # HTML_INDEX_NUM_ENTRIES =
119 | # HTML_STYLESHEET =
120 | # HTML_TIMESTAMP =
121 | # IDL_PROPERTY_SUPPORT =
122 | # IGNORE_PREFIX =
123 | # IMAGE_PATH =
124 | # INCLUDED_BY_GRAPH =
125 | # INCLUDE_FILE_PATTERNS =
126 | # INCLUDE_GRAPH =
127 | # INCLUDE_PATH =
128 | # INHERIT_DOCS =
129 | # INLINE_GROUPED_CLASSES =
130 | # INLINE_INFO =
131 | # INLINE_INHERITED_MEMB =
132 | # INLINE_SIMPLE_STRUCTS =
133 | # INLINE_SOURCES =
134 | # INPUT =
135 | # INPUT_ENCODING =
136 | # INTERACTIVE_SVG =
137 | # INTERNAL_DOCS =
138 | # JAVADOC_AUTOBRIEF =
139 | # LAYOUT_FILE =
140 | # MACRO_EXPANSION =
141 | # MARKDOWN_SUPPORT =
142 | # MAX_DOT_GRAPH_DEPTH =
143 | # MSCFILE_DIRS =
144 | # MULTILINE_CPP_IS_BRIEF =
145 | # OPTIMIZE_FOR_FORTRAN =
146 | # OPTIMIZE_OUTPUT_FOR_C =
147 | # OPTIMIZE_OUTPUT_JAVA =
148 | # OPTIMIZE_OUTPUT_VHDL =
149 | # OUTPUT_LANGUAGE =
150 | # PLANTUML_JAR_PATH =
151 | # PREDEFINED =
152 | # PROJECT_BRIEF =
153 | # PROJECT_LOGO =
154 | # PROJECT_NAME =
155 | # PROJECT_NUMBER =
156 | # QT_AUTOBRIEF =
157 | # RECURSIVE =
158 | # REFERENCED_BY_RELATION =
159 | # REFERENCES_LINK_SOURCE =
160 | # REFERENCES_RELATION =
161 | # REPEAT_BRIEF =
162 | # SEARCHENGINE =
163 | # SEARCH_INCLUDES =
164 | # SEPARATE_MEMBER_PAGES =
165 | # SHORT_NAMES =
166 | # SHOW_FILES =
167 | # SHOW_GROUPED_MEMB_INC =
168 | # SHOW_INCLUDE_FILES =
169 | # SHOW_NAMESPACES =
170 | # SHOW_USED_FILES =
171 | # SIP_SUPPORT =
172 | # SKIP_FUNCTION_MACROS =
173 | # SORT_BRIEF_DOCS =
174 | # SORT_BY_SCOPE_NAME =
175 | # SORT_GROUP_NAMES =
176 | # SORT_MEMBERS_CTORS_1ST =
177 | # SORT_MEMBER_DOCS =
178 | # SOURCE_BROWSER =
179 | # SOURCE_TOOLTIPS =
180 | # STRICT_PROTO_MATCHING =
181 | # STRIP_CODE_COMMENTS =
182 | # STRIP_FROM_INC_PATH =
183 | # STRIP_FROM_PATH =
184 | # SUBGROUPING =
185 | # TAB_SIZE =
186 | # TEMPLATE_RELATIONS =
187 | # TREEVIEW_WIDTH =
188 | # TYPEDEF_HIDES_STRUCT =
189 | # UML_LIMIT_NUM_FIELDS =
190 | # UML_LOOK =
191 | # USE_MDFILE_AS_MAINPAGE =
192 | # VERBATIM_HEADERS =
193 | #
194 | 
195 | 
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | patreon: einpoklum
4 | custom: https://paypal.me/eyalroz
5 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | 
3 | # Temporary, cache, swap files
4 | \#\#*
5 | *.swp
6 | *.bkp
7 | 
8 | # Files which "ask" to be hidden
9 | *~
10 | .*
11 | unused/
12 | 
13 | # Build artifacts
14 | *.a
15 | *.o
16 | *.so
17 | *.ptx
18 | bin/*
19 | lib/*
20 | build/*
21 | 
22 | # Core dumps
23 | core
24 | core.*
25 | core-*
26 | 
27 | # CMake & CTest-generated files
28 | CMakeCache.txt
29 | CMakeFiles/*
30 | cmake_install.cmake
31 | CMakeScripts/*
32 | CMakeTmp/*
33 | Makefile
34 | CTestTestfile.cmake
35 | 
36 | # Eclipse IDE-related files
37 | .project
38 | .cproject
39 | .settings
40 | 
41 | # CLion IDE-related files
42 | .idea/
43 | cmake-build-*/
44 | 
45 | # Patching
46 | *.diff
47 | *.rej
48 | *.orig
49 | 
50 | # Files/folders downloaded from other repositories as part of the build
51 | external/*
52 | third-party/*
53 | 
54 | # Miscellaneous
55 | tags
56 | log
57 | *.log
58 | *.v3breakpoints
59 | gmon.out
60 | .DS_Store
61 | 
62 | # Doxygen
63 | doxygen.log
64 | Doxyfile
65 | docs/
66 | 
67 | # Archives
68 | *.zip
69 | *.gz
70 | *.bz2
71 | *.tgz
72 | *.tar
73 | *.xz
74 | 
75 | 
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Note:
3 | #
4 | # cuda-kat is a _header-only_ library. You can't build it, and you don't need
5 | # to run CMake in order to use it or install it. Just add the `src/` directory
6 | # to your include path (or copy its contents to some `include/` directory).
7 | # cuda-kat only depends on having a C++11 compiler and the CUDA toolkit
8 | # installed.
9 | #
10 | # This file is provided mostly in order to build the library unit tests.
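#
# That said, if your project does use CMake, consuming the library after
# installation might look like the following sketch (the `my_kernels` target
# and `kernels.cu` file are hypothetical; the imported target name follows
# from the install() and export commands further down in this file):
#
#     find_package(cuda-kat)
#     add_executable(my_kernels kernels.cu)
#     target_link_libraries(my_kernels PRIVATE cuda-kat::cuda-kat)
#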
11 | 
12 | cmake_minimum_required(VERSION 3.8.2)
13 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
14 | 
15 | project(cuda-kat
16 | 	DESCRIPTION "CUDA kernel author's tools"
17 | 	VERSION 0.1.0
18 | 	HOMEPAGE_URL "https://github.com/eyalroz/cuda-kat"
19 | 	LANGUAGES CXX CUDA)
20 | 
21 | ###############
22 | ##  Modules  ##
23 | ###############
24 | 
25 | # Standard CMake modules
26 | 
27 | # Custom modules
28 | 
29 | ############################
30 | ##  Package dependencies  ##
31 | ############################
32 | 
33 | # cuda-kat can't use the standard library's string formatting and output stream code,
34 | # because most of it is host-side only; and it doesn't make sense to bundle a modified
35 | # half of the standard library just for that. Instead, we use the strf library
36 | # (available at: https://github.com/robhz786/strf )
37 | find_package(strf 0.10.4)
38 | 
39 | ###############
40 | ##  OPTIONS  ##
41 | ###############
42 | 
43 | #message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
44 | 
45 | set(BUILD_TESTS FALSE CACHE BOOL "Build tests for the library")
46 | 
47 | ###############
48 | ##  Targets  ##
49 | ###############
50 | 
51 | add_library(cuda-kat INTERFACE)
52 | target_include_directories(
53 | 	cuda-kat
54 | 	INTERFACE
55 | 	"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>"
56 | 	"$<INSTALL_INTERFACE:include>"
57 | )
58 | 
59 | 
60 | # TODO: Consider enabling the following command. It helps IDEs
61 | # notice the library's header files even if they're not currently
62 | # in use.
63 | #
64 | #target_sources(cuda-kat
65 | #	src/kat/on_device/time.cuh
66 | #	src/kat/on_device/shared_memory.cuh
67 | #	etc. etc.
68 | 
69 | 
70 | #############
71 | ##  Tests  ##
72 | #############
73 | 
74 | if(BUILD_TESTS)
75 | 	enable_testing()
76 | 	# set(TEST_RUNNER_PARAMS "--force-colors=true" CACHE STRING "Options to add to our test runners commands")
77 | 	add_subdirectory(tests)
78 | endif()
79 | 
80 | 
81 | ####################
82 | ##  Installation  ##
83 | ####################
84 | 
85 | include(GNUInstallDirs)
86 | 
87 | install(
88 | 	TARGETS cuda-kat
89 | 	EXPORT cuda-kat_export
90 | 	INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
91 | )
92 | 
93 | install(
94 | 	DIRECTORY src/kat
95 | 	DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
96 | 	FILES_MATCHING REGEX "\\.(h|hpp|cuh)$"
97 | )
98 | 
99 | install(
100 | 	EXPORT cuda-kat_export
101 | 	DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cuda-kat"
102 | 	NAMESPACE "cuda-kat::"
103 | 	FILE cuda-kat-config.cmake
104 | )
105 | 
106 | include(CMakePackageConfigHelpers)
107 | 
108 | write_basic_package_version_file(
109 | 	"cuda-kat-config-version.cmake"
110 | 	VERSION ${PROJECT_VERSION}
111 | 	COMPATIBILITY SameMinorVersion
112 | )
113 | 
114 | install(
115 | 	FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda-kat-config-version.cmake"
116 | 	DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cuda-kat"
117 | )
118 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2018, Eyal Rozenberg
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
31 | 
--------------------------------------------------------------------------------
/cmake/Modules/CompileWithWarnings.cmake:
--------------------------------------------------------------------------------
1 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
2 | 	set(WARNING_FLAGS "-Wall -Wextra -Wpedantic -Wno-missing-field-initializers")
3 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
4 | 	set(WARNING_FLAGS "-Wall -Wextra -Wpedantic -Wno-missing-field-initializers")
5 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
6 | 	set(WARNING_FLAGS "-w3 -wd1418,2259")
7 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
8 | 	set(WARNING_FLAGS "/W4")
9 | else ()
10 | 	message(WARNING "Unknown compiler - cannot set warning flags")
11 | endif()
12 | 
13 | if(WARNING_FLAGS)
14 | 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}")
15 | endif()
16 | 
--------------------------------------------------------------------------------
/cmake/Modules/DocTest.cmake:
--------------------------------------------------------------------------------
1 | add_library(doctest INTERFACE)
2 | set(DOCTEST_DIR "${PROJECT_SOURCE_DIR}/external/doctest/")
3 | target_sources(doctest INTERFACE ${DOCTEST_DIR}/doctest.h) # Is this needed?
4 | target_include_directories(doctest INTERFACE ${DOCTEST_DIR})
5 | 
--------------------------------------------------------------------------------
/cmake/Modules/Findcuda-nvtx.cmake:
--------------------------------------------------------------------------------
1 | find_library(CUDA_NVTX_LIBRARY
2 | 	NAMES nvToolsExt nvTools nvtoolsext nvtools nvtx NVTX
3 | 	PATHS ${CUDA_TOOLKIT_ROOT_DIR}
4 | 	PATH_SUFFIXES "lib64" "common/lib64" "common/lib" "lib"
5 | 	DOC "Location of the CUDA Toolkit Extension (NVTX) library"
6 | )
7 | mark_as_advanced(CUDA_NVTX_LIBRARY)
8 | 
--------------------------------------------------------------------------------
/src/cuda-kat.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file cuda-kat.cuh Shortcut for including all functionality of the cuda-kat library.
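 *
 * A minimal usage sketch (the kernel shown here is hypothetical, not part of
 * the library):
 *
 *   #include <cuda-kat.cuh>
 *
 *   __global__ void my_kernel()
 *   {
 *       auto global_thread_id = kat::linear_grid::grid_info::thread::global_id();
 *       // ... use any other facility from the kat:: namespace ...
 *   }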
3 |  */
4 | #ifndef CUDA_KAT_CUH_
5 | #define CUDA_KAT_CUH_
6 | 
7 | #include "kat/containers/array.hpp"
8 | #include "kat/on_device/c_standard_library/string.cuh"
9 | #include "kat/on_device/constexpr_math.cuh"
10 | #include "kat/on_device/grid_info.cuh"
11 | #include "kat/on_device/math.cuh"
12 | #include "kat/on_device/miscellany.cuh"
13 | #include "kat/on_device/non-builtins.cuh"
14 | #include "kat/on_device/printing.cuh"
15 | #include "kat/on_device/ptx.cuh"
16 | #include "kat/on_device/shared_memory.cuh"
17 | #include "kat/on_device/unaligned.cuh"
18 | #include "kat/on_device/atomics.cuh"
19 | #include "kat/on_device/builtins.cuh"
20 | #include "kat/on_device/shuffle.cuh"
21 | #include "kat/on_device/collaboration/warp.cuh"
22 | #include "kat/on_device/collaboration/block.cuh"
23 | #include "kat/on_device/collaboration/grid.cuh"
24 | #include "kat/on_device/sequence_ops/warp.cuh"
25 | #include "kat/on_device/sequence_ops/block.cuh"
26 | 
27 | #endif /* CUDA_KAT_CUH_ */
28 | 
--------------------------------------------------------------------------------
/src/kat/common.hpp:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file kat/common.hpp
3 |  *
4 |  * @brief Basic type and macro definitions used throughout the KAT library.
5 |  */
6 | #pragma once
7 | #ifndef CUDA_KAT_COMMON_HPP_
8 | #define CUDA_KAT_COMMON_HPP_
9 | 
10 | #include <cstddef> // for std::size_t
11 | 
12 | #include <type_traits>
13 | #include <kat/detail/constexpr_by_cpp_version.hpp>
14 | 
15 | namespace kat {
16 | 
17 | /**
18 |  * Used throughout the kat library for (non-negative) sizes and lengths
19 |  * of containers, memory regions and so on - on both the host and the device
20 |  * side.
21 |  *
22 |  * @note CUDA isn't explicit about this, but it also uses the standard library's
23 |  * size_t occasionally.
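 *
 * For instance, a device-side function signature might look like this
 * (a hypothetical example, not a declaration from the library):
 *
 *   KAT_FD void fill(int* buffer, kat::size_t length);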
25 |  */
26 | using size_t = std::size_t;
27 | 
28 | #if __cplusplus < 201703L
29 | 
30 | // Some C++17 type traits definable in C++11
31 | 
32 | template <typename...> struct conjunction : std::true_type {};
33 | template <typename B> struct conjunction<B> : B {};
34 | template <typename B, typename... Bs> struct conjunction<B, Bs...> : std::conditional<bool(B::value), conjunction<Bs...>, B>::type {};
35 | 
36 | template <typename...> struct disjunction : std::false_type {};
37 | template <typename B> struct disjunction<B> : B {};
38 | template <typename B, typename... Bs> struct disjunction<B, Bs...> : std::conditional<bool(B::value), B, disjunction<Bs...>>::type {};
39 | 
40 | template <bool B> using bool_constant = std::integral_constant<bool, B>;
41 | 
42 | template <typename B> struct negation : bool_constant<not bool(B::value)> {};
43 | 
44 | #else
45 | 
46 | template <typename... Bs> using conjunction = std::conjunction<Bs...>;
47 | template <typename... Bs> using disjunction = std::disjunction<Bs...>;
48 | template <bool B> using bool_constant = std::bool_constant<B>;
49 | template <typename B> using negation = std::negation<B>;
50 | 
51 | 
52 | #endif
53 | 
54 | 
55 | template <typename T, typename... Ts>
56 | using is_any_of = disjunction<std::is_same<T, Ts>...>;
57 | 
58 | /*
59 | template <typename T, typename U>
60 | struct is_any_of
61 | : std::is_same<T, U> {};
62 | 
63 | template <typename T, typename U1, typename U2>
64 | struct is_any_of
65 | : bool_constant<std::is_same<T, U1>::value or std::is_same<T, U2>::value> {};
66 | 
67 | template <typename T, typename U1, typename U2, typename U3>
68 | struct is_any_of
69 | : bool_constant<std::is_same<T, U1>::value or std::is_same<T, U2>::value or std::is_same<T, U3>::value> {};
70 | 
71 | template <typename T, typename U, typename... Us>
72 | struct is_any_of
73 | : bool_constant<std::is_same<T, U>::value or is_any_of<T, Us...>::value> {};
74 | */
75 | 
76 | } // namespace kat
77 | 
78 | #endif // CUDA_KAT_COMMON_HPP_
--------------------------------------------------------------------------------
/src/kat/detail/constexpr_by_cpp_version.hpp:
--------------------------------------------------------------------------------
1 | 
2 | #ifndef CUDA_KAT_CONSTEXPR_BY_CPP_VERSION_HPP_
3 | #define CUDA_KAT_CONSTEXPR_BY_CPP_VERSION_HPP_
4 | 
5 | ///@cond
6 | 
7 | #if __cplusplus < 201103L
8 | #error "C++11 or newer is required to use this header"
9 | #endif
10 | 
11 | #ifndef CONSTEXPR_SINCE_CPP_14
12 | #if __cplusplus >= 201402L
13 | #define CONSTEXPR_SINCE_CPP_14 constexpr
14 | #else
15 | #define CONSTEXPR_SINCE_CPP_14
16 | #endif
17 | #endif
18 | 
19 | #ifndef CONSTEXPR_SINCE_CPP_17
20 | #if __cplusplus >= 201703L
21 | #define CONSTEXPR_SINCE_CPP_17 constexpr
22 | #else
23 | #define CONSTEXPR_SINCE_CPP_17
24 | #endif
25 | #endif
26 | 
27 | ///@endcond
28 | 
29 | #endif // CUDA_KAT_CONSTEXPR_BY_CPP_VERSION_HPP_
--------------------------------------------------------------------------------
/src/kat/detail/execution_space_specifiers.hpp:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file execution_space_specifiers.hpp
3 |  *
4 |  * @brief Some functions need a specification of their appropriate execution space
5 |  * w.r.t. the CUDA device side vs. the host side, as well as of their inlining
6 |  * requirement. For brevity, we introduce shorthands for these.
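 *
 * For example, a function defined as follows (the function itself is
 * hypothetical) compiles as `__forceinline__ __host__ __device__` under nvcc,
 * and as plain `inline` in non-CUDA translation units:
 *
 *   template <typename T>
 *   KAT_FHD T twice(T x) { return x + x; }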
7 |  */
8 | 
9 | #ifndef EXECUTION_SPACE_SPECIFIERS_HPP_
10 | #define EXECUTION_SPACE_SPECIFIERS_HPP_
11 | 
12 | ///@cond
13 | 
14 | #ifdef __CUDACC__
15 | 
16 | #ifndef KAT_FD
17 | #define KAT_FD __forceinline__ __device__
18 | #endif
19 | 
20 | #ifndef KAT_FH
21 | #define KAT_FH __forceinline__ __host__
22 | #endif
23 | 
24 | #ifndef KAT_FHD
25 | #define KAT_FHD __forceinline__ __host__ __device__
26 | #endif
27 | 
28 | #ifndef KAT_ID
29 | #define KAT_ID inline __device__
30 | #endif
31 | 
32 | #ifndef KAT_IH
33 | #define KAT_IH inline __host__
34 | #endif
35 | 
36 | #ifndef KAT_IHD
37 | #define KAT_IHD inline __host__ __device__
38 | #endif
39 | 
40 | #ifndef KAT_HD
41 | #define KAT_HD __host__ __device__
42 | #endif
43 | 
44 | #ifndef KAT_DEV
45 | #define KAT_DEV __device__
46 | #endif
47 | 
48 | #ifndef KAT_HOST
49 | #define KAT_HOST __host__
50 | #endif
51 | 
52 | #else // __CUDACC__
53 | 
54 | #ifndef KAT_FD
55 | #define KAT_FD inline
56 | #endif
57 | 
58 | #ifndef KAT_FH
59 | #define KAT_FH inline
60 | #endif
61 | 
62 | #ifndef KAT_FHD
63 | #define KAT_FHD inline
64 | #endif
65 | 
66 | #ifndef KAT_ID
67 | #define KAT_ID inline
68 | #endif
69 | 
70 | #ifndef KAT_IH
71 | #define KAT_IH inline
72 | #endif
73 | 
74 | #ifndef KAT_IHD
75 | #define KAT_IHD inline
76 | #endif
77 | 
78 | #ifndef KAT_HD
79 | #define KAT_HD
80 | #endif
81 | 
82 | #ifndef KAT_DEV
83 | #define KAT_DEV
84 | #endif
85 | 
86 | #ifndef KAT_HOST
87 | #define KAT_HOST
88 | #endif
89 | 
90 | #endif // __CUDACC__
91 | 
92 | ///@endcond
93 | 
94 | 
95 | #endif // EXECUTION_SPACE_SPECIFIERS_HPP_
--------------------------------------------------------------------------------
/src/kat/detail/integer_sequence.hpp:
--------------------------------------------------------------------------------
1 | ///////////////////////////////////////////////////////////////////////////////
2 | //  Copyright (c) 2018 NVIDIA Corporation
3 | //  Copyright (c) 2015-2018 Bryce Adelstein Lelbach aka wash
4 | //  Copyright (c) 2020 Eyal Rozenberg
5 | //
6 | //  Distributed under the Boost Software License, Version 1.0. (See copy
7 | //  at http://www.boost.org/LICENSE_1_0.txt)
8 | ///////////////////////////////////////////////////////////////////////////////
9 | 
10 | /** @file integer_sequence.hpp
11 |  *
12 |  * @brief An implementation of the C++14 standard library's `integer_sequence`
13 |  * and associated helper aliases plus some extensions. Copied from NVIDIA's
14 |  * thrust library's file `integer_sequence.h`, 2020-03-11.
15 |  */
16 | 
17 | #ifndef CUDA_KAT_INTEGER_SEQUENCE_HPP_
18 | #define CUDA_KAT_INTEGER_SEQUENCE_HPP_
19 | 
20 | #include <kat/detail/execution_space_specifiers.hpp>
21 | 
22 | #include <cstddef>
23 | #include <utility>
24 | #include <type_traits>
25 | 
26 | namespace kat {
27 | 
28 | #if __cplusplus >= 201402L
29 | 
30 | // A compile-time sequence of integral constants of type T.
31 | template <typename T, T... Is>
32 | using integer_sequence = std::integer_sequence<T, Is...>;
33 | 
34 | // A compile-time sequence of size_t constants.
35 | template <std::size_t... Is>
36 | using index_sequence = std::index_sequence<Is...>;
37 | 
38 | // Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
39 | template <typename T, std::size_t N>
40 | using make_integer_sequence = std::make_integer_sequence<T, N>;
41 | 
42 | // Create a new index_sequence with elements 0, 1, 2, ..., N - 1.
43 | template <std::size_t N>
44 | using make_index_sequence = std::make_index_sequence<N>;
45 | 
46 | ///////////////////////////////////////////////////////////////////////////////
47 | 
48 | #else // Older than C++14.
49 | 
50 | // A compile-time sequence of integral constants of type T.
51 | template <typename T, T... Is>
52 | struct integer_sequence;
53 | 
54 | // A compile-time sequence of size_t constants.
55 | template <std::size_t... Is>
56 | using index_sequence = integer_sequence<std::size_t, Is...>;
57 | 
58 | ///////////////////////////////////////////////////////////////////////////////
59 | 
60 | namespace detail
61 | {
62 | 
63 | // Create a new integer_sequence containing the elements of Sequence0 followed
64 | // by the elements of Sequence1. Sequence0::size() is added to each element from
65 | // Sequence1 in the new sequence.
66 | template <typename Sequence0, typename Sequence1>
67 | struct merge_and_renumber_integer_sequences_impl;
68 | template <typename Sequence0, typename Sequence1>
69 | using merge_and_renumber_integer_sequences =
70 |   typename merge_and_renumber_integer_sequences_impl<
71 |     Sequence0, Sequence1
72 |   >::type;
73 | 
74 | // Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
75 | template <typename T, std::size_t N>
76 | struct make_integer_sequence_impl;
77 | 
78 | 
79 | } // namespace detail
80 | 
81 | ///////////////////////////////////////////////////////////////////////////////
82 | 
83 | // Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
84 | template <typename T, std::size_t N>
85 | using make_integer_sequence =
86 |   typename detail::make_integer_sequence_impl<T, N>::type;
87 | 
88 | // Create a new index_sequence with elements 0, 1, 2, ..., N - 1.
89 | template <std::size_t N>
90 | using make_index_sequence =
91 |   make_integer_sequence<std::size_t, N>;
92 | 
93 | ///////////////////////////////////////////////////////////////////////////////
94 | 
95 | template <typename T, T... Is>
96 | struct integer_sequence
97 | {
98 |   using type = integer_sequence;
99 |   using value_type = T;
100 |   using size_type = size_t;
101 | 
102 |   KAT_HD
103 |   static constexpr size_type size() noexcept
104 |   {
105 |     return sizeof...(Is);
106 |   }
107 | };
108 | ///////////////////////////////////////////////////////////////////////////////
109 | 
110 | namespace detail
111 | {
112 | 
113 | template <typename T, T... Is0, T... Is1>
114 | struct merge_and_renumber_integer_sequences_impl<
115 |   integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
116 | >
117 | {
118 |   using type = integer_sequence<T, Is0..., (sizeof...(Is0) + Is1)...>;
119 | };
120 | 
121 | ///////////////////////////////////////////////////////////////////////////////
122 | 
123 | template <typename T, std::size_t N>
124 | struct make_integer_sequence_impl
125 | {
126 |   using type = merge_and_renumber_integer_sequences<
127 |     make_integer_sequence<T, N / 2>
128 |   , make_integer_sequence<T, N - N / 2>
129 |   >;
130 | };
131 | 
132 | template <typename T>
133 | struct make_integer_sequence_impl<T, 0>
134 | {
135 |   using type = integer_sequence<T>;
136 | };
137 | 
138 | template <typename T>
139 | struct make_integer_sequence_impl<T, 1>
140 | {
141 |   using type = integer_sequence<T, 0>;
142 | };
143 | 
144 | } // namespace detail
145 | 
146 | #endif // __cplusplus >= 201402L
147 | 
148 | ///////////////////////////////////////////////////////////////////////////////
149 | 
150 | namespace detail
151 | {
152 | 
153 | // Create a new integer_sequence containing the elements of Sequence0 followed
154 | // by the elements of Sequence1. Sequence1::size() is added to each element from
155 | // Sequence0 in the new sequence.
156 | template <typename Sequence0, typename Sequence1>
157 | struct merge_and_renumber_reversed_integer_sequences_impl;
158 | template <typename Sequence0, typename Sequence1>
159 | using merge_and_renumber_reversed_integer_sequences =
160 |   typename merge_and_renumber_reversed_integer_sequences_impl<
161 |     Sequence0, Sequence1
162 |   >::type;
163 | 
164 | // Create a new integer_sequence with elements N - 1, N - 2, N - 3, ..., 0.
165 | template <typename T, std::size_t N>
166 | struct make_reversed_integer_sequence_impl;
167 | 
168 | // Add a new element to the front of an integer_sequence<>.
169 | template <typename T, T Value, typename Sequence>
170 | struct integer_sequence_push_front_impl;
171 | 
172 | // Add a new element to the back of an integer_sequence<>.
173 | template <typename T, T Value, typename Sequence>
174 | struct integer_sequence_push_back_impl;
175 | 
176 | }
177 | 
178 | ///////////////////////////////////////////////////////////////////////////////
179 | 
180 | // Create a new integer_sequence with elements N - 1, N - 2, N - 3, ..., 0.
181 | template <typename T, std::size_t N>
182 | using make_reversed_integer_sequence =
183 |   typename detail::make_reversed_integer_sequence_impl<T, N>::type;
184 | 
185 | // Create a new index_sequence with elements N - 1, N - 2, N - 3, ..., 0.
186 | template <std::size_t N>
187 | using make_reversed_index_sequence =
188 |   make_reversed_integer_sequence<std::size_t, N>;
189 | 
190 | // Add a new element to the front of an integer_sequence<>.
191 | template <typename T, T Value, typename Sequence>
192 | using integer_sequence_push_front =
193 |   typename detail::integer_sequence_push_front_impl<T, Value, Sequence>::type;
194 | 
195 | // Add a new element to the back of an integer_sequence<>.
196 | template <typename T, T Value, typename Sequence>
197 | using integer_sequence_push_back =
198 |   typename detail::integer_sequence_push_back_impl<T, Value, Sequence>::type;
199 | 
200 | ///////////////////////////////////////////////////////////////////////////////
201 | 
202 | namespace detail
203 | {
204 | 
205 | template <typename T, T... Is0, T... Is1>
206 | struct merge_and_renumber_reversed_integer_sequences_impl<
207 |   integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
208 | >
209 | {
210 |   using type = integer_sequence<T, (sizeof...(Is1) + Is0)..., Is1...>;
211 | };
212 | 
213 | ///////////////////////////////////////////////////////////////////////////////
214 | 
215 | template <typename T, std::size_t N>
216 | struct make_reversed_integer_sequence_impl
217 | {
218 |   using type = merge_and_renumber_reversed_integer_sequences<
219 |     make_reversed_integer_sequence<T, N / 2>
220 |   , make_reversed_integer_sequence<T, N - N / 2>
221 |   >;
222 | };
223 | 
224 | ///////////////////////////////////////////////////////////////////////////////
225 | 
226 | template <typename T>
227 | struct make_reversed_integer_sequence_impl<T, 0>
228 | {
229 |   using type = integer_sequence<T>;
230 | };
231 | 
232 | template <typename T>
233 | struct make_reversed_integer_sequence_impl<T, 1>
234 | {
235 |   using type = integer_sequence<T, 0>;
236 | };
237 | 
238 | ///////////////////////////////////////////////////////////////////////////////
239 | 
240 | template <typename T, T Value, T... Is>
241 | struct integer_sequence_push_front_impl<T, Value, integer_sequence<T, Is...> >
242 | {
243 |   using type = integer_sequence<T, Value, Is...>;
244 | };
245 | 
246 | ///////////////////////////////////////////////////////////////////////////////
247 | 
248 | template <typename T, T Value, T... Is>
249 | struct integer_sequence_push_back_impl<T, Value, integer_sequence<T, Is...> >
250 | {
251 |   using type = integer_sequence<T, Is..., Value>;
252 | };
253 | 
254 | ///////////////////////////////////////////////////////////////////////////////
255 | 
256 | } // namespace detail
257 | 
258 | } // namespace kat
259 | 
260 | #endif // CUDA_KAT_INTEGER_SEQUENCE_HPP_
261 | 
--------------------------------------------------------------------------------
/src/kat/detail/pointers.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #ifndef CUDA_KAT_POINTERS_CUH_
3 | #define CUDA_KAT_POINTERS_CUH_
4 | 
5 | #include <kat/common.hpp>
6 | 
7 | 
8 | ///@cond
9 | #include <cstdint>
10 | ///@endcond
11 | 
12 | namespace kat {
13 | namespace detail {
14 | 
15 | static constexpr auto obj_ptr_size { sizeof(void *) };
16 | static constexpr auto fun_ptr_size { sizeof(void (*)()) };
17 | //auto dat_mem_ptr_size = sizeof(generic_dat_mem_ptr_t);
18 | //auto mem_fun_size = sizeof(generic_mem_fun_ptr_t);
19 | 
20 | //auto max_ptr_size = std::max({ sizeof(generic_obj_ptr_t), sizeof(generic_fun_ptr_t), sizeof(generic_dat_mem_ptr_t), sizeof(generic_mem_fun_ptr_t) });
21 | //auto max_ptr_align = std::max({ alignof(generic_obj_ptr_t), alignof(generic_fun_ptr_t), alignof(generic_dat_mem_ptr_t), alignof(generic_mem_fun_ptr_t) });
22 | 
23 | static constexpr auto max_ptr_size { (obj_ptr_size > fun_ptr_size) ? obj_ptr_size : fun_ptr_size };
24 | 
25 | static_assert(max_ptr_size == sizeof(uint64_t), "Unexpected maximum pointer size");
26 | 
27 | using address_t = uint64_t;
28 | 
29 | //KAT_FHD address_t address_as_number (address_t address) { return address; }
30 | template <typename T>
31 | constexpr KAT_FHD address_t address_as_number (const T* address) noexcept { return reinterpret_cast<address_t>(address); }
32 | template <typename T>
33 | constexpr KAT_FHD T* address_as_pointer(address_t address) noexcept { return reinterpret_cast<T*>(address); }
34 | 
35 | template <typename T1, typename T2>
36 | KAT_FHD std::ptrdiff_t address_difference(T1* p1, T2* p2)
37 | {
38 | 	return address_as_number(p1) - address_as_number(p2);
39 | }
40 | 
41 | 
42 | // TODO: Code duplication with math.cuh
43 | template <typename I>
44 | constexpr KAT_FHD bool is_power_of_2(I val) { return (val & (val-1)) == 0; }
45 | 
46 | template <typename T>
47 | constexpr KAT_FHD address_t misalignment_extent(address_t address) noexcept
48 | {
49 | 	static_assert(is_power_of_2(sizeof(T)),"Invalid type for alignment");
50 | 	constexpr address_t mask = sizeof(T) - 1; // utilizing the fact that it's a power of 2
51 | 	return address & mask;
52 | }
53 | 
54 | /**
55 |  * Computes the number of bytes by which a pointer is misaligned.
56 |  *
57 |  * @tparam T The pointer-to element type; its size must be a power of 2.
58 |  *
59 |  * @param ptr The possibly-misaligned pointer
60 |  * @return the minimum number of bytes which, if deducted from ptr, produces
61 |  * a T-aligned pointer
62 |  */
63 | template <typename T, typename U>
64 | constexpr KAT_FHD address_t misalignment_extent(const U* ptr) noexcept
65 | {
66 | 	return misalignment_extent<T>(address_as_number(ptr));
67 | }
68 | 
69 | template <typename T, typename U>
70 | constexpr KAT_FHD bool is_aligned(const U* ptr) noexcept
71 | {
72 | 	return misalignment_extent<T>(reinterpret_cast<address_t>(ptr)) == 0;
73 | }
74 | 
75 | template <typename T>
76 | constexpr KAT_FHD bool is_aligned(address_t address) noexcept
77 | {
78 | 	return misalignment_extent<T>(address) == 0;
79 | }
80 | 
81 | template <typename T>
82 | constexpr KAT_FHD address_t align_down(address_t address) noexcept
83 | {
84 | 	return (address - misalignment_extent<T>(address));
85 | }
86 | 
87 | template <typename T>
88 | constexpr KAT_FHD address_t align_up(address_t address) noexcept
89 | {
90 | 	return address + (is_aligned<T>(address) ? 0 : (sizeof(T) - misalignment_extent<T>(address)));
91 | }
92 | 
93 | /**
94 |  * @tparam AlignBy a type whose size is a power of 2 (and thus has natural alignment)
95 |  * @param ptr A possibly-unaligned pointer
96 |  * @return A pointer to the closest aligned T in memory up to and including @p ptr
97 |  */
98 | template <typename AlignBy, typename T>
99 | KAT_FHD AlignBy* align_down(T* ptr) noexcept
100 | {
101 | 	// Note: The compiler _should_ optimize out the inefficiency of using
102 | 	// misalignment_extent rather than just applying a mask once.
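	// As a concrete illustration (hypothetical values): with AlignBy = uint4,
	// whose size is 16 bytes, a pointer whose address is 0x1003 is aligned down
	// to 0x1000 - i.e. the low log2(sizeof(AlignBy)) bits of the address are cleared.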
103 | 	address_t address = address_as_number(ptr);
104 | 	auto aligned_addr = align_down<AlignBy>(address);
105 | 	return (AlignBy*) address_as_pointer<AlignBy>(aligned_addr);
106 | }
107 | 
108 | template <typename AlignBy, typename T>
109 | KAT_FHD AlignBy* align_up(T* ptr) noexcept
110 | {
111 | 	address_t address = address_as_number(ptr);
112 | 	auto aligned_addr = align_up<AlignBy>(address);
113 | 	return (AlignBy*) address_as_pointer<AlignBy>(aligned_addr);
114 | }
115 | 
116 | template <typename T>
117 | KAT_FHD T* align_down(T* ptr) noexcept
118 | {
119 | 	return const_cast<T*>(align_down<T>(reinterpret_cast<const T*>(ptr)));
120 | }
121 | 
122 | template <typename T>
123 | KAT_FHD T* align_up(T* ptr) noexcept
124 | {
125 | 	return const_cast<T*>(align_up<T>(reinterpret_cast<const T*>(ptr)));
126 | }
127 | 
128 | 
129 | } // namespace detail
130 | } // namespace kat
131 | 
132 | #include
133 | 
134 | #endif // CUDA_KAT_POINTERS_CUH_
135 | 
--------------------------------------------------------------------------------
/src/kat/on_device/atomics.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/atomics.cuh
3 |  *
4 |  * @brief Type-generic wrappers for CUDA atomic operations
5 |  *
6 |  * CUDA's "primitive" atomic functions are non-generic C functions,
7 |  * defined only for some specific types - and sometimes only for some of the
8 |  * types of the same size for which semantics are identical. In this file
9 |  * are found type-generic variants of these same functions, with functionality
10 |  * extended as much as possible - either through recasting or using
11 |  * the compare-and-swap (compare-and-exchange) primitive to implement other
12 |  * functions for types not directly supported.
13 |  *
14 |  * Additionally, the wrapper used for emulating atomics on arbitrary types
15 |  * is made available here for the user to be able to do the same for
16 |  * arbitrary functions.
17 |  *
18 |  * @note nVIDIA makes a rather unfortunate and non-intuitive choice of parameter
19 |  * names for its atomic functions, which - at least for now, and for the sake of
20 |  * consistency - I adopt: they call a pointer an "address", and they call the
21 |  * new value "val" even if there is another value to consider (e.g. atomicCAS).
22 |  * Also, what's with the shorthand? Did you run out of disk space? :-(
23 |  */
24 | 
25 | #ifndef CUDA_KAT_ON_DEVICE_ATOMICS_CUH_
26 | #define CUDA_KAT_ON_DEVICE_ATOMICS_CUH_
27 | 
28 | #include <limits>
29 | 
30 | ///@cond
31 | #include <kat/on_device/common.cuh>
32 | ///@endcond
33 | 
34 | namespace kat {
35 | namespace atomic {
36 | 
37 | template <typename T> KAT_FD T add        (T* address, T val);
38 | template <typename T> KAT_FD T subtract   (T* address, T val);
39 | template <typename T> KAT_FD T exchange   (T* address, T val);
40 | template <typename T> KAT_FD T min        (T* address, T val);
41 | template <typename T> KAT_FD T max        (T* address, T val);
42 | template <typename T> KAT_FD T logical_and(T* address, T val);
43 | template <typename T> KAT_FD T logical_or (T* address, T val);
44 | template <typename T> KAT_FD T logical_not(T* address);
45 | template <typename T> KAT_FD T logical_xor(T* address, T val);
46 | template <typename T> KAT_FD T bitwise_or (T* address, T val);
47 | template <typename T> KAT_FD T bitwise_and(T* address, T val);
48 | template <typename T> KAT_FD T bitwise_xor(T* address, T val);
49 | template <typename T> KAT_FD T bitwise_not(T* address);
50 | template <typename T> KAT_FD T set_bit    (T* address, native_word_t bit_index);
51 | template <typename T> KAT_FD T unset_bit  (T* address, native_word_t bit_index);
52 | /**
53 |  * @brief Increment the value at @p address by 1 - but if it reaches or surpasses @p modulus, set it to 0.
54 |  *
55 |  * @note repeated invocations of this function will cycle through the range of values 0 ... @p modulus - 1; thus
56 |  * as long as the existing value is within that range, this is a simple incrementation modulo @p modulus.
57 |  */
58 | template <typename T> KAT_FD T increment  (T* address, T modulus = std::numeric_limits<T>::max());
59 | /**
60 |  * @brief Decrement the value at @p address by 1 - but if it reaches 0, or surpasses @p modulus, it is set
61 |  * to @p modulus - 1.
62 |  *
63 |  * @note repeated invocations of this function will cycle backwards through the range of values 0 ...
64 |  * @p modulus - 1; thus as long as the existing value is within that range, this is a simple decrementation
65 |  * modulo @p modulus.
66 |  */
67 | template <typename T> KAT_FD T decrement  (T* address, T modulus = std::numeric_limits<T>::max());
68 | 
69 | 
70 | // Note: We let this one take a const reference
71 | template <typename T> KAT_FD T compare_and_swap(
72 | 	T*      address,
73 | 	const T compare,
74 | 	const T val);
75 | 
76 | 
77 | /**
78 |  * Use atomic compare-and-swap to apply a unary function to some value,
79 |  * replacing it at its memory location with the result before anything
80 |  * else changes it.
81 |  *
82 |  * @return The new value which was stored in memory
83 |  */
84 | template <typename UnaryFunction, typename T>
85 | KAT_FD T apply_atomically(UnaryFunction f, T* address);
86 | 
87 | /**
88 |  * Use atomic compare-and-swap to apply a binary function to two values,
89 |  * replacing the first at its memory location with the result before anything
90 |  * else changes it.
91 |  *
92 |  * @return The new value which was stored in memory
93 |  */
94 | template <typename Function, typename T, typename... Ts>
95 | KAT_FD T apply_atomically(
96 | 	Function        f,
97 | 	T* __restrict__ address,
98 | 	const Ts...     xs);
99 | 
100 | 
101 | } // namespace atomic
102 | } // namespace kat
103 | 
104 | #include "detail/atomics.cuh"
105 | 
106 | #endif // CUDA_KAT_ON_DEVICE_ATOMICS_CUH_
--------------------------------------------------------------------------------
/src/kat/on_device/collaboration/grid.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/collaboration/grid.cuh
3 |  *
4 |  * @brief CUDA device computation grid-level primitives, i.e. those involving
5 |  * interaction of threads from different blocks in the grid
6 |  *
7 |  */
8 | 
9 | #pragma once
10 | #ifndef CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_GRID_CUH_
11 | #define CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_GRID_CUH_
12 | 
13 | #include "warp.cuh"
14 | 
15 | #include <kat/on_device/common.cuh>
16 | #include <kat/on_device/grid_info.cuh>
17 | #include <kat/on_device/math.cuh>
18 | 
19 | #include <type_traits>
20 | 
21 | 
22 | ///@cond
23 | #include <kat/detail/execution_space_specifiers.hpp>
24 | ///@endcond
25 | 
26 | namespace kat {
27 | namespace linear_grid {
28 | namespace collaborative {
29 | namespace grid {
30 | 
31 | // If we want to refer to other primitives, we'll make those references explicit;
32 | // but we do want to be able to say `warp::index()` without prefixing that with anything.
33 | 
34 | namespace grid   = kat::linear_grid::grid_info::grid;
35 | namespace block  = kat::linear_grid::grid_info::block;
36 | namespace warp   = kat::linear_grid::grid_info::warp;
37 | namespace thread = kat::linear_grid::grid_info::thread;
38 | namespace lane   = kat::linear_grid::grid_info::lane;
39 | 
40 | /**
41 |  * Have all kernel threads perform some action over the linear range
42 |  * of 0..length-1, at strides equal to the grid length, i.e. a thread
43 |  * with index i_t in block with index i_b, where block lengths are n_b
44 |  * and the grid has g blocks, will perform the action on elements
45 |  * i_b * n_b + i_t, then advance by g * n_b each time, and so on.
46 |  *
47 |  * Thus, if in the following chart the rectangles represent
48 |  * consecutive segments of n_b integers, the numbers
49 |  * indicate which blocks work on which elements in "grid stride":
50 |  *
51 |  *  -------------------------------------------------------
52 |  *  |   1   |  222  |  333  |   1   |  222  |  333  |   1   |
53 |  *  |  11   | 2   2 | 3   3 |  11   | 2   2 | 3   3 |  11   |
54 |  *  |   1   |     2 |     3 |   1   |     2 |     3 |   1   |
55 |  *  |   1   |  222  |    3  |   1   |  222  |    3  |   1   |
56 |  *  |   1   | 2     | 3   3 |   1   | 2     | 3   3 |   1   |
57 |  *  |  111  | 22222 |  333  |  111  | 22222 |  333  |  111  |
58 |  *  -------------------------------------------------------
59 |  *
60 |  * (the grid is 3 blocks' worth, so block 1 strides 3 blocks
61 |  * from one sequence of indices it processes to the next.)
62 |  * This is unlike `at_block_stride()`, for which instead
63 |  * of 1, 2, 3, 1, 2, 3, 1 we would have 1, 1, 1, 2, 2, 2, 3
64 |  * (or 1, 1, 2, 2, 3, 3, 4 if the grid has 4 blocks).
65 |  *
66 |  * @note assumes the number of grid threads is fixed (does that
67 |  * always hold? even with dynamic parallelism?)
68 |  *
69 |  * @param length The length of the range (of integers) on which to act
70 |  * @param f The callable to call for each element of the sequence.
71 |  */
72 | template <typename Function, typename Size = size_t>
73 | KAT_FD void at_grid_stride(Size length, const Function& f)
74 | {
75 | 	auto num_grid_threads = grid::num_threads();
76 | 	for(promoted_size_t<Size> pos = thread::global_id();
77 | 		pos < length;
78 | 		pos += num_grid_threads)
79 | 	{
80 | 		f(pos);
81 | 	}
82 | }
83 | 
84 | namespace warp_per_input_element {
85 | 
86 | /**
87 |  * A variant of the one-position-per-thread applicator,
88 |  * `collaborative::grid::at_grid_stride()`: Here each warp works on one
89 |  * input position, advancing by 'grid stride' in the sense of total
90 |  * warps in the grid.
91 |  *
92 |  * @note it is assumed the grid only has fully-active warps; any
93 |  * possibly-inactive threads are not given consideration.
94 |  *
95 |  * @note This version of `at_grid_stride` is specific to linear grids,
96 |  * even though the text of its code looks the same as that of
97 |  * @ref kat::grid_info::collaborative::warp::at_grid_stride .
98 |  *
99 |  * @param length The length of the range of positions on which to act
100 |  * @param f The callable for warps to apply to each position in the sequence
101 |  */
102 | template <typename Function, typename Size = size_t>
103 | KAT_FD void at_grid_stride(Size length, const Function& f)
104 | {
105 | 	auto num_warps_in_grid = grid_info::grid::num_warps();
106 | 	for(// _not_ the global thread index! - one element per warp
107 | 		promoted_size_t<Size> pos = grid_info::warp::global_id();
108 | 		pos < length;
109 | 		pos += num_warps_in_grid)
110 | 	{
111 | 		f(pos);
112 | 	}
113 | }
114 | 
115 | 
116 | } // namespace warp_per_input_element
117 | 
118 | 
119 | /**
120 |  * Have all grid threads perform some action over the linear range
121 |  * of 0..length-1, with each thread acting on a fixed number of items
122 |  * (the @p serialization_factor) at a stride of the block length,
123 |  * i.e. a thread with index i_t in
124 |  * block with index i_b, where block lengths are n_b,
125 |  * will perform the action on elements
126 |  *
127 |  *  n_b * i_b * serialization_factor + i_t,
128 |  *  n_b * (i_b * serialization_factor + 1) + i_t,
129 |  *  n_b * (i_b * serialization_factor + 2) + i_t,
130 |  *
131 |  * and so on. For lengths which are not divisible by n_b *
132 |  * serialization_factor, threads in the last block will
133 |  * work on fewer items.
134 |  *
135 |  * Thus, if in the following chart the rectangles represent
136 |  * consecutive segments of n_b integers, the numbers
137 |  * indicate which blocks work on which elements in "block stride":
138 |  *
139 |  *  -------------------------------------------------------
140 |  *  |   1   |   1   |  222  |  222  |  333  |  333  |    4  |
141 |  *  |  11   |  11   | 2   2 | 2   2 | 3   3 | 3   3 |   44  |
142 |  *  |   1   |   1   |     2 |     2 |     3 |     3 |  4 4  |
143 |  *  |   1   |   1   |  222  |  222  |    3  |    3  | 4  4  |
144 |  *  |   1   |   1   | 2     | 2     | 3   3 | 3   3 | 44444 |
145 |  *  |  111  |  111  | 22222 | 22222 |  333  |  333  |    4  |
146 |  *  -------------------------------------------------------
147 |  *
148 |  * (A block strides from one blocks' worth of indices to the next.)
149 |  * This is unlike `at_grid_stride()`, for which instead
150 |  * of 1, 1, 2, 2, 3, 3, 4 we would have 1, 2, 3, 1, 2, 3, 1 (if the
151 |  * grid has 3 blocks) or 1, 2, 3, 4, 1, 2 (if the grid has 4 blocks).
152 |  *
153 |  * @note Theoretically, the @p serialization_factor value could be
154 |  * computed by this function itself. This is avoided, assuming that's
155 |  * been taken care of before. Specifically, we assume that the
156 |  * @p serialization_factor is no higher than it absolutely
157 |  * must be.
158 |  *
159 |  * @note There's a block-level variant of this primitive, but there -
160 |  * each block applies f to the _same_ range of elements, rather than
161 |  * covering part of a larger range.
162 |  *
163 |  * @note This implementation does not handle cases of overflow of
164 |  * the Size type, e.g. if your Size is uint32_t and @p
165 |  * length is close to 2^32 - 1, the function may fail.
166 |  *
167 |  * @note There's a tricky tradeoff here between avoiding per-iteration
168 |  * checks for whether we're past the end, and avoiding too many
169 |  * initial checks. Two of the template parameters help us avoid
170 |  * this tradeoff in certain cases, by letting us skip explicit
171 |  * checks for some conditions.
172 |  *
173 |  *
174 |  * @param length The length of the range (of integers) on which to act
175 |  * @param serialization_factor the number of elements each thread is to
176 |  * handle (serially)
177 |  * @param f The callable to execute for each element of the sequence.
178 |  *
179 |  */
180 | template <
181 | 	typename Function,
182 | 	typename Size = size_t,
183 | 	bool AssumeLengthIsDivisibleByBlockSize = false,
184 | 	bool GridMayFullyCoverLength = true,
185 | 	typename SerializationFactor = unsigned>
186 | KAT_FD void at_block_stride(
187 | 	Size length,
188 | 	const Function& f,
189 | 	SerializationFactor serialization_factor)
190 | {
191 | 	auto block_length = block::length();
192 | 	auto num_elements_to_process_by_each_block = serialization_factor * block_length;
193 | 	Size block_start_pos = num_elements_to_process_by_each_block * block::index();
194 | 	Size pos = block_start_pos + thread::index();
195 | 	if (pos >= length) { return; }
196 | 	auto in_last_acting_block = (block_start_pos + num_elements_to_process_by_each_block >= length);
197 | 	// Note: Be careful about overflow in this last line, if block_start_pos is close
198 | 	// to the maximum value of Size.
199 | 
200 | 	if (in_last_acting_block) {
201 | 		#pragma unroll
202 | 		for(; pos < length; pos += block_length) {
203 | 			f(pos);
204 | 		}
205 | 		return;
206 | 	}
207 | 	// If we're not in the last block which needs to take any action, we assume that we'll perform
208 | 	// full iterations and don't need to check for overstepping any bounds
209 | 	#pragma unroll
210 | 	for(SerializationFactor i = 0; i < serialization_factor; i++, pos += block_length) {
211 | 		f(pos);
212 | 	}
213 | }
214 | 
215 | } // namespace grid
216 | } // namespace collaborative
217 | } // namespace linear_grid
218 | } // namespace kat
219 | 
220 | #endif // CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_GRID_CUH_
--------------------------------------------------------------------------------
/src/kat/on_device/common.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/common.cuh
3 |  *
4 |  * @brief Some basic type and constant definitions used by all
5 |  * device-side CUDA KAT code
6 |  *
7 |  */
8 | #pragma once
9 | #ifndef CUDA_KAT_ON_DEVICE_COMMON_CUH_
10 | #define CUDA_KAT_ON_DEVICE_COMMON_CUH_
11 | 
12 | #include <cuda_runtime.h>
13 | #include <climits> // for CHAR_BIT
14 | #include <type_traits>
15 | 
16 | ///@cond
17 | #include <kat/detail/execution_space_specifiers.hpp>
18 | ///@endcond
19 | 
20 | namespace kat {
21 | 
22 | /**
23 |  * CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
24 |  * In each of these, the number of blocks per grid is specified in this type.
25 |  *
26 |  * @note Theoretically, CUDA could split the type for blocks per grid and
27 |  * threads per block, but for now they're the same.
28 |  *
29 |  * @note All three dimensions in dim3 are of the same type as dim3::x
30 |  */
31 | using grid_dimension_t = decltype(dim3::x);
32 | 
33 | /**
34 |  * CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
35 |  * In each of these, the number of threads per block is specified in this type.
36 |  *
37 |  * @note Theoretically, CUDA could split the type for blocks per grid and
38 |  * threads per block, but for now they're the same.
39 |  */
40 | using grid_block_dimension_t = grid_dimension_t;
41 | 
42 | using native_word_t = unsigned; // TODO: Make this uint32_t perhaps?
43 | enum : native_word_t { warp_size = 32 };
44 | enum : native_word_t { log_warp_size = 5 };
45 | 
46 | /**
47 |  * @brief a size type no smaller than a native word.
48 |  *
49 |  * Sometimes, in device code, we only need our size type to cover a small
50 |  * range of values; but - it is still more effective to use a full native word,
51 |  * rather than to risk extra instructions to enforce the limits of
52 |  * sub-native-word values. And while it's true this might not help much,
53 |  * or be optimized away - let's be on the safe side anyway.
54 |  */
55 | template <typename Size>
56 | using promoted_size_t = typename std::common_type<Size, native_word_t>::type;
57 | 
58 | /**
59 |  * A mask with one bit for each lane in a warp. Used to indicate which threads
60 |  * meet a certain criterion or need to have some action applied to them.
61 |  *
62 |  * @todo: Consider using a 32-bit bit field
63 |  */
64 | using lane_mask_t = unsigned;
65 | 
66 | enum : lane_mask_t {
67 | 	full_warp_mask  = 0xFFFFFFFF, //!< Bits turned on for all lanes in the warp
68 | 	empty_warp_mask = 0x0,        //!< Bits turned off for all lanes in the warp
69 | };
70 | 
71 | 
72 | /**
73 |  * The number of bits in the representation of a value of type T.
74 |  *
75 |  * @note with this variant, you'll need to manually specify the type.
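 *
 * E.g. (assuming a platform on which int is 4 bytes, as on CUDA targets):
 *
 *   static_assert(kat::size_in_bits<int>() == 32, "");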
76 | */ 77 | template 78 | constexpr std::size_t size_in_bits() { return sizeof(T) * CHAR_BIT; } 79 | 80 | //constexpr KAT_FHD bool operator==(const dim3& lhs, const dim3& rhs) 81 | //{ 82 | // return lhs.x == rhs.x and lhs.y == rhs.y and lhs.z == rhs.z; 83 | //} 84 | 85 | /** 86 | * The number bits in the representation of a value of type T 87 | * 88 | * @note with this variant, the type will be deduced from the 89 | * object you pass. 90 | */ 91 | template 92 | constexpr std::size_t size_in_bits(const T&) { return sizeof(T) * CHAR_BIT; } 93 | 94 | 95 | /* 96 | namespace detail { 97 | 98 | ** 99 | * Use CUDA intrinsics where possible and relevant to reinterpret the bits 100 | * of values of different types 101 | * 102 | * @param x[in] the value to reinterpret. No references please! 103 | * @return the reinterpreted value 104 | * 105 | template 106 | KAT_FD Interpreted reinterpret( 107 | typename std::enable_if< 108 | !std::is_same< 109 | typename std::decay::type, // I actually just don't want references here 110 | typename std::decay::type>::value && // I actually just don't want references here 111 | sizeof(ToInterpret) == sizeof(Interpreted), ToInterpret>::type x) 112 | { 113 | return x; 114 | } 115 | 116 | template<> KAT_FD double reinterpret(long long int x) { return __longlong_as_double(x); } 117 | template<> KAT_FD long long int reinterpret(double x) { return __double_as_longlong(x); } 118 | 119 | template<> KAT_FD double reinterpret(unsigned long long int x) { return __longlong_as_double(x); } 120 | template<> KAT_FD unsigned long long int reinterpret(double x) { return __double_as_longlong(x); } 121 | 122 | template<> KAT_FD float reinterpret(int x) { return __int_as_float(x); } 123 | template<> KAT_FD int reinterpret(float x) { return __float_as_int(x); } 124 | 125 | } // namespace detail 126 | */ 127 | 128 | /** 129 | * @note Interpreted can be either a value or a reference type. 130 | * 131 | * @todo Would it be better to return a reference? 132 | */ 133 | template 134 | KAT_FHD Interpreted reinterpret(Original& x) 135 | { 136 | return reinterpret_cast(x); 137 | } 138 | 139 | } // namespace kat 140 | 141 | #endif // CUDA_KAT_ON_DEVICE_COMMON_CUH_ 142 | -------------------------------------------------------------------------------- /src/kat/on_device/detail/atomics/missing_in_cuda.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2019, Eyal Rozenberg 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 9 | * * Redistributions of source code must retain the above copyright notice, 10 | * this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of CWI Amsterdam nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | * POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | */ 31 | #pragma once 32 | #ifndef CUDA_CUDA_KAT_ON_DEVICE_ATOMICS_MISSING_FROM_CUDA_CUH_ 33 | #define CUDA_CUDA_KAT_ON_DEVICE_ATOMICS_MISSING_FROM_CUDA_CUH_ 34 | 35 | #include 36 | #include 37 | 38 | static_assert(sizeof (unsigned long int) == sizeof(unsigned long long int), "Unexpected size of unsigned long int"); 39 | 40 | // Annoyingly, CUDA - upto and including version 10.2 - provide atomic 41 | // operation wrappers for unsigned int and unsigned long long int, but 42 | // not for the in-between type of unsigned long int. Also, some atomic 43 | // operations are provided for ints, i.e. not just for unsigned types, 44 | // but not consistently, i.e. int yes, long long int no, despite being 45 | // provided for unsigned long long int. So - we have to fill the gap. 46 | // 47 | // TODO: On CUDA devices, sizeof(long) is 8, like sizeof(long long). However, 48 | // that's not true on Windows host-side code. Need to double check this discrepancy 49 | // doesn't mess this code's correctness up somehow. 50 | 51 | #define CUDA_KAT_DEFINE_MISSING_ATOMIC(arg_type, op) \ 52 | KAT_FD arg_type atomic ## op(arg_type *address, arg_type val) \ 53 | { \ 54 | static_assert(sizeof(long) == sizeof(int) or sizeof(long) == sizeof(long long int), "Unexpected sizeof(long)"); \ 55 | return (sizeof(arg_type) == sizeof(unsigned int)) ? 
56 | 		::atomic ## op(reinterpret_cast<unsigned int*>(address), reinterpret_cast<unsigned int&>(val)) : \
57 | 		::atomic ## op(reinterpret_cast<unsigned long long int*>(address), reinterpret_cast<unsigned long long int&>(val)); \
58 | }
59 | 
60 | #define CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(op) \
61 | CUDA_KAT_DEFINE_MISSING_ATOMIC(unsigned long, op) \
62 | CUDA_KAT_DEFINE_MISSING_ATOMIC(long, op) \
63 | CUDA_KAT_DEFINE_MISSING_ATOMIC(long long, op)
64 | 
65 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Add)
66 | #if __CUDA_ARCH__ >= 320
67 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(And)
68 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Or)
69 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Xor)
70 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Min)
71 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Max)
72 | #endif
73 | 
74 | #undef CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP
75 | #undef CUDA_KAT_DEFINE_MISSING_ATOMIC
76 | 
77 | #endif // CUDA_CUDA_KAT_ON_DEVICE_ATOMICS_MISSING_FROM_CUDA_CUH_
78 | 
-------------------------------------------------------------------------------- /src/kat/on_device/detail/itoa.cuh: --------------------------------------------------------------------------------
1 | #ifndef KAT_ON_DEVICE_DETAIL_ITOA_CUH_
2 | #define KAT_ON_DEVICE_DETAIL_ITOA_CUH_
3 | 
4 | #include 
5 | 
6 | namespace kat {
7 | namespace detail {
8 | 
9 | template <typename T> struct max_num_digits { };
10 | template <> struct max_num_digits<uint8_t>  { static constexpr const unsigned value { 3 }; };
11 | template <> struct max_num_digits<uint16_t> { static constexpr const unsigned value { 5 }; };
12 | template <> struct max_num_digits<uint32_t> { static constexpr const unsigned value { 10 }; };
13 | template <> struct max_num_digits<uint64_t> { static constexpr const unsigned value { 20 }; };
14 | 
15 | template <typename I>
16 | inline KAT_DEV unsigned integer_to_string_reversed(I value, char* buffer)
17 | {
18 | 	bool append_minus {
19 | #pragma push
20 | #pragma diag_suppress = unsigned_compare_with_zero
21 | 		std::is_signed<I>::value and (value < 0)
22 | #pragma pop
23 | 	};
24 | 	value = builtins::absolute_value(value);
25 | 
26 | 	char *reverse_ptr = buffer;
27 | 	do {
28 | 		*reverse_ptr++ = '0' + (value % 10);
29 | 		value /= 10;
30 | 	} while (value > 0);
31 | 
32 | 	if (append_minus) { *reverse_ptr++ = '-'; }
33 | 	return reverse_ptr - buffer;
34 | }
35 | 
36 | inline KAT_DEV char* copy_in_reverse(char* dst, const char* src, std::size_t length)
37 | {
38 | 	for(std::size_t i = 0; i < length; i++) {
39 | 		dst[i] = src[length - i - 1];
40 | 	}
41 | 	return dst;
42 | }
43 | 
44 | // This is not supposed to be optimal, just a straightforward short implementation
45 | template <bool WriteTermination = true, typename I>
46 | inline KAT_DEV unsigned integer_to_string(I value, char* buffer)
47 | {
48 | 	using unsigned_type = typename std::make_unsigned<I>::type;
49 | 	char reverse_buffer[max_num_digits<unsigned_type>::value];
50 | 	auto length = integer_to_string_reversed(value, reverse_buffer);
51 | 	copy_in_reverse(buffer, reverse_buffer, length);
52 | 	if (WriteTermination) { buffer[length] = '\0'; }
53 | 	return length;
54 | }
55 | 
56 | } // namespace detail
57 | } // namespace kat
58 | 
59 | 
60 | #endif // KAT_ON_DEVICE_DETAIL_ITOA_CUH_
61 | 
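
A sketch of how these itoa helpers compose, device-side (the buffer is sized via max_num_digits, plus one slot each for a possible minus sign and the terminating NUL; the function name is illustrative only):

    __device__ void print_int(int x)
    {
        char buf[kat::detail::max_num_digits<uint32_t>::value + 2];
        unsigned length = kat::detail::integer_to_string(x, buf); // also writes the '\0'
        printf("%s (%u characters)\n", buf, length);
    }
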
-------------------------------------------------------------------------------- /src/kat/on_device/math.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/math.cuh
3 |  *
4 |  * @brief Templatized mathematical function definitions for integer and floating-point types.
5 |  *
6 |  * CUDA has many mathematical primitives - which are already found in @ref `builtins.cuh`.
7 |  * However, they are often not defined for all types; and - some functions are missing
8 |  * (e.g. @ref `gcd()`) or can benefit from specialization (e.g. division by a power of 2).
9 |  * This file has the wider selection of functions, utilizing a primitive (from `builtins::`)
10 |  * when relevant, and a multi-instruction implementation otherwise.
11 |  *
12 |  * @note Including this file is sufficient for accessing all functions in
13 |  * @ref `constexpr_math.cuh`.
14 |  */
15 | #pragma once
16 | #ifndef CUDA_KAT_ON_DEVICE_MATH_CUH_
17 | #define CUDA_KAT_ON_DEVICE_MATH_CUH_
18 | 
19 | #include "common.cuh"
20 | #include "constexpr_math.cuh"
21 | #include 
22 | 
23 | 
24 | ///@cond
25 | #include 
26 | ///@endcond
27 | 
28 | #include 
29 | 
30 | namespace kat {
31 | 
32 | /**
33 |  * @brief compute the base-two logarithm of a number known to be a power of 2.
34 |  *
35 |  * @note Yes, this is trivial to do, but:
36 |  * 1. This says _what_ you're doing, not _how_ you do it (e.g. left-shifting
37 |  *    bits and such)
38 |  * 2. There's a device-side optimization here (which isn't constexpr)
39 |  *
40 |  * @param p an integral power of 2
41 |  * @return the exponent l such that 2^l equals p
42 |  */
43 | template <typename I>
44 | KAT_FD unsigned log2_of_power_of_2(I p)
45 | {
46 | 	static_assert(std::is_integral<I>::value, "Only supported for integers");
47 | 	// Remember 0 is _not_ a power of 2.
48 | 	return builtins::population_count(p - 1);
49 | }
50 | 
51 | /**
52 |  * A variant of `div_rounding_up` (which you can find in `constexpr_math.cuh`),
53 |  * which has (non-constexpr, unfortunately) optimizations based on the knowledge that
54 |  * the divisor is a power of 2
55 |  *
56 |  * @return @p dividend divided by @p divisor, rounded up to the nearest integer
57 |  */
58 | template <typename T, typename S>
59 | KAT_FD T div_by_power_of_2_rounding_up(const T& dividend, const S& divisor)
60 | {
61 | 	auto mask = divisor - 1; // Remember: 0 is _not_ a power of 2
62 | 	auto log_2_of_divisor = log2_of_power_of_2(divisor);
63 | 	auto correction_for_rounding_up = ((dividend & mask) != 0); // note the parentheses: & binds more weakly than !=
64 | 
65 | 	return (dividend >> log_2_of_divisor) + correction_for_rounding_up;
66 | }
67 | 
68 | 
69 | template <typename I, typename P>
70 | constexpr KAT_FD I div_by_power_of_2(I dividend, P power_of_2)
71 | {
72 | 	return dividend >> log2_of_power_of_2(power_of_2);
73 | }
74 | 
75 | 
76 | 
77 | #if __cplusplus < 201402L
78 | /**
79 |  * @brief compute the greatest common divisor (gcd) of two values.
80 |  *
81 |  * @param u One integral value (prefer making this the larger one)
82 |  * @param v Another integral value (prefer making this the smaller one)
83 |  * @return the largest T value d such that d divides @p u and d divides @p v.
84 |  */
85 | template <typename T>
86 | KAT_FD T gcd(T u, T v) // not constexpr here: C++11 constexpr functions can't contain loops
87 | {
88 | 	static_assert(std::is_integral<T>::value, "Only supported for integers");
89 | 	while (v != 0) {
90 | 		T r = u % v;
91 | 		u = v;
92 | 		v = r;
93 | 	}
94 | 	return u;
95 | }
96 | // ... while for C++14 and later, constexpr_math.cuh has a constexpr implementation, which we don't need to redo here
97 | #endif
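
Two of the functions above, traced on small inputs (a sketch; the wrapping function is hypothetical):

    KAT_FD void example()
    {
        auto q = kat::div_by_power_of_2_rounding_up(10u, 4u);
            // mask == 3; (10 >> 2) == 2; (10 & 3) != 0 adds 1; so q == 3
        auto g = kat::gcd(12, 18);
            // (u,v): (12,18) -> (18,12) -> (12,6) -> (6,0); so g == 6
    }
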
98 | 
99 | /**
100 |  * @brief compute the least common multiple (LCM) of two integer values
101 |  *
102 |  * @tparam I an integral (or integral-number-like) type
103 |  *
104 |  * @param u One of the numbers which must divide the result
105 |  * @param v Another one of the numbers which must divide the result
106 |  * @return The lowest positive I value which both @p u and @p v divide
107 |  */
108 | template <typename I>
109 | KAT_FD I lcm(I u, I v)
110 | {
111 | 	static_assert(std::is_integral<I>::value, "Only supported for integers at the moment");
112 | 	return (u / gcd(u,v)) * v;
113 | }
114 | 
115 | namespace detail {
116 | 
117 | 
118 | template <typename I> KAT_FD int count_leading_zeros(I x)
119 | {
120 | 	static_assert(std::is_integral<I>::value, "Only integral types are supported");
121 | 	static_assert(sizeof(I) <= sizeof(long long), "Unexpectedly large type");
122 | 
123 | 	using native_clz_type =
124 | 		typename std::conditional< sizeof(I) <= sizeof(int), int, long long >::type;
125 | 	enum : int { width_difference_in_bits = (sizeof(native_clz_type) - sizeof(I)) * CHAR_BIT };
126 | 	return builtins::count_leading_zeros(static_cast<native_clz_type>(x)) - width_difference_in_bits;
127 | }
128 | 
129 | }
130 | 
131 | /**
132 |  * @brief compute the (integral) base-two logarithm of a number
133 |  *
134 |  * @note Yes, this is trivial to do, but:
135 |  * 1. This says _what_ you're doing, not _how_ you do it (e.g. left-shifting
136 |  *    bits and such)
137 |  * 2. There's a device-side optimization here (which isn't constexpr)
138 |  *
139 |  * @param x a positive value
140 |  * @return floor(log2(x)), i.e. the greatest exponent l such that 2^l <= x
141 |  */
142 | template <typename I>
143 | KAT_FD unsigned log2(I x) {
144 | 	assert(x > 0);
145 | 	return I{CHAR_BIT * sizeof(I) - I{1} } - detail::count_leading_zeros(x);
146 | }
147 | 
148 | namespace detail {
149 | 
150 | template <typename T> KAT_FD T minimum(std::integral_constant<bool, false>, T x, T y)
151 | {
152 | 	return x < y ? x : y;
153 | }
154 | 
155 | template <typename T> KAT_FD T minimum(std::integral_constant<bool, true>, T x, T y)
156 | {
157 | 	return builtins::minimum(x, y);
158 | }
159 | 
160 | 
161 | template <typename T> KAT_FD T maximum(std::integral_constant<bool, false>, T x, T y)
162 | {
163 | 	return x > y ? x : y;
164 | }
165 | 
166 | template <typename T> KAT_FD T maximum(std::integral_constant<bool, true>, T x, T y)
167 | {
168 | 	return builtins::maximum(x, y);
169 | }
170 | 
171 | template <typename T> KAT_FD T absolute_value(std::integral_constant<bool, false>, T x)
172 | {
173 | 	return (std::is_unsigned<T>::value or x >= 0) ? x : -x;
174 | }
175 | 
176 | template <typename T> KAT_FD T absolute_value(std::integral_constant<bool, true>, T x)
177 | {
178 | 	return builtins::absolute_value(x);
179 | }
180 | 
181 | } // namespace detail
182 | 
183 | template <typename T> KAT_FD T minimum(T x, T y)
184 | {
185 | 	// TODO: Check at compile-time whether the builtin is instantiated or not - without duplicating the list of types here
186 | 	return detail::minimum(std::integral_constant<bool,
187 | 		std::is_same<T, int                >::value or
188 | 		std::is_same<T, unsigned int       >::value or
189 | 		std::is_same<T, long               >::value or
190 | 		std::is_same<T, unsigned long      >::value or
191 | 		std::is_same<T, long long          >::value or
192 | 		std::is_same<T, unsigned long long >::value or
193 | 		std::is_same<T, float              >::value or
194 | 		std::is_same<T, double             >::value>{},
195 | 		x, y);
196 | }
197 | 
198 | template <typename T> KAT_FD T maximum(T x, T y)
199 | {
200 | 	// TODO: Check at compile-time whether the builtin is instantiated or not - without duplicating the list of types here
201 | 	return detail::maximum(std::integral_constant<bool,
202 | 		std::is_same<T, int                >::value or
203 | 		std::is_same<T, unsigned int       >::value or
204 | 		std::is_same<T, long               >::value or
205 | 		std::is_same<T, unsigned long      >::value or
206 | 		std::is_same<T, long long          >::value or
207 | 		std::is_same<T, unsigned long long >::value or
208 | 		std::is_same<T, float              >::value or
209 | 		std::is_same<T, double             >::value>{},
210 | 		x, y);
211 | }
212 | 
213 | template <typename T> KAT_FD T absolute_value(T x)
214 | {
215 | 	// TODO: Check at compile-time whether the builtin is instantiated or not - without duplicating the list of types here
216 | 	return detail::absolute_value(std::integral_constant<bool,
217 | 		std::is_same<T, short      >::value or
218 | 		std::is_same<T, int        >::value or
219 | 		std::is_same<T, long       >::value or
220 | 		std::is_same<T, long long  >::value or
221 | 		std::is_same<T, float      >::value or
222 | 		std::is_same<T, double     >::value>{},
223 | 		x);
224 | }
225 | 
226 | } // namespace kat
227 | 
228 | #endif // CUDA_KAT_ON_DEVICE_MATH_CUH_
229 | 
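
The net effect of the dispatch above, sketched: types on the list are routed to the `builtins::` version, anything else falls back to a plain comparison (function name hypothetical):

    __device__ void example(float a, float b, char c, char d)
    {
        auto m1 = kat::minimum(a, b); // float is on the list - uses builtins::minimum
        auto m2 = kat::minimum(c, d); // char isn't - uses the comparison-based fallback
    }
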
-------------------------------------------------------------------------------- /src/kat/on_device/miscellany.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file kat/on_device/miscellany.cuh
3 |  *
4 |  * @brief Miscellaneous functions provided by cuda-kat which are not a good
5 |  * fit in any other header.
6 |  */
7 | #pragma once
8 | #ifndef CUDA_KAT_ON_DEVICE_MISCELLANY_CUH_
9 | #define CUDA_KAT_ON_DEVICE_MISCELLANY_CUH_
10 | 
11 | #include "common.cuh"
12 | #include 
13 | 
14 | #include 
15 | #include 
16 | #include 
17 | 
18 | ///@cond
19 | #include 
20 | ///@endcond
21 | 
22 | namespace kat {
23 | 
24 | namespace detail {
25 | 
26 | template <unsigned NumBytes, bool Signed> struct integer_type_struct;
27 | template <> struct integer_type_struct<1, false> { using type = std::uint8_t;  };
28 | template <> struct integer_type_struct<2, false> { using type = std::uint16_t; };
29 | template <> struct integer_type_struct<4, false> { using type = std::uint32_t; };
30 | template <> struct integer_type_struct<8, false> { using type = std::uint64_t; };
31 | template <> struct integer_type_struct<1, true>  { using type = std::int8_t;   };
32 | template <> struct integer_type_struct<2, true>  { using type = std::int16_t;  };
33 | template <> struct integer_type_struct<4, true>  { using type = std::int32_t;  };
34 | template <> struct integer_type_struct<8, true>  { using type = std::int64_t;  };
35 | 
36 | // TODO: Consider pushing these types upwards into kat:: proper.
37 | 
38 | /**
39 |  * A templating by size of the signed integer types
40 |  */
41 | template <unsigned NumBytes>
42 | using int_t = typename detail::integer_type_struct<NumBytes, true>::type;
43 | 
44 | /**
45 |  * A templating by size of the unsigned integer types
46 |  */
47 | template <unsigned NumBytes>
48 | using uint_t = typename detail::integer_type_struct<NumBytes, false>::type;
49 | 
50 | 
51 | /**
52 |  * @note Assumes num_elements_to_copy > 0 and the same misalignment of the source
53 |  * and destination w.r.t. native words.
54 |  */
55 | KAT_FD void copy(
56 | 	uint32_t*       __restrict__  destination,
57 | 	const uint32_t* __restrict__  source,
58 | 	std::size_t                   num_elements_to_copy)
59 | {
60 | 	while (num_elements_to_copy-- > 0) {
61 | 		*(destination++) = *(source++);
62 | 	}
63 | }
64 | 
65 | /**
66 |  * @note Assumes num_elements_to_copy > 0 and the same misalignment of the source
67 |  * and destination w.r.t. native words.
68 |  */
69 | KAT_FD void copy(
70 | 	uint16_t*       __restrict__  destination,
71 | 	const uint16_t* __restrict__  source,
72 | 	std::size_t                   num_elements_to_copy)
73 | {
74 | 	bool got_non_word_head = not is_aligned(destination);
75 | 	if (got_non_word_head) {
76 | 		*(destination++) = *(source++);
77 | 		num_elements_to_copy--;
78 | 	}
79 | 	auto num_words_to_copy =
80 | 		num_elements_to_copy / ((sizeof(native_word_t) / sizeof(uint16_t)));
81 | 		// ... so, half as many words as elements;
82 | 	detail::copy(
83 | 		reinterpret_cast<native_word_t*>(destination),
84 | 		reinterpret_cast<const native_word_t*>(source),
85 | 		num_words_to_copy
86 | 	);
87 | 	bool got_non_word_tail = not is_aligned(destination + num_elements_to_copy);
88 | 	if (got_non_word_tail) {
89 | 		destination[num_elements_to_copy - 1] = source[num_elements_to_copy - 1];
90 | 	}
91 | }
92 | 
93 | /**
94 |  * @note Assumes num_elements_to_copy > 0 and the same misalignment of the source
95 |  * and destination w.r.t. native words.
96 |  */
97 | KAT_FD void copy(
98 | 	uint8_t*       __restrict__  destination,
99 | 	const uint8_t* __restrict__  source,
100 | 	std::size_t                  num_elements_to_copy)
101 | {
102 | 	// TODO: Improve this implementation to use native-word copies as much as possible, just like the 2-byte case
103 | 	if (num_elements_to_copy > 0) {
104 | 		::memcpy(destination, source, num_elements_to_copy * sizeof(uint8_t));
105 | 	}
106 | }
107 | 
108 | } // namespace detail
109 | 
110 | /**
111 |  * Copies some data from one location to another - using the native register
112 |  * size for individual elements on CUDA GPUs, i.e. sizeof(int) = 4
113 |  *
114 |  * @note CUDA's own general-purpose memcpy() takes void pointers and uses a u8 (byte)
115 |  * LD-ST loop. See: @url https://godbolt.org/z/9ChTPM ; this one LD-ST's using the native
116 |  * register size, 4 bytes, when possible.
117 |  *
118 |  * @note this function assumes appropriate alignment.
119 |  *
120 |  * @note Instead of using this function, you're probably better off using a warp-level
121 |  * or block-level primitive for copying data.
122 |  *
123 |  * @param destination Destination of the copy. Must have at least
124 |  * sizeof(T) * {@p num_elements_to_copy} bytes allocated. Data must be self-aligned, i.e. the
125 |  * numeric value of this parameter must be divisible by sizeof(T).
126 |  * @param source The beginning of the memory region from which to copy.
127 |  * There must be sizeof(T) * {@p num_elements_to_copy} bytes readable starting with
128 |  * this address. Data must be self-aligned, i.e. the numeric value of this parameter
129 |  * must be divisible by sizeof(T).
130 |  * @param num_elements_to_copy the number of elements of data to copy - not their
131 |  * total size in bytes!
132 |  * @return the destination pointer
133 |  */
134 | template <bool AssumeSameAlignmentWithinWord = true, typename T>
135 | KAT_FD T* copy(
136 | 	T*        __restrict__  destination,
137 | 	const T*  __restrict__  source,
138 | 	std::size_t             num_elements_to_copy)
139 | {
140 | 	// This function uses the native word size explicitly in a few places, so:
141 | 	static_assert(sizeof(native_word_t) == sizeof(uint32_t), "unexpected size of native word");
142 | 
143 | 	if (not std::is_trivially_copyable<T>::value) {
144 | 		// Can't optimize, must use T::operator=
145 | 		for(std::size_t i = 0; i < num_elements_to_copy; i++) {
146 | 			destination[i] = source[i];
147 | 		}
148 | 		return destination;
149 | 	}
150 | 
151 | 	if (not AssumeSameAlignmentWithinWord) {
152 | 		auto source_misalignment_in_bytes = detail::misalignment_extent(source);
153 | 		auto destination_misalignment_in_bytes = detail::misalignment_extent(destination);
154 | 		if (source_misalignment_in_bytes != destination_misalignment_in_bytes) {
155 | 			// Since the alignments don't match, any read-and-write operation pair
156 | 			// will be unaligned - unless we work on individual bytes.
157 | 
158 | 			if (num_elements_to_copy > 0) {
159 | 				::memcpy(destination, source, num_elements_to_copy * sizeof(T));
160 | 			}
161 | 			return destination;
162 | 			// ... but actually the above claim is not true for the case of a 2-byte size_mod:
163 | 			// if the alignments are, say, 0 and 2, or 3 and 1, then we can at least use a
164 | 			// loop of 2-byte copies. TODO: Implement that.
165 | 		}
166 | 	}
167 | 
168 | 	if (num_elements_to_copy == 0) {
169 | 		return destination;
170 | 	}
171 | 
172 | 	constexpr const auto size_mod_in_bytes { sizeof(T) % sizeof(native_word_t) };
173 | 	constexpr const auto size_gcd_of_T_and_native_word {
174 | 		size_mod_in_bytes == 0 ? 4 : (size_mod_in_bytes == 2 ? 2 : 1)
175 | 	}; // i.e. the GCD of sizeof(T) and the 4-byte native word size
176 | 	using copy_unit_type = detail::uint_t<size_gcd_of_T_and_native_word>;
177 | 	auto num_copy_unit_elements_to_copy = num_elements_to_copy * sizeof(T) / sizeof(copy_unit_type);
178 | 
179 | 	detail::copy(
180 | 		reinterpret_cast<copy_unit_type*>(destination),
181 | 		reinterpret_cast<const copy_unit_type*>(source),
182 | 		num_copy_unit_elements_to_copy
183 | 	);
184 | 	return destination;
185 | }
186 | 
187 | /**
188 |  * @brief Return the number of full warps in a linear grid
189 |  * which would, overall, contain at least a given number of threads.
190 |  *
191 |  * @note This comes in handy more times than you might expect, even in device-side code.
192 |  *
193 |  * @note the reason this function is defined directly, rather than using
194 |  * the functions in math or constexpr_math, is that bit-counting is
195 |  * either slow at run-time on the GPU when you use the constexpr way of
196 |  * doing it, or not constexpr if you use the GPU-side population count
197 |  * instruction.
198 |  */
199 | template <typename I>
200 | constexpr KAT_FHD I num_warp_sizes_to_cover(I number_of_threads)
201 | {
202 | 	static_assert(std::is_integral<I>::value, "Number of threads specified using a non-integral type");
203 | 	enum : I { mask = (warp_size - 1) };
204 | 	enum : I { log_warp_size = 5 } ;
205 | 	return (number_of_threads >> log_warp_size) + ((number_of_threads & mask) != 0);
206 | }
207 | 
208 | } // namespace kat
209 | 
210 | #endif // CUDA_KAT_ON_DEVICE_MISCELLANY_CUH_
211 | 
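
A usage sketch for `kat::copy()`; `payload` is a hypothetical trivially-copyable type (sizeof == 8, so the copy proceeds in 4-byte native words):

    struct payload { int a; short b; short c; };

    __global__ void copy_chunks(payload* out, const payload* in, std::size_t chunk_size)
    {
        auto chunk_index = threadIdx.x + blockIdx.x * blockDim.x;
        kat::copy(out + chunk_index * chunk_size, in + chunk_index * chunk_size, chunk_size);
    }
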
-------------------------------------------------------------------------------- /src/kat/on_device/non-builtins.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/non-builtins.cuh
3 |  *
4 |  * @brief Templated, uniformly-named C++ functions wrapping what should
5 |  * have been single PTX instructions - but aren't (in a dedicated `non_builtins` namespace).
6 |  *
7 |  * There are several functions one would expect to compile into single PTX
8 |  * instructions (similar ones _do_ compile into single PTX instructions,
9 |  * and on the CPU, they themselves often translate to a single machine
10 |  * instruction) - but strangely, they do not. Implementations of such functions
11 |  * are found in this file rather than in @ref `on_device/builtins.cuh`; and they
12 |  * get a different namespace to avoid accidental confusion.
13 |  *
14 |  */
15 | #ifndef CUDA_KAT_ON_DEVICE_NON_BUILTINS_CUH_
16 | #define CUDA_KAT_ON_DEVICE_NON_BUILTINS_CUH_
17 | 
18 | #include 
19 | 
20 | 
21 | ///@cond
22 | #include 
23 | ///@endcond
24 | 
25 | namespace kat {
26 | namespace non_builtins {
27 | 
28 | /**
29 |  * @brief Determine the 1-based index of the first non-zero bit in the argument.
30 |  *
31 |  * @param x the value to be considered as a container of bits
32 |  * @return If @p x is 0, returns 0; otherwise, returns the 1-based index of the
33 |  * first non-zero bit in @p x
34 |  */
35 | template <typename I> KAT_FD int find_first_set(I x)
36 | {
37 | 	static_assert(std::is_integral<I>::value, "Only integral types are supported");
38 | 	static_assert(sizeof(I) <= sizeof(long long), "Unexpectedly large type");
39 | 
40 | 	using ffs_type = typename std::conditional< sizeof(I) <= sizeof(int), int, long long >::type;
41 | 	return find_first_set<ffs_type>(x);
42 | }
43 | template <> KAT_FD int find_first_set< int >(int x) { return __ffs(x); }
44 | template <> KAT_FD int find_first_set< long long >(long long x) { return __ffsll(x); }
45 | 
46 | /**
47 |  * @brief counts the number of initial zeros when considering the binary representation
48 |  * of a number from least to most significant digit
49 |  *
50 |  * @tparam FixSemanticsForZero the simpler implementation of this function uses the
51 |  * @ref `find_first_set()` builtin. Unfortunately, with it alone, an input with no
52 |  * bits set yields -1 rather than the type's full number of bits. Fixing this requires
53 |  * a couple of extra instructions. By default, we apply the fix; but one might be
54 |  * interested in just skipping it, taking -1 instead of 32 (= warp_size) for the no-1's case.
55 |  *
56 |  * @param x the number whose binary representation is to be counted
57 |  * @return the number of initial zero bits before the first 1; if x is 0, the full
58 |  * number of bits is returned (or -1, depending on @tparam FixSemanticsForZero).
59 |  */
60 | template <typename I, bool FixSemanticsForZero = true>
61 | KAT_FD int count_trailing_zeros(I x)
62 | {
63 | 	if (FixSemanticsForZero and x == 0) {
64 | 		return size_in_bits<I>();
65 | 	}
66 | 	return find_first_set(x) - 1;
67 | }
68 | 
69 | /**
70 |  * @brief counts the number of initial zeros when considering the binary representation
71 |  * of a number from most to least significant digit
72 |  * @param x the number whose representation is to be counted
73 |  * @return the counted number of 0 bits; if x is 0, the full number of bits is returned
74 |  */
75 | template <typename I> KAT_FD int count_leading_zeros(I x)
76 | {
77 | 	static_assert(std::is_integral<I>::value, "Only integral types are supported");
78 | 	static_assert(sizeof(I) <= sizeof(long long), "Unexpectedly large type");
79 | 
80 | 	using native_clz_type =
81 | 		typename std::conditional< sizeof(I) <= sizeof(int), int, long long >::type;
82 | 	enum : int { width_difference_in_bits = (sizeof(native_clz_type) - sizeof(I)) * CHAR_BIT };
83 | 	return builtins::count_leading_zeros(static_cast<native_clz_type>(x)) - width_difference_in_bits;
84 | }
85 | 
86 | } // namespace non_builtins
87 | } // namespace kat
88 | 
89 | #endif // CUDA_KAT_ON_DEVICE_NON_BUILTINS_CUH_
90 | 
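
The semantics above, on a concrete value (a sketch):

    __device__ void example()
    {
        int f = kat::non_builtins::find_first_set(0x8);       // == 4: 1-based index of the set bit
        int t = kat::non_builtins::count_trailing_zeros(0x8); // == 3
        int z = kat::non_builtins::count_trailing_zeros(0);   // == 32, thanks to the semantics fix
    }
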
-------------------------------------------------------------------------------- /src/kat/on_device/ptx.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/ptx.cuh
3 |  *
4 |  * @brief Wrapper functions for single PTX instructions --- using inline PTX
5 |  * assembly --- which are not already available in the official CUDA includes
6 |  *
7 |  * CUDA provides many "intrinsics" functions, which wrap single PTX instructions,
8 |  * e.g. `__ldg` or `__funnelshift_l` from `sm_32_intrinsics.h`. But - CUDA
9 |  * doesn't provide such functions for all of the PTX instruction set. The
10 |  * files included from this master-include contain such single-line assembly
11 |  * wrapper functions for different categories of missing PTX instructions.
12 |  *
13 |  * @note Unlike @ref `on_device/builtins.cuh`, functions here are not
14 |  * templated, and do not necessarily have the same name for different
15 |  * parameter types. `on_device/builtins.cuh` functions do _use_ PTX wrapper
16 |  * functions as their implementation.
17 |  */
18 | 
19 | #pragma once
20 | #ifndef CUDA_KAT_ON_DEVICE_PTX_CUH_
21 | #define CUDA_KAT_ON_DEVICE_PTX_CUH_
22 | 
23 | #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 300)
24 | #error "This code can only target devices of compute capability 3.0 or higher."
25 | #endif
26 | 
27 | namespace kat {
28 | 
29 | /**
30 |  * @brief Code exposing CUDA's PTX intermediate representation instructions
31 |  * to C++ code.
32 |  *
33 |  * With CUDA, device-side code is compiled from a C++-like language to an
34 |  * intermediate representation (IR), which is not supported directly by any
35 |  * GPU, but from which it is easy to compile code for each concrete GPU
36 |  * architecture.
37 |  *
38 |  * Occasionally, a developer wants to use a specific PTX instruction - e.g.
39 |  * to optimize some code. CUDA's headers expose some of the opcodes for these
40 |  * instructions - but not all of them. Also, the exposed instructions are
41 |  * not templated on the arguments - while PTX instructions _are_ thus
42 |  * templated. These two gaps are filled by this library.
43 |  */
44 | namespace ptx { }
45 | 
46 | } // namespace kat
47 | 
48 | #include "ptx/special_registers.cuh"
49 | #include "ptx/miscellany.cuh"
50 | #include "ptx/video_instructions.cuh"
51 | 
52 | #endif // CUDA_KAT_ON_DEVICE_PTX_CUH_
53 | 
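
With the master-include in place, device code can invoke the wrappers directly; e.g. (a sketch - `bfind()` is defined in ptx/miscellany.cuh below):

    __device__ uint32_t index_of_highest_set_bit(uint32_t x)
    {
        return kat::ptx::bfind(x); // 0xFFFFFFFF if x has no set bits
    }
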
-------------------------------------------------------------------------------- /src/kat/on_device/ptx/detail/define_macros.cuh: --------------------------------------------------------------------------------
1 | /*
2 |  * Notes:
3 |  *
4 |  * - Prefer including ../ptx.cuh rather than this file directly
5 |  * - Including this file "pollutes" the rest of your code with preprocessor
6 |  *   macros you may not want. To get rid of them, include
7 |  *   detail/undefine_macros.cuh afterwards
8 |  */
9 | 
10 | #ifndef PTX_UTILITY_MACROS_DEFINED
11 | #define PTX_UTILITY_MACROS_DEFINED
12 | 
13 | #include <stdint.h> // for uintXX_t types
14 | 
15 | #define PTX_STRINGIFY(_q) #_q
16 | 
17 | // Mnemonic: "h" for half, "r" for regular, "l" for long, "f" and "d" for float and double
18 | #define SIZE_CONSTRAINT_s16 "h"
19 | #define SIZE_CONSTRAINT_u16 "h"
20 | #define SIZE_CONSTRAINT_s32 "r"
21 | #define SIZE_CONSTRAINT_u32 "r"
22 | #define SIZE_CONSTRAINT_s64 "l"
23 | #define SIZE_CONSTRAINT_u64 "l"
24 | #define SIZE_CONSTRAINT_f32 "f"
25 | #define SIZE_CONSTRAINT_f64 "d"
26 | 
27 | /*
28 |  * In PTX inline assembly, every variable name must be preceded by a string indicating its size.
29 |  * Why, you might ask - if the variable _has_ a size, which the compiler knows? Just because.
30 |  * This maps your PTX-style type to its appropriate size indicator string
31 |  */
32 | #define SIZE_CONSTRAINT(ptx_value_type) SIZE_CONSTRAINT_ ## ptx_value_type
33 | 
34 | /*
35 |  * Always use this as (part of) the
36 |  * constraint string for pointer arguments to PTX inline assembly instructions
37 |  * (see http://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints)
38 |  */
39 | #if defined(_WIN64) || defined(__LP64__)
40 | #define PTR_SIZE_CONSTRAINT SIZE_CONSTRAINT(u64)
41 | #else
42 | #define PTR_SIZE_CONSTRAINT SIZE_CONSTRAINT(u32)
43 | #endif
44 | 
45 | #define CPP_TYPE_BY_PTX_TYPE_s16 int16_t
46 | #define CPP_TYPE_BY_PTX_TYPE_s32 int32_t
47 | #define CPP_TYPE_BY_PTX_TYPE_s64 int64_t
48 | #define CPP_TYPE_BY_PTX_TYPE_u16 uint16_t
49 | #define CPP_TYPE_BY_PTX_TYPE_u32 uint32_t
50 | #define CPP_TYPE_BY_PTX_TYPE_u64 uint64_t
51 | #define CPP_TYPE_BY_PTX_TYPE_f32 float
52 | #define CPP_TYPE_BY_PTX_TYPE_f64 double
53 | 
54 | /*
55 |  * In our PTX wrappers, we need to declare function parameters and local variables
56 |  * based on PTX-style types; this is a mechanism for obtaining the corresponding C++
57 |  * type (at the preprocessor level).
58 |  */
59 | #define CPP_TYPE_BY_PTX_TYPE(ptx_value_type) CPP_TYPE_BY_PTX_TYPE_ ## ptx_value_type
60 | 
61 | #define MAKE_UNSIGNED_s16 u16
62 | #define MAKE_UNSIGNED_s32 u32
63 | #define MAKE_UNSIGNED_s64 u64
64 | #define MAKE_UNSIGNED_u16 u16
65 | #define MAKE_UNSIGNED_u32 u32
66 | #define MAKE_UNSIGNED_u64 u64
67 | 
68 | /*
69 |  * This converts specifiers of signed PTX types into their unsigned equivalents,
70 |  * textually.
71 |  */
72 | #define MAKE_UNSIGNED(ptx_value_type) MAKE_UNSIGNED_ ## ptx_value_type
73 | 
74 | #endif // PTX_UTILITY_MACROS_DEFINED
75 | 
-------------------------------------------------------------------------------- /src/kat/on_device/ptx/detail/undefine_macros.cuh: --------------------------------------------------------------------------------
1 | /*
2 |  * Notes:
3 |  *
4 |  * - Try not to use this file directly, but rather just ../ptx.cuh
5 |  * - If you are using this file directly - include it after having
6 |  *   included define_macros.cuh - and any other include files which
7 |  *   use these utilities.
8 |  */
9 | 
10 | #ifdef PTX_UTILITY_MACROS_DEFINED
11 | 
12 | #undef PTR_SIZE_CONSTRAINT
13 | #undef CPP_TYPE_BY_PTX_TYPE
14 | #undef CPP_TYPE_BY_PTX_TYPE_s16
15 | #undef CPP_TYPE_BY_PTX_TYPE_s32
16 | #undef CPP_TYPE_BY_PTX_TYPE_s64
17 | #undef CPP_TYPE_BY_PTX_TYPE_u16
18 | #undef CPP_TYPE_BY_PTX_TYPE_u32
19 | #undef CPP_TYPE_BY_PTX_TYPE_u64
20 | #undef CPP_TYPE_BY_PTX_TYPE_f32
21 | #undef CPP_TYPE_BY_PTX_TYPE_f64
22 | #undef SIZE_CONSTRAINT
23 | #undef SIZE_CONSTRAINT_s16
24 | #undef SIZE_CONSTRAINT_s32
25 | #undef SIZE_CONSTRAINT_s64
26 | #undef SIZE_CONSTRAINT_u16
27 | #undef SIZE_CONSTRAINT_u32
28 | #undef SIZE_CONSTRAINT_u64
29 | #undef SIZE_CONSTRAINT_f32
30 | #undef SIZE_CONSTRAINT_f64
31 | #undef MAKE_UNSIGNED
32 | #undef MAKE_UNSIGNED_s16
33 | #undef MAKE_UNSIGNED_s32
34 | #undef MAKE_UNSIGNED_s64
35 | #undef MAKE_UNSIGNED_u16
36 | #undef MAKE_UNSIGNED_u32
37 | #undef MAKE_UNSIGNED_u64
38 | 
39 | #undef PTX_STRINGIFY
40 | 
41 | #undef PTX_UTILITY_MACROS_DEFINED
42 | 
43 | #endif // PTX_UTILITY_MACROS_DEFINED
-------------------------------------------------------------------------------- /src/kat/on_device/ptx/miscellany.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file ptx/miscellany.cuh Non-templated wrappers for PTX instructions, which nVIDIA
3 |  * does not provide wrappers for through the CUDA intrinsics headers.
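 *
 * All of the wrappers below follow one pattern, built from the macros in
 * detail/define_macros.cuh; as a minimal sketch of that pattern (using the real
 * PTX instruction `brev`, bit reversal, purely for illustration):
 * @code
 * KAT_FD uint32_t bit_reverse(uint32_t x)
 * {
 *     uint32_t result;
 *     asm("brev.b32 %0, %1;" : "=" SIZE_CONSTRAINT(u32) (result) : SIZE_CONSTRAINT(u32) (x));
 *     return result;
 * }
 * @endcode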
4 | */ 5 | #pragma once 6 | #ifndef CUDA_KAT_ON_DEVICE_PTX_MISCELLANY_CUH_ 7 | #define CUDA_KAT_ON_DEVICE_PTX_MISCELLANY_CUH_ 8 | 9 | #include "detail/define_macros.cuh" 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | ///@cond 18 | #include 19 | ///@endcond 20 | 21 | namespace kat { 22 | 23 | namespace ptx { 24 | 25 | /** 26 | * @brief Aborts execution (of the entire kernel grid) and generates an interrupt to the host CPU. 27 | */ 28 | KAT_FD void trap() 29 | { 30 | asm("trap;"); 31 | } 32 | 33 | /** 34 | * Ends execution of the current thread of this kernel/grid 35 | */ 36 | KAT_FD void exit() 37 | { 38 | asm("exit;"); 39 | } 40 | 41 | /** 42 | * See relevant section 43 | * of the CUDA PTX reference for details on these instructions. 44 | */ 45 | #define DEFINE_IS_IN_MEMORY_SPACE(_which_space) \ 46 | KAT_FD int32_t is_in_ ## _which_space ## _memory (const void *ptr) \ 47 | { \ 48 | int32_t result; \ 49 | asm ("{\n\t" \ 50 | ".reg .pred p;\n\t" \ 51 | "isspacep." PTX_STRINGIFY(_which_space) " p, %1;\n\t" \ 52 | "selp.b32 %0, 1, 0, p;\n\t" \ 53 | "}" \ 54 | : "=r"(result) : PTR_SIZE_CONSTRAINT(ptr)); \ 55 | return result; \ 56 | } 57 | 58 | DEFINE_IS_IN_MEMORY_SPACE(const) // is_in_const_memory 59 | DEFINE_IS_IN_MEMORY_SPACE(global) // is_in_global_memory 60 | DEFINE_IS_IN_MEMORY_SPACE(local) // is_in_local_memory 61 | DEFINE_IS_IN_MEMORY_SPACE(shared) // is_in_shared_memory 62 | 63 | #undef DEFINE_IS_IN_MEMORY_SPACE 64 | 65 | /* 66 | * @brief Find the last non-sign bit in a signed or an unsigned integer value 67 | * 68 | * @note See relevant section 69 | * of the CUDA PTX reference for details on this instruction. 70 | * 71 | * @param val the value in which to find non-sign bits 72 | * @return the bit index (counting from least significant bit being 0) of the first 73 | * bit which is 0 if @p val is positive, or of the first bit which is 1 if @p val is negative. If @p val has only 74 | * sign bits (i.e. if it's 0 or if its type is signed and its bits are all 1) - the value 0xFFFFFFFF (-1) is returned 75 | */ 76 | 77 | #define DEFINE_BFIND(ptx_type) \ 78 | KAT_FD uint32_t \ 79 | bfind(CPP_TYPE_BY_PTX_TYPE(ptx_type) val) \ 80 | { \ 81 | uint32_t ret; \ 82 | asm ( \ 83 | "bfind." PTX_STRINGIFY(ptx_type) " %0, %1;" \ 84 | : "=r"(ret) : SIZE_CONSTRAINT(ptx_type) (val)); \ 85 | return ret; \ 86 | } 87 | 88 | DEFINE_BFIND(s32) // bfind 89 | DEFINE_BFIND(s64) // bfind 90 | DEFINE_BFIND(u32) // bfind 91 | DEFINE_BFIND(u64) // bfind 92 | 93 | #undef DEFINE_BFIND 94 | 95 | #define DEFINE_PRMT_WITH_MODE(selection_mode_name, selection_mode) \ 96 | KAT_FD uint32_t prmt_ ## selection_mode_name (uint32_t first, uint32_t second, uint32_t control_bits) \ 97 | { \ 98 | uint32_t result; \ 99 | asm("prmt.b32." 
PTX_STRINGIFY(selection_mode) " %0, %1, %2, %3;" \
100 | 		: "=r"(result) : "r"(first), "r"(second), "r"(control_bits)); \
101 | 	return result; \
102 | }
103 | 
104 | /*
105 |  * See:
106 |  * @url http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
107 |  * for information about these instructions
108 |  */
109 | DEFINE_PRMT_WITH_MODE( forward_4_extract,  f4e  ) // prmt_forward_4_extract
110 | DEFINE_PRMT_WITH_MODE( backward_4_extract, b4e  ) // prmt_backward_4_extract
111 | DEFINE_PRMT_WITH_MODE( replicate_8,        rc8  ) // prmt_replicate_8
112 | DEFINE_PRMT_WITH_MODE( replicate_16,       rc16 ) // prmt_replicate_16
113 | DEFINE_PRMT_WITH_MODE( edge_clamp_left,    ecl  ) // prmt_edge_clamp_left
114 | DEFINE_PRMT_WITH_MODE( edge_clamp_right,   ecr  ) // prmt_edge_clamp_right
115 | 
116 | 
117 | /**
118 |  * @brief See: the relevant section
119 |  * of the CUDA PTX reference for an explanation of what this does exactly
120 |  *
121 |  * @param first a first value from which to potentially use bytes
122 |  * @param second a second value from which to potentially use bytes
123 |  * @param byte_selectors a packing of 4 selector structures; each selector structure
124 |  * is 3 bits specifying which of the input bytes are to be used (as there are 8
125 |  * bytes overall in @p first and @p second ), and another bit specifying if it's an
126 |  * actual copy of a byte, or instead whether the sign of the byte (interpreted as
127 |  * an int8_t) should be replicated to fill the target byte.
128 |  * @return the four bytes of first and/or second, or replicated signs thereof, indicated by the byte selectors
129 |  *
130 |  * @note Only the lower 16 bits of byte_selectors are used.
131 |  * @note "prmt" stands for "permute"
132 |  */
133 | KAT_FD uint32_t prmt(uint32_t first, uint32_t second, uint32_t byte_selectors)
134 | {
135 | 	uint32_t result;
136 | 	asm("prmt.b32 %0, %1, %2, %3;"
137 | 		: "=r"(result) : "r"(first), "r"(second), "r"(byte_selectors));
138 | 	return result;
139 | }
140 | 
141 | 
142 | /**
143 |  * @brief Extracts the bits with 0-based indices start_pos...start_pos+length-1, counting
144 |  * from least to most significant, from a bit field. Has sign-extension semantics
145 |  * for signed inputs, which are a bit tricky - see the PTX ISA guide:
146 |  *
147 |  * http://docs.nvidia.com/cuda/parallel-thread-execution/index.html
148 |  *
149 |  * TODO: CUB 1.5.2's BFE wrapper seems kind of fishy. Why does Duane Merrill not use PTX for extraction from 64-bit fields?
150 |  * I'll take a different route.
151 |  */
152 | #define DEFINE_BFE(ptx_type) \
153 | KAT_FD CPP_TYPE_BY_PTX_TYPE(ptx_type) \
154 | bfe( \
155 | 	CPP_TYPE_BY_PTX_TYPE(ptx_type)  bits, \
156 | 	uint32_t                        start_position, \
157 | 	uint32_t                        num_bits) \
158 | { \
159 | 	CPP_TYPE_BY_PTX_TYPE(ptx_type)  extracted_bits; \
160 | 	asm ( \
161 | 		"bfe." 
PTX_STRINGIFY(ptx_type) " %0, %1, %2, %3;" \ 162 | : "=" SIZE_CONSTRAINT(ptx_type) (extracted_bits) \ 163 | : SIZE_CONSTRAINT(ptx_type) (bits) \ 164 | , "r" (start_position) \ 165 | , "r" (num_bits) \ 166 | );\ 167 | return extracted_bits; \ 168 | } 169 | 170 | DEFINE_BFE(s32) // bfe 171 | DEFINE_BFE(s64) // bfe 172 | DEFINE_BFE(u32) // bfe 173 | DEFINE_BFE(u64) // bfe 174 | 175 | #undef DEFINE_BFE 176 | 177 | KAT_FD uint32_t 178 | bfi( 179 | uint32_t bits_to_insert, 180 | uint32_t existing_bit_field, 181 | uint32_t start_position, 182 | uint32_t num_bits) 183 | { 184 | uint32_t ret; 185 | asm ( 186 | "bfi.b32 %0, %1, %2, %3, %4;" 187 | : "=r"(ret) 188 | : "r"(bits_to_insert) 189 | , "r"(existing_bit_field) 190 | , "r"(start_position) 191 | , "r"(num_bits) 192 | ); 193 | return ret; 194 | } 195 | 196 | KAT_FD uint64_t 197 | bfi( 198 | uint64_t bits_to_insert, 199 | uint64_t existing_bit_field, 200 | uint32_t start_position, 201 | uint32_t num_bits) 202 | { 203 | uint64_t ret; 204 | asm ( 205 | "bfi.b64 %0, %1, %2, %3, %4;" 206 | : "=l"(ret) 207 | : "l"(bits_to_insert) 208 | , "l"(existing_bit_field) 209 | , "r"(start_position) 210 | , "r"(num_bits) 211 | ); 212 | return ret; 213 | } 214 | 215 | /** 216 | * @brief Adds the absolute difference of two values to a base value 217 | * 218 | * @param x value from which to subtract @p y 219 | * @param y value to subtract from @p x 220 | * @param addend base value to which to add `|x-y|` 221 | * 222 | * @return `addend + |x - y|` 223 | */ 224 | #define DEFINE_SAD(ptx_type_1, unsigned_ptx_type_1) \ 225 | KAT_FD CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) sad( \ 226 | CPP_TYPE_BY_PTX_TYPE(ptx_type_1) x, \ 227 | CPP_TYPE_BY_PTX_TYPE(ptx_type_1) y, \ 228 | CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) addend) \ 229 | { \ 230 | CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) result; \ 231 | asm ( \ 232 | "sad." PTX_STRINGIFY(ptx_type_1) " %0, %1, %2, %3;" \ 233 | : "=" SIZE_CONSTRAINT(unsigned_ptx_type_1) (result) \ 234 | : SIZE_CONSTRAINT(ptx_type_1) (x) \ 235 | , SIZE_CONSTRAINT(ptx_type_1) (y) \ 236 | , SIZE_CONSTRAINT(unsigned_ptx_type_1) (addend) \ 237 | );\ 238 | return result; \ 239 | } 240 | 241 | #define DEFINE_SAD_(x) DEFINE_SAD(x, MAKE_UNSIGNED(x)); 242 | DEFINE_SAD_(u16); 243 | DEFINE_SAD_(u32); 244 | DEFINE_SAD_(u64); 245 | DEFINE_SAD_(s16); 246 | DEFINE_SAD_(s32); 247 | DEFINE_SAD_(s64); 248 | 249 | #undef DEFINE_SAD_ 250 | #undef DEFINE_SAD 251 | 252 | } // namespace ptx 253 | } // namespace kat 254 | 255 | 256 | #include "detail/undefine_macros.cuh" 257 | 258 | #endif // CUDA_KAT_ON_DEVICE_PTX_MISCELLANY_CUH_ 259 | 260 | -------------------------------------------------------------------------------- /src/kat/on_device/ptx/special_registers.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file kat/on_device/ptx/special_registers.cuh 3 | * 4 | * @brief PTX instruction wrapper functions for accessing special on-GPU-core registers. 5 | */ 6 | #pragma once 7 | #ifndef CUDA_KAT_PTX_SPECIAL_REGISTERS_CUH_ 8 | #define CUDA_KAT_PTX_SPECIAL_REGISTERS_CUH_ 9 | 10 | #include "detail/define_macros.cuh" 11 | 12 | namespace kat { 13 | namespace ptx { 14 | 15 | /** 16 | * @brief Wrappers for instructions obtaining the value of one of the special hardware registers on nVIDIA GPUs. 17 | * 18 | * See the relevant section 19 | * of the PTX instruction set guide for more details. 
20 |  */
21 | namespace special_registers {
22 | 
23 | 
24 | #define DEFINE_SPECIAL_REGISTER_GETTER(special_register_name, ptx_value_type) \
25 | KAT_FD CPP_TYPE_BY_PTX_TYPE(ptx_value_type) special_register_name() \
26 | { \
27 | 	CPP_TYPE_BY_PTX_TYPE(ptx_value_type) ret; \
28 | 	asm volatile ("mov." PTX_STRINGIFY(ptx_value_type) " %0, %%" PTX_STRINGIFY(special_register_name) ";" : "=" SIZE_CONSTRAINT(ptx_value_type) (ret)); \
29 | 	return ret; \
30 | } \
31 | 
32 | DEFINE_SPECIAL_REGISTER_GETTER( laneid,             u32); // PTX 1.3
33 | DEFINE_SPECIAL_REGISTER_GETTER( gridid,             u64); // PTX 3.0
34 | DEFINE_SPECIAL_REGISTER_GETTER( smid,               u32); // PTX 1.3
35 | DEFINE_SPECIAL_REGISTER_GETTER( nsmid,              u32); // PTX 2.0
36 | DEFINE_SPECIAL_REGISTER_GETTER( clock,              u32); // PTX 1.0
37 | DEFINE_SPECIAL_REGISTER_GETTER( clock_hi,           u32); // PTX 5.0
38 | DEFINE_SPECIAL_REGISTER_GETTER( clock64,            u64); // PTX 2.0
39 | DEFINE_SPECIAL_REGISTER_GETTER( globaltimer_hi,     u32); // PTX 3.1
40 | DEFINE_SPECIAL_REGISTER_GETTER( globaltimer_lo,     u32); // PTX 3.1
41 | DEFINE_SPECIAL_REGISTER_GETTER( globaltimer,        u64); // PTX 3.1
42 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_lt,        u32); // PTX 2.0
43 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_le,        u32); // PTX 2.0
44 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_eq,        u32); // PTX 2.0
45 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_ge,        u32); // PTX 2.0
46 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_gt,        u32); // PTX 2.0
47 | DEFINE_SPECIAL_REGISTER_GETTER( dynamic_smem_size,  u32); // PTX 4.1
48 | DEFINE_SPECIAL_REGISTER_GETTER( total_smem_size,    u32); // PTX 4.1
49 | 
50 | #undef DEFINE_SPECIAL_REGISTER_GETTER
51 | 
52 | 
53 | /*
54 |  * Not defining getters for:
55 |  *
56 |  *   %tid                      - available as threadIdx
57 |  *   %ntid                     - available as blockDim
58 |  *   %warpid                   - not interesting
59 |  *   %nwarpid                  - not interesting
60 |  *   %ctaid                    - available as blockIdx
61 |  *   %nctaid                   - available as gridDim
62 |  *   %pm0, ..., %pm7           - not interesting, for now (performance monitoring)
63 |  *   %pm0_64, ..., %pm7_64     - not interesting, for now (performance monitoring)
64 |  *   %envreg0, ..., %envreg31  - not interesting, for now
65 |  */
66 | 
67 | 
68 | } // namespace special_registers
69 | 
70 | } // namespace ptx
71 | 
72 | } // namespace kat
73 | 
74 | #include "detail/undefine_macros.cuh"
75 | 
76 | 
77 | #endif // CUDA_KAT_PTX_SPECIAL_REGISTERS_CUH_
78 | 
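
A sketch of the kind of idiom these getters enable (function name illustrative only):

    __device__ unsigned num_preceding_active_lanes(unsigned active_mask)
    {
        // Count the active lanes in this warp whose index is lower than ours
        return __popc(active_mask & kat::ptx::special_registers::lanemask_lt());
    }
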
-------------------------------------------------------------------------------- /src/kat/on_device/ptx/video_instructions.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file ptx/video_instructions.cuh Non-templated wrappers for PTX "video"
3 |  * instructions, which nVIDIA does not provide wrappers for through the CUDA
4 |  * intrinsics headers
5 |  *
6 |  * "Video" instructions are not really about video (although they're probably used
7 |  * for video somehow). Essentially they're instructions which combine another
8 |  * operation, and another operand, after the main one; additionally, they offer
9 |  * variants with all sorts of saturation, wraparound, sign-extension and similar
10 |  * bells and whistles.
11 |  *
12 |  * These instructions (at least, the "scalar" ones) are:
13 |  *
14 |  *
15 |  *   vadd     - addition
16 |  *   vsub     - subtraction
17 |  *   vabsdiff - absolute difference
18 |  *   vmin     - minimum
19 |  *   vmax     - maximum
20 |  *   vshl     - shift left
21 |  *   vshr     - shift right
22 |  *   vmad     - multiply-and-add
23 |  *   vset     - comparison
24 |  *
25 |  * For now, we won't implement most of these instructions, and even for the ones
26 |  * we do implement - we'll only choose some of the variants.
27 |  */
28 | #pragma once
29 | #ifndef CUDA_KAT_PTX_VIDEO_INSTRUCTIONS_CUH_
30 | #define CUDA_KAT_PTX_VIDEO_INSTRUCTIONS_CUH_
31 | 
32 | #include "detail/define_macros.cuh"
33 | #include 
34 | #include 
35 | 
36 | 
37 | ///@cond
38 | #include 
39 | ///@endcond
40 | 
41 | namespace kat {
42 | namespace ptx {
43 | 
44 | /**
45 |  * @brief bit shift, then apply a binary operator.
46 |  *
47 |  */
48 | #define DEFINE_SHIFT_AND_OP(direction, second_op) \
49 | KAT_FD uint32_t \
50 | vsh##direction##_##second_op ( \
51 | 	uint32_t x, \
52 | 	uint32_t shift_amount, \
53 | 	uint32_t extra_operand) \
54 | { \
55 | 	uint32_t ret; \
56 | 	asm ("vsh" PTX_STRINGIFY(direction) ".u32.u32.u32.clamp." PTX_STRINGIFY(second_op) " %0, %1, %2, %3;" \
57 | 		: "=r"(ret) \
58 | 		: "r"(x) \
59 | 		, "r"(shift_amount) \
60 | 		, "r"(extra_operand) \
61 | 	); \
62 | 	return ret; \
63 | }
64 | 
65 | DEFINE_SHIFT_AND_OP(l,add) // vshl_add
66 | DEFINE_SHIFT_AND_OP(l,min) // vshl_min
67 | DEFINE_SHIFT_AND_OP(l,max) // vshl_max
68 | DEFINE_SHIFT_AND_OP(r,add) // vshr_add
69 | DEFINE_SHIFT_AND_OP(r,min) // vshr_min
70 | DEFINE_SHIFT_AND_OP(r,max) // vshr_max
71 | 
72 | 
73 | } // namespace ptx
74 | } // namespace kat
75 | 
76 | 
77 | #include "detail/undefine_macros.cuh"
78 | 
79 | #endif // CUDA_KAT_PTX_VIDEO_INSTRUCTIONS_CUH_
80 | 
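
What a wrapper like `vshl_add()` buys you, sketched (the `.clamp` variant clamps an oversized shift amount rather than wrapping it; function name illustrative only):

    __device__ uint32_t scale_and_offset(uint32_t x, uint32_t log_scale, uint32_t offset)
    {
        return kat::ptx::vshl_add(x, log_scale, offset); // (x << log_scale) + offset, in one instruction
    }
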
-------------------------------------------------------------------------------- /src/kat/on_device/sequence_ops/common.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/sequence_ops/common.cuh
3 |  *
4 |  * @brief Some common definitions for all on-device collaborative sequence operations
5 |  */
6 | 
7 | #ifndef CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_COMMON_CUH_
8 | #define CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_COMMON_CUH_
9 | 
10 | #include 
11 | #include 
12 | 
13 | namespace kat {
14 | namespace collaborative {
15 | 
16 | enum inclusivity_t : bool {
17 | 	Exclusive = false,
18 | 	Inclusive = true
19 | };
20 | 
21 | namespace detail {
22 | 
23 | /**
24 |  * In a "full warp write", we want each lane to write an integral number of
25 |  * native words (at the moment, and for the foreseeable future, 4-byte integers).
26 |  * At the same time, the lane writes complete elements of type T, not arbitrary
27 |  * sequences of `sizeof(native_word_t)` bytes - hence this definition.
28 |  *
29 |  * @todo: Can't we assume that T is a POD type, and just have lanes not write
30 |  * complete T's?
31 |  */
32 | template <typename T>
33 | struct elements_per_lane_in_full_warp_write {
34 | 	enum { value = sizeof(native_word_t) / constexpr_::gcd(sizeof(native_word_t),sizeof(T)) };
35 | };
36 | } // namespace detail
37 | 
38 | } // namespace collaborative
39 | } // namespace kat
40 | 
41 | #endif // CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_COMMON_CUH_
42 | 
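
Worked through: with 4-byte native words, a 2-byte T gives gcd(4,2) == 2, so each lane handles 4/2 == 2 elements per full-warp write; a 4-byte T gives exactly one. A sketch of the arithmetic (assuming the header is included):

    static_assert(kat::collaborative::detail::elements_per_lane_in_full_warp_write<int16_t>::value == 2, "");
    static_assert(kat::collaborative::detail::elements_per_lane_in_full_warp_write<int32_t>::value == 1, "");
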
-------------------------------------------------------------------------------- /src/kat/on_device/sequence_ops/grid.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/sequence_ops/grid.cuh
3 |  *
4 |  * @brief CUDA device computation grid-level sequence operations, i.e. those involving
5 |  * interaction of threads from different blocks in the grid
6 |  *
7 |  */
8 | 
9 | #pragma once
10 | #ifndef CUDA_KAT_GRID_COLLABORATIVE_SEQUENCE_OPS_CUH_
11 | #define CUDA_KAT_GRID_COLLABORATIVE_SEQUENCE_OPS_CUH_
12 | 
13 | #include "common.cuh"
14 | #include 
15 | #include 
16 | 
17 | ///@cond
18 | #include 
19 | ///@endcond
20 | 
21 | namespace kat {
22 | namespace collaborative {
23 | namespace warp_to_grid {
24 | 
25 | /**
26 |  * Used by multiple warps, in multiple blocks, with each warp having
27 |  * a bunch of data it has obtained, and all warps' data to be
28 |  * chained into a global-memory vector - with no gaps and no
29 |  * overwriting (but not necessarily in the order of warps - just any
30 |  * order.)
31 |  *
32 |  * @note if the input is not 32-byte (sometimes 128-byte) aligned,
33 |  * and more importantly, the output is not 128-byte-aligned,
34 |  * performance will likely degrade due to the need to execute a pair
35 |  * of memory transactions for every single 32 x 4 byte write.
36 |  *
37 |  * @note this must be called by complete warps, with all lanes
38 |  * active and participating. But it does _not_ - for the time
39 |  * being - have to be called by complete blocks.
40 |  *
41 |  * @tparam T the type of data elements being copied
42 |  * @tparam Size must fit any index used into the input or output array;
43 |  * for the general case it would be 64-bit, but this is
44 |  * usable also for when you need 32-bit work (e.g. a 32-bit length
45 |  * output variable).
46 |  * @param global_output The global-memory vector to which fragments are appended
47 |  * @param global_output_length The current length of @p global_output, in elements - updated atomically
48 |  * @param fragment_to_append This warp's data, to be appended as one contiguous stretch
49 |  * @param fragment_length The number of elements in @p fragment_to_append
50 |  */
51 | template <typename T, typename Size>
52 | KAT_FD void append_to_global_memory(
53 | 	T*     __restrict__  global_output,
54 | 	Size*  __restrict__  global_output_length,
55 | 	T*     __restrict__  fragment_to_append,
56 | 	Size                 fragment_length)
57 | {
58 | 	using namespace grid_info;
59 | 	Size previous_output_size = thread::is_first_in_warp() ?
60 | 		atomic::add(global_output_length, fragment_length) : 0;
61 | 	Size offset_to_start_writing_at = collaborative::warp::get_from_first_lane(
62 | 		previous_output_size);
63 | 
64 | 	// Now the (0-based) positions
65 | 	//     previous_output_size ... previous_output_size + fragment_length - 1
66 | 	// are reserved by this warp; nobody else will write there and we don't need
67 | 	// any more atomics
68 | 
69 | 	enum : bool { may_have_slack = true };
70 | 
71 | 	if (detail::elements_per_lane_in_full_warp_write<T>::value > 1) {
72 | 		// We don't have a version of copy which handles unaligned destinations, so
73 | 		warp::detail::naive_copy(global_output + offset_to_start_writing_at,
74 | 			fragment_to_append, fragment_length);
75 | 	}
76 | 	else {
77 | 		warp::copy_n<T, Size, may_have_slack>(
78 | 			global_output + offset_to_start_writing_at,
79 | 			fragment_to_append, fragment_length);
80 | 	}
81 | }
82 | 
83 | } // namespace warp_to_grid
84 | } // namespace collaborative
85 | } // namespace kat
86 | 
87 | #endif // CUDA_KAT_GRID_COLLABORATIVE_SEQUENCE_OPS_CUH_
88 | 
-------------------------------------------------------------------------------- /src/kat/on_device/shared_memory.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/shared_memory.cuh
3 |  *
4 |  * @brief Utility code for working with (mostly dynamic) shared memory
5 |  * in device-side CUDA functions.
6 |  */
7 | 
8 | #ifndef CUDA_KAT_ON_DEVICE_SHARED_MEMORY_CUH_
9 | #define CUDA_KAT_ON_DEVICE_SHARED_MEMORY_CUH_
10 | 
11 | #include "shared_memory/basic.cuh"
12 | #include "shared_memory/operations.cuh"
13 | 
14 | #endif // CUDA_KAT_ON_DEVICE_SHARED_MEMORY_CUH_
15 | 
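
A call-site sketch for `append_to_global_memory()` (warp fragments held in shared memory; the kernel name, sizes and fill logic are all hypothetical):

    enum { warps_per_block = 8, fragment_capacity = 32 };

    __global__ void gather(int* all_results, std::size_t* num_results)
    {
        __shared__ int fragments[warps_per_block][fragment_capacity];
        int* warp_fragment = fragments[threadIdx.x / 32];
        std::size_t fragment_length = fragment_capacity;
        // ... all lanes of the warp collaborate on filling warp_fragment ...
        kat::collaborative::warp_to_grid::append_to_global_memory(
            all_results, num_results, warp_fragment, fragment_length);
    }
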
-------------------------------------------------------------------------------- /src/kat/on_device/shared_memory/basic.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/shared_memory/basic.cuh
3 |  *
4 |  * @brief Simpler / more basic utility code for working with shared memory,
5 |  * not involving any actual computation.
6 |  *
7 |  */
8 | 
9 | #ifndef CUDA_KAT_ON_DEVICE_SHARED_MEMORY_BASIC_CUH_
10 | #define CUDA_KAT_ON_DEVICE_SHARED_MEMORY_BASIC_CUH_
11 | 
12 | #include 
13 | #include 
14 | 
15 | 
16 | ///@cond
17 | #include 
18 | ///@endcond
19 | 
20 | namespace kat {
21 | namespace shared_memory {
22 | 
23 | using offset_t = int;      // Perhaps make it an int32_t ?
24 | using size_t   = unsigned; // Should we make it signed, like ssize_t ?
25 | 
26 | /**
27 |  * @brief Obtain the total size in bytes of the (per-block) shared memory
28 |  * for the running kernel - static + dynamic
29 |  *
30 |  * @note requires special register access, which is not so cheap.
31 |  *
32 |  */
33 | KAT_FD size_t size() {
34 | 	return ptx::special_registers::total_smem_size();
35 | }
36 | 
37 | namespace static_ {
38 | 
39 | /**
40 |  * @brief Obtain the size in bytes of the (per-block) static shared memory
41 |  * for the running kernel.
42 |  *
43 |  * @note requires special register access, which is not so cheap.
44 |  */
45 | KAT_FD size_t size() {
46 | 	return
47 | 		ptx::special_registers::total_smem_size() -
48 | 		ptx::special_registers::dynamic_smem_size();
49 | }
50 | 
51 | } // namespace static_
52 | 
53 | namespace dynamic {
54 | 
55 | /**
56 |  * @brief Obtain the size of the (per-block) dynamic shared memory for
57 |  * the running kernel
58 |  *
59 |  * @note without a template parameter, returns the size in bytes
60 |  * @note requires special register access, which is not so cheap.
61 |  */
62 | template <typename T = unsigned char>
63 | KAT_FD size_t size() {
64 | 	return ptx::special_registers::dynamic_smem_size() / sizeof(T);
65 | }
66 | 
67 | /**
68 |  * This gadget is necessary for using dynamically-sized shared memory in
69 |  * templated kernels (i.e. shared memory whose size is set by the launch
70 |  * parameters rather than being fixed at compile time). Use of such
71 |  * memory requires an `extern __shared__` unspecified-size array variable;
72 |  * however, the way nvcc works, you cannot declare two such variables of
73 |  * different types in your program - even if they're in different scopes.
74 |  * That means we either need to have a different variable name for each
75 |  * type (which would lead us into preprocessor macro hell), or - just
76 |  * use the same type, and reinterpret according to the type we want...
77 |  * which is what this gadget does.
78 |  *
79 |  * @note all threads would get the same address when calling this function,
80 |  * so you would need to add different offsets for different threads if
81 |  * you want a warp-specific or thread-specific pointer.
82 |  *
83 |  * @note see also https://stackoverflow.com/questions/27570552/
84 |  */
85 | template <typename T>
86 | KAT_DEV T* proxy()
87 | {
88 | 	// TODO: Do we need this alignment? Probably not
89 | 	extern __shared__ __align__(1024) unsigned char memory[];
90 | 	return reinterpret_cast<T*>(memory);
91 | }
92 | 
93 | // TODO: It would be nice to get the shared memory as a span; but we
94 | // don't currently have a span in this repository; and both std::span
95 | // and GSL/span do not support CUDA.
96 | 
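/*
 * For illustration - the classic use pattern for proxy(), sketched (the kernel
 * and its launch configuration are hypothetical):
 *
 *     template <typename T>
 *     __global__ void scale(T* data, T factor)
 *     {
 *         T* scratch = kat::shared_memory::dynamic::proxy<T>();
 *         scratch[threadIdx.x] = data[threadIdx.x] * factor;
 *         // ...
 *     }
 *
 *     // host side, with one T per thread of dynamic shared memory:
 *     // scale<<<grid_dims, block_dims, block_dims.x * sizeof(float)>>>(d_data, 2.0f);
 */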
97 | /**
98 |  * @note This namespace's contents are only relevant for linear grids
99 |  */
100 | namespace warp_specific {
101 | 
102 | /**
103 |  * @brief Accesses the calling thread's warp-specific dynamic shared memory -
104 |  * assuming the warps voluntarily divvy up the shared memory beyond some
105 |  * point amongst themselves into contiguous areas.
106 |  *
107 |  * The partitioning pattern is for each warp to get a contiguous sequence
108 |  * of elements in memory.
109 |  *
110 |  * @tparam T the element type assumed for all shared memory (or at least for
111 |  * alignment and for the warp-specific shared memory)
112 |  * @param base_offset How far into the block's overall shared memory to
113 |  * start partitioning the memory into warp-specific sequences
114 |  * @param num_elements_per_warp Size in elements of the area agreed to
115 |  * be specific to each warp
116 |  * @return Address of the first warp-specific element in shared memory
117 |  */
118 | template <typename T>
119 | KAT_FD T* contiguous(unsigned num_elements_per_warp, offset_t base_offset = 0)
120 | {
121 | 	return proxy<T>() + base_offset +
122 | 		num_elements_per_warp * linear_grid::grid_info::warp::index_in_block();
123 | }
124 | 
125 | /**
126 |  * @brief Accesses the calling thread's warp-specific dynamic shared memory -
127 |  * assuming the warps voluntarily divvy up the shared memory beyond some
128 |  * point amongst themselves, using striding.
129 |  *
130 |  * The partitioning pattern is for each warp to get elements at a fixed
131 |  * stride rather than a contiguous set of elements; this pattern ensures
132 |  * that different warps are never in a bank conflict when accessing their
133 |  * "private" shared memory - provided the number of warps divides 32, or is a
134 |  * multiple of 32. The downside of this pattern is that different lanes accessing
135 |  * different elements in a warp's shared memory will likely be in bank conflict
136 |  * (and certainly be in conflict if there are 32 warps).
137 |  *
138 |  * @tparam T the element type assumed for all shared memory (or at least for
139 |  * alignment and for the warp-specific shared memory)
140 |  * @param base_offset How far into the block's overall shared memory to
141 |  * start partitioning the memory into warp-specific sequences
142 |  * @return Address of the first warp-specific element in shared memory
143 |  */
144 | template <typename T>
145 | KAT_FD T* strided(offset_t base_offset = 0)
146 | {
147 | 	return proxy<T>() + base_offset + linear_grid::grid_info::warp::index_in_block();
148 | }
149 | 
150 | } // namespace warp_specific
151 | 
152 | } // namespace dynamic
153 | } // namespace shared_memory
154 | } // namespace kat
155 | 
156 | #endif // CUDA_KAT_ON_DEVICE_SHARED_MEMORY_BASIC_CUH_
157 | 
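
The difference between the two partitionings, sketched for a block with W warps and a per-warp area of n elements: `contiguous<T>(n)` gives warp w the elements [w*n, (w+1)*n), while `strided<T>()` gives it elements w, w + W, w + 2W, and so on. For example (kernel hypothetical, 1D block assumed):

    __global__ void per_warp_accumulators()
    {
        // one accumulator per warp, in the contiguous layout:
        unsigned* acc = kat::shared_memory::dynamic::warp_specific::contiguous<unsigned>(1);
        if (threadIdx.x % 32 == 0) { *acc = 0; }
        // ...
    }
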
-------------------------------------------------------------------------------- /src/kat/on_device/shared_memory/operations.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/shared_memory/operations.cuh
3 |  *
4 |  * @brief Some basic operations on shared memory (using the library's general
5 |  * computational primitives)
6 |  *
7 |  */
8 | 
9 | #pragma once
10 | #ifndef CUDA_KAT_SHARED_MEMORY_OPS_CUH
11 | #define CUDA_KAT_SHARED_MEMORY_OPS_CUH
12 | 
13 | #include 
14 | #include 
15 | 
16 | 
17 | ///@cond
18 | #include 
19 | ///@endcond
20 | 
21 | namespace kat {
22 | namespace linear_grid {
23 | namespace shared_memory {
24 | 
25 | using kat::shared_memory::size_t;
26 | 
27 | namespace dynamic {
28 | 
29 | using kat::shared_memory::dynamic::proxy;
30 | using kat::shared_memory::dynamic::size;
31 | 
32 | /**
33 |  * @brief Collaboratively fill the block's dynamic shared memory with a fixed
34 |  * value, up to a certain point
35 |  *
36 |  * @tparam T the element type which the block's shared memory is presumed to have
37 |  * @param value each element of the block's dynamic shared memory will be
38 |  * set to this value
39 |  * @param length the number of T elements to set to @p value
40 |  */
41 | template <typename T>
42 | KAT_FD void fill(
43 | 	const T&               value,
44 | 	shared_memory::size_t  length)
45 | {
46 | 	T tmp = value;
47 | 	kat::linear_grid::collaborative::block::fill_n(shared_memory::dynamic::proxy<T>(), tmp, length);
48 | }
49 | 
50 | /**
51 |  * @brief Collaboratively fill the block's dynamic shared memory with a fixed value.
52 |  *
53 |  * @tparam T the element type which the block's shared memory is presumed to have
54 |  * @param value each element of the block's dynamic shared memory will be
55 |  * set to this value
56 |  *
57 |  * @note This variant of `fill()` pays a small "penalty" for determining
58 |  * the size of the shared memory by itself, since it must access a
59 |  * typically-unused special register for this purpose. If you can, prefer
60 |  * passing a length yourself.
61 |  */
62 | template <typename T>
63 | KAT_FD void fill(const T& value)
64 | {
65 | 	auto length = shared_memory::dynamic::size<T>();
66 | 	return fill(value, length);
67 | }
68 | 
69 | /**
70 |  * @brief Collaboratively zero-out the block's dynamic shared memory, up to a
71 |  * certain point
72 |  *
73 |  * @tparam T the element type which the block's shared memory is presumed to have
74 |  * @param length the number of T elements to set to zero
75 |  */
76 | template <typename T>
77 | KAT_FD void zero(kat::shared_memory::size_t length)
78 | {
79 | 	return fill(T{0}, length);
80 | }
81 | 
82 | /**
83 |  * @brief Collaboratively zero-out the block's dynamic shared memory
84 |  *
85 |  * @tparam T the element type which the block's shared memory is presumed to have
86 |  */
87 | template <typename T>
88 | KAT_FD void zero()
89 | {
90 | 	auto length = shared_memory::dynamic::size<T>();
91 | 	return zero<T>(length);
92 | }
93 | 
94 | 
95 | /**
96 |  * Sets the (beginning of the dynamic) shared memory of the block
97 |  * to a copy of some area of device memory.
98 |  *
99 |  * @param[in] source Data in global memory (_not_ anywhere
100 |  * else in shared memory! That breaks the {@code __restrict__}
101 |  * restriction) which we wish to have in shared memory
102 |  * @param[in] length length of the area to copy; must be
103 |  * no larger than the available length (in T's) of shared
104 |  * memory
105 |  * @return the beginning of the block's shared memory -
106 |  * which now contains a copy of the data at @p source.
107 |  *
108 |  * @note length is not checked to be valid - it is up to
109 |  * the caller to refrain from trying to copy too much
110 |  * into the shared memory; use @ref size() if you need to
111 |  * determine the available capacity
112 |  */
113 | template <typename T>
114 | KAT_FD T* __restrict__ set_to_copy_of(const T* source, shared_memory::size_t length)
115 | {
116 | 	T* __restrict__ data_in_shared_mem = shared_memory::dynamic::proxy<T>();
117 | 	kat::linear_grid::collaborative::block::copy(data_in_shared_mem, source, length);
118 | 	return data_in_shared_mem;
119 | }
120 | 
121 | } // namespace dynamic
122 | } // namespace shared_memory
123 | } // namespace linear_grid
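
Usage sketch for the linear-grid variants above (the entire block takes part; kernel name hypothetical):

    __global__ void histogram_pass(/* ... */)
    {
        namespace dyn_shmem = kat::linear_grid::shared_memory::dynamic;
        dyn_shmem::zero<unsigned>(); // all of dynamic shared memory, viewed as unsigned bins
        __syncthreads();
        // ... accumulate into dyn_shmem::proxy<unsigned>() ...
    }
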
160 | */
161 | template <typename T>
162 | KAT_FD void fill(const T& value)
163 | {
164 | auto length = shared_memory::dynamic::size();
165 | return fill(value, length);
166 | }
167 | 
168 | /**
169 | * @brief Collaboratively zero-out the block's dynamic shared memory, up to a
170 | * certain point
171 | *
172 | * @tparam T the element type which the block's shared memory is presumed to have
173 | * @param length the number of T elements to set to zero
174 | */
175 | template <typename T>
176 | KAT_FD void zero(kat::shared_memory::size_t length)
177 | {
178 | return fill(T{0}, length);
179 | }
180 | 
181 | /**
182 | * @brief Collaboratively zero-out the block's dynamic shared memory
183 | *
184 | * @tparam T the element type which the block's shared memory is presumed to have
185 | */
186 | template <typename T>
187 | KAT_FD void zero()
188 | {
189 | auto length = shared_memory::dynamic::size();
190 | return zero<T>(length);
191 | }
192 | 
193 | /**
194 | * Sets the (beginning of the dynamic) shared memory of the block
195 | * to a copy of some area of device memory.
196 | *
197 | * @param[in] source Data in global memory (_not_ anywhere
198 | * else in shared memory! That breaks the {@code __restrict__}
199 | * restriction) which we wish to have in shared memory
200 | * @param[in] length length of the area to copy; must be
201 | * no larger than the available length (in T's) of shared
202 | * memory
203 | * @return the beginning of the block's shared memory -
204 | * which now contains a copy of the data at @p source.
205 | *
206 | * @note length is not checked to be valid - it is up to
207 | * the caller to refrain from trying to copy too much
208 | * into the shared memory.
209 | *
210 | * @note Not implemented yet - need non-linear-grid variants of
211 | * some of the block primitives.
212 | */
213 | template <typename T>
214 | KAT_FD T* __restrict__ set_to_copy_of(const T* source, shared_memory::size_t length);
215 | // TODO: Uncomment when the non-linear-grid block primitive is available
216 | //{
217 | // T* __restrict__ data_in_shared_mem = shared_memory::dynamic::proxy<T>();
218 | // kat::collaborative::block::copy(data_in_shared_mem, source, length);
219 | // return data_in_shared_mem;
220 | //}
221 | 
222 | } // namespace dynamic
223 | } // namespace shared_memory
224 | 
225 | } // namespace kat
226 | 
227 | #endif // CUDA_KAT_SHARED_MEMORY_OPS_CUH
228 | 
-------------------------------------------------------------------------------- /src/kat/on_device/shuffle.cuh: --------------------------------------------------------------------------------
1 | /**
2 | * @file shuffle.cuh Templated warp-shuffle operation variants
3 | */
4 | 
5 | /*
6 | * Originally based on Bryan Catanzaro's CUDA generics
7 | * https://github.com/bryancatanzaro/generics/
8 | * Downloaded on: 2016-04-16
9 | * ... but reimplemented by Eyal Rozenberg, CWI Amsterdam
10 | */
11 | 
12 | #pragma once
13 | #ifndef CUDA_KAT_ON_DEVICE_TEMPLATED_SHUFFLE_CUH_
14 | #define CUDA_KAT_ON_DEVICE_TEMPLATED_SHUFFLE_CUH_
15 | 
16 | #include
17 | 
18 | 
19 | ///@cond
20 | #include
21 | ///@endcond
22 | 
23 | namespace kat {
24 | 
25 | // The functions here can be used to shuffle types as large as you like. Of course,
26 | // if they're not plain-old-data, shuffle at your peril.
27 | 
28 | /**
29 | * @brief Have each lane in a warp get a value from an (arbitrary) other lane.
30 | *
31 | * @tparam T the type of datum to be shared with other lane(s); may be of
32 | * arbitrary size, but (at least for now) must be plain-old-data.
33 | *
34 | * @param t Each lane shares own value, which other lanes can choose
35 | * to receive.
36 | * @param source_lane The lane whose value the current lane wants to get
37 | * @return the @p t value of @p source_lane
38 | */
39 | template <typename T> KAT_FD T shuffle_arbitrary(const T& t, int source_lane);
40 | 
41 | /**
42 | * @param t Each lane shares own value, which a lane with a lower index
43 | * will get.
44 | * @param delta The difference in lane index to the source lane of the new
45 | * value, i.e. a lane with index i gets the new value from lane i + delta.
46 | * @return The @p t value of the lane with index @p delta greater than the calling
47 | * lane's; a lane with a high index, above warp_size - @p delta, has its own @p t
48 | * returned unchanged.
49 | */
50 | template <typename T> KAT_FD T shuffle_down(const T& t, unsigned int delta);
51 | 
52 | /**
53 | * @tparam T the type of datum to be shared with other lane(s); may be of
54 | * arbitrary size, but (at least for now) must be plain-old-data.
55 | *
56 | * @param t Each lane shares own value, which a lane with a higher index
57 | * will get.
58 | * @param delta The difference in lane index to the source lane of the new
59 | * value, i.e. a lane with index i gets the new value from the lane of index
60 | * i - delta.
61 | * @return The @p t value of the lane with index @p delta less than the calling
62 | * lane's; a lane with a low index, under @p delta, has its own @p t returned
63 | * unchanged.
64 | */
65 | template <typename T> KAT_FD T shuffle_up(const T& t, unsigned int delta);
66 | 
67 | /**
68 | * @brief Have pairs of lanes exchange a value, with the pairing performed
69 | * by XORing bits of the lane index.
70 | *
71 | * @tparam T the type of datum to be shared with other lane(s); may be of
72 | * arbitrary size, but (at least for now) must be plain-old-data.
73 | *
74 | * @param t The value to exchange with a counterpart lane
75 | * @param mask Determines how lanes will be paired: The lane with index i
76 | * is paired with the lane with index i ^ mask.
77 | * @return The @p t value of the paired lane
78 | */
79 | template <typename T> KAT_FD T shuffle_xor(const T& t, int mask);
80 | 
81 | } // namespace kat
82 | 
83 | #include "detail/shuffle.cuh"
84 | 
85 | #endif // CUDA_KAT_ON_DEVICE_TEMPLATED_SHUFFLE_CUH_
86 | 
-------------------------------------------------------------------------------- /src/kat/on_device/streams/prefix_generators.cuh: --------------------------------------------------------------------------------
1 | #pragma once
2 | #ifndef CUDA_KAT_OSTREAM_PREFIX_GENERATORS_CUH_
3 | #define CUDA_KAT_OSTREAM_PREFIX_GENERATORS_CUH_
4 | 
5 | #include
6 | #include
7 | #include
8 | 
9 | ///@cond
10 | #include
11 | ///@endcond
12 | 
13 | namespace kat {
14 | 
15 | namespace detail {
16 | KAT_DEV unsigned num_digits_required_for(unsigned long long extremal_value)
17 | {
18 | unsigned num_digits { 1 }; while (extremal_value >= 10) { extremal_value /= 10; num_digits++; } return num_digits; // note: not ceilf(log10f(...)), which under-counts at exact powers of 10 and breaks for 0
19 | }
20 | 
21 | } // namespace detail
22 | 
23 | namespace linear_grid {
24 | 
25 | namespace prefix_generators {
26 | 
27 | template <printfing_ostream::resolution Resolution>
28 | KAT_DEV void self_identify(kat::stringstream& ss);
29 | 
30 | // Prefix will look like (example for thread 34):
31 | //
32 | // "T 34 = (00,01,02) "
33 | //
34 | // ... since 34 is the third thread (index 2) in the second warp (index 1) in
35 | // the first block.
36 | //
37 | template <>
38 | KAT_DEV void self_identify<printfing_ostream::resolution::thread>(kat::stringstream& ss)
39 | {
40 | namespace gi = ::kat::linear_grid::grid_info;
41 | 
42 | const auto global_thread_id_width = detail::num_digits_required_for(gi::grid::num_threads() - 1);
43 | const auto block_id_width = detail::num_digits_required_for(gi::grid::num_blocks() - 1);
44 | const auto warp_id_width = detail::num_digits_required_for(gi::grid::num_warps_per_block() - 1);
45 | const auto lane_id_width = 2; // ceilf(log10(warp_size - 1))
46 | constexpr const auto fill_char = '0';
47 | 
48 | ss
49 | << "T " << strf::right(gi::thread::global_id(), global_thread_id_width, fill_char)
50 | << " = (" << strf::right(gi::block::id_in_grid(), block_id_width, fill_char )
51 | << ',' << strf::right(gi::warp::id_in_block(), warp_id_width, fill_char)
52 | << ',' << strf::right(gi::lane::id(), lane_id_width, fill_char)
53 | << ") ";
54 | }
55 | 
56 | 
57 | // Prefix will look like (example for thread 1025 and block size 512):
58 | //
59 | // "W 32 = (02,00) "
60 | //
61 | // ... since thread 1025 overall is the second thread in the third block (block index 2), and thus in the first warp (warp index 0)
62 | //
63 | template <>
64 | KAT_DEV void self_identify<printfing_ostream::resolution::warp>(kat::stringstream& ss)
65 | {
66 | namespace gi = ::kat::linear_grid::grid_info;
67 | 
68 | auto global_warp_id_width = detail::num_digits_required_for(gi::grid::num_warps() - 1);
69 | auto warp_id_width = detail::num_digits_required_for(gi::grid::num_warps_per_block() - 1);
70 | auto block_id_width = detail::num_digits_required_for(gi::grid::num_blocks() - 1);
71 | constexpr const auto fill_char = '0';
72 | ss
73 | << "W " << strf::right(gi::warp::id_in_grid(), global_warp_id_width, fill_char)
74 | << " = (" << strf::right(gi::block::id_in_grid(), block_id_width, fill_char)
75 | << ',' << strf::right(gi::warp::id_in_block(), warp_id_width, fill_char)
76 | << ") ";
77 | }
78 | 
79 | // Prefix will look like (example for thread 1025 and block size 512):
80 | //
81 | // "B 2 : "
82 | //
83 | // ...
since thread 1025 is in the 3rd block and block indices are 0-based
84 | //
85 | template <>
86 | KAT_DEV void self_identify<printfing_ostream::resolution::block>(kat::stringstream& ss)
87 | {
88 | namespace gi = ::kat::linear_grid::grid_info;
89 | 
90 | const unsigned block_id_width = detail::num_digits_required_for(gi::grid::num_blocks() - 1);
91 | constexpr const auto fill_char = '0';
92 | ss << "B " << strf::right(gi::block::id_in_grid(), block_id_width, fill_char) << " : ";
93 | }
94 | 
95 | template <>
96 | KAT_DEV void self_identify<printfing_ostream::resolution::grid>(kat::stringstream& ss)
97 | {
98 | ss << "G ";
99 | }
100 | 
101 | 
102 | } // namespace prefix_generators
103 | 
104 | namespace manipulators {
105 | 
106 | KAT_DEV printfing_ostream& identify( kat::printfing_ostream& os )
107 | {
108 | using namespace kat::manipulators;
109 | prefix_generator_type gen;
110 | switch(os.printing_resolution()) {
111 | case printfing_ostream::resolution::thread : gen = prefix_generators::self_identify< printfing_ostream::resolution::thread >; break;
112 | case printfing_ostream::resolution::warp : gen = prefix_generators::self_identify< printfing_ostream::resolution::warp >; break;
113 | case printfing_ostream::resolution::block : gen = prefix_generators::self_identify< printfing_ostream::resolution::block >; break;
114 | case printfing_ostream::resolution::grid : gen = prefix_generators::self_identify< printfing_ostream::resolution::grid >; break;
115 | }
116 | return os.set_prefix_generator(gen);
117 | }
118 | } // namespace manipulators
119 | 
120 | } // namespace linear_grid
121 | 
122 | } // namespace kat
123 | 
124 | #endif // CUDA_KAT_OSTREAM_PREFIX_GENERATORS_CUH_
125 | 
-------------------------------------------------------------------------------- /src/kat/on_device/streams/printfing_ostream.cuh: --------------------------------------------------------------------------------
1 | /**
2 | * @file on_device/printfing_ostream.cuh
3 | *
4 | * @brief CUDA device-side functions for a C++-standard-library-like stream
5 | * whose output (eventually) gets printed using CUDA's device-side printf().
6 | *
7 | */
8 | #pragma once
9 | #ifndef CUDA_KAT_ON_PRINTFING_STREAM_CUH_
10 | #define CUDA_KAT_ON_PRINTFING_STREAM_CUH_
11 | 
12 | #include
13 | #include
14 | 
15 | ///@cond
16 | #include
17 | ///@endcond
18 | 
19 | namespace kat {
20 | 
21 | namespace manipulators {
22 | 
23 | using prefix_generator_type = void (*)(kat::stringstream&);
24 | // TODO: Make it into a function with an std-string-like output, when we
25 | // have an std-string-like class.
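//
// A minimal usage sketch (hypothetical device-side code, for illustration
// only - not part of this header): with the `identify` manipulator defined
// in prefix_generators.cuh above, a kernel can tag each flushed line with
// the printing thread's coordinates:
//
//   __global__ void example_kernel()
//   {
//       kat::printfing_ostream cout;
//       cout << kat::linear_grid::manipulators::identify;
//       cout << "value = " << 42 << kat::manipulators::endl;
//   }
//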
26 | 
27 | KAT_DEV auto prefix(prefix_generator_type gen);
28 | 
29 | }
30 | 
31 | 
32 | class printfing_ostream
33 | {
34 | static constexpr const std::size_t cout_initial_buffer_size { 1 << 8 };
35 | 
36 | public:
37 | enum class resolution { thread, warp, block, grid };
38 | 
39 | KAT_DEV printfing_ostream(std::size_t initial_buffer_size = cout_initial_buffer_size) : main_buffer(initial_buffer_size) { }
40 | KAT_DEV printfing_ostream(printfing_ostream&& other) : main_buffer(other.main_buffer) { }
41 | KAT_DEV printfing_ostream(const printfing_ostream& other) : main_buffer(other.main_buffer) { }
42 | KAT_DEV ~printfing_ostream();
43 | 
44 | // Note: You can also use strf::flush if that exists
45 | KAT_DEV void flush()
46 | {
47 | if (not newline_on_flush and main_buffer.tellp() == 0) {
48 | // Note: Returning even though we could have a prefix
49 | return;
50 | }
51 | 
52 | if (not should_act_for_resolution(printing_resolution_)) {
53 | return;
54 | }
55 | if (use_prefix) {
56 | // The prefix is re-generated as necessary
57 | prefix_generator(prefix);
58 | printf(newline_on_flush ? "%*s%*s\n" : "%*s%*s",
59 | prefix.tellp(), prefix.c_str(),
60 | main_buffer.tellp(), main_buffer.c_str());
61 | }
62 | else {
63 | printf(newline_on_flush ? "%*s\n" : "%*s",
64 | main_buffer.tellp(), main_buffer.c_str()
65 | );
66 | }
67 | main_buffer.clear();
68 | prefix.clear(); // We're not caching the prefix
69 | }
70 | 
71 | protected:
72 | static bool KAT_DEV should_act_for_resolution(resolution r) {
73 | // TODO: It might be a better idea to check which threads in the warp/block are still active
74 | // rather than assuming the first one is.
75 | switch(r) {
76 | case resolution::thread: return true;
77 | case resolution::warp: return grid_info::thread::is_first_in_warp();
78 | case resolution::block: return grid_info::thread::is_first_in_block();
79 | case resolution::grid: return grid_info::thread::is_first_in_grid();
80 | default: return false; // but can't get here
81 | }
82 | }
83 | 
84 | public:
85 | template <typename T>
86 | KAT_DEV printfing_ostream& operator<<(const T& arg)
87 | {
88 | if (not should_act_for_resolution(printing_resolution_)) { return *this; }
89 | strf::print_preview no_preview;
90 | strf::make_printer(strf::rank<5>(), strf::pack(), no_preview, arg).print_to(main_buffer);
91 | return *this;
92 | }
93 | 
94 | // Manipulators are a clever, but confusing, idea from the C++ standard library's
95 | // IO streams: They're functions which manipulate streams, but can also be made
96 | // to manipulate them by being sent to them using the << operator - which instead
97 | // of actually adding any data to the stream, invokes the manipulator function.
98 | //
99 | using manipulator = kat::printfing_ostream& ( kat::printfing_ostream& );
100 | 
101 | KAT_DEV printfing_ostream& no_prefix()
102 | {
103 | use_prefix = false;
104 | prefix_generator = nullptr;
105 | prefix.clear(); // Maybe we should set it to a stringstream of size 0?
106 | return *this;
107 | }
108 | 
109 | KAT_DEV printfing_ostream& set_prefix_generator(manipulators::prefix_generator_type gen)
110 | {
111 | use_prefix = true;
112 | prefix_generator = gen;
113 | return *this;
114 | }
115 | 
116 | KAT_DEV printfing_ostream& no_newline_on_flush()
117 | {
118 | newline_on_flush = false;
119 | return *this;
120 | }
121 | 
122 | KAT_DEV printfing_ostream& append_newline_on_flush()
123 | {
124 | newline_on_flush = true;
125 | return *this;
126 | }
127 | 
128 | // Drops whatever's in the buffer.
Also clears the prefix -
129 | // as that's assumed to have been resolution-related
130 | KAT_DEV printfing_ostream& set_printing_resolution(resolution new_resolution)
131 | {
132 | main_buffer.clear();
133 | no_prefix();
134 | printing_resolution_ = new_resolution;
135 | return *this;
136 | }
137 | 
138 | // Getter only - does not change the resolution nor clear anything
139 | KAT_DEV resolution printing_resolution() const { return printing_resolution_; }
140 | 
141 | protected:
142 | kat::stringstream main_buffer;
143 | kat::stringstream prefix { 100 }; // { 0 };
144 | // no prefix by default, so why bother allocating a buffer?
145 | // TODO: Make this an optional along with the prefix_generator
146 | 
147 | bool flush_on_destruction { true };
148 | bool newline_on_flush { false };
149 | 
150 | // We may want to prefix our printing with a string which the code using cout has not explicitly specified
151 | // beforehand. For example: An identifier of the current thread or warp.
152 | 
153 | bool use_prefix { false };
154 | // TODO: Make this into a kat::optional when we have an optional class,
155 | // and perhaps simply fold use_prefix into it
156 | manipulators::prefix_generator_type prefix_generator { nullptr };
157 | 
158 | 
159 | // By default, all grid threads print; but we may want a printing only once per each warp, or block etc;
160 | // that resolution is controlled by this variable.
161 | 
162 | resolution printing_resolution_ { resolution::thread };
163 | 
164 | };
165 | 
166 | 
167 | namespace manipulators {
168 | KAT_FD kat::printfing_ostream& flush( kat::printfing_ostream& os ) { os.flush(); return os; }
169 | KAT_FD kat::printfing_ostream& endl( kat::printfing_ostream& os ) { os << '\n'; os.flush(); return os; }
170 | KAT_FD kat::printfing_ostream& no_prefix( kat::printfing_ostream& os ) { return os.no_prefix(); }
171 | KAT_FD kat::printfing_ostream& no_newline_on_flush( kat::printfing_ostream& os ) { return os.no_newline_on_flush(); }
172 | KAT_FD kat::printfing_ostream& newline_on_flush( kat::printfing_ostream& os ) { return os.append_newline_on_flush(); }
173 | 
174 | } // manipulators
175 | 
176 | 
177 | // This is defined only with __CUDA_ARCH__, since the implementation is actually device-only,
178 | // referring to this->flush(), which can only really run on the device. We could, instead,
179 | // make printfing_ostream::flush() be an STRF_HD (host-and-device) function, which simply
180 | // fails on the host side, but that would be too much of a lie.
181 | #ifdef __CUDA_ARCH__
182 | 
183 | KAT_DEV printfing_ostream::~printfing_ostream()
184 | {
185 | this->flush();
186 | }
187 | 
188 | #endif
189 | 
190 | template <>
191 | KAT_DEV printfing_ostream& printfing_ostream::operator<< (
192 | printfing_ostream::manipulator& manip)
193 | {
194 | return manip(*this);
195 | }
196 | 
197 | namespace manipulators {
198 | KAT_DEV auto prefix(prefix_generator_type gen) {
199 | return [gen](kat::printfing_ostream& os) { return os.set_prefix_generator(gen); };
200 | }
201 | } // namespace manipulators
202 | 
203 | // This conditional compilation segment is necessary because NVCC (10.x) will not accept a
204 | // reference-to/address-of a device function except in device function bodies, or when __CUDA_ARCH__
205 | // is defined. In other words: we have a "device-side 'using' statement" here, followed by the
206 | // operator<<() function which makes use of it.
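//
// For illustration, a hedged usage sketch of the argument-carrying `prefix`
// manipulator defined just above; the generator function and kernel here
// are hypothetical, not part of this header:
//
//   __device__ void example_prefix_generator(kat::stringstream& ss) { ss << "[demo] "; }
//
//   __global__ void example_kernel()
//   {
//       kat::printfing_ostream cout;
//       cout << kat::manipulators::prefix(example_prefix_generator);
//       cout << "hello" << kat::manipulators::endl;   // flushes "[demo] hello"
//   }
//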
207 | #ifdef __CUDA_ARCH__
208 | namespace manipulators {
209 | using prefix_setting_manipulator_type = std::result_of< decltype(&prefix)(prefix_generator_type) >::type;
210 | } // namespace manipulators
211 | 
212 | KAT_DEV printfing_ostream& operator<< (printfing_ostream& os, manipulators::prefix_setting_manipulator_type manip)
213 | {
214 | manip(os);
215 | return os;
216 | }
217 | #endif
218 | 
219 | 
220 | namespace manipulators {
221 | KAT_DEV auto resolution(printfing_ostream::resolution new_resolution) {
222 | return [new_resolution](kat::printfing_ostream& os) { return os.set_printing_resolution(new_resolution); };
223 | }
224 | } // namespace manipulators
225 | 
226 | #ifdef __CUDA_ARCH__
227 | namespace manipulators {
228 | using resolution_setting_manipulator_type = std::result_of< decltype(&resolution)(printfing_ostream::resolution) >::type;
229 | } // namespace manipulators
230 | 
231 | KAT_DEV printfing_ostream& operator<< (printfing_ostream& os, manipulators::resolution_setting_manipulator_type manip)
232 | {
233 | manip(os);
234 | return os;
235 | }
236 | #endif
237 | 
238 | 
239 | using manipulators::flush;
240 | using manipulators::endl;
241 | 
242 | } // namespace kat
243 | 
244 | #include
245 | 
246 | #endif // CUDA_KAT_ON_PRINTFING_STREAM_CUH_
247 | 
-------------------------------------------------------------------------------- /src/kat/on_device/streams/stringstream.cuh: --------------------------------------------------------------------------------
1 | /**
2 | * @file on_device/streams/stringstream.cuh
3 | *
4 | * @brief A string stream class for CUDA device-side code (usable by individual threads).
5 | *
6 | * @note This class will likely be rather slow in use: Its code is entirely serial, and it
7 | * uses occasional dynamic memory allocations. You are advised to use it mostly for debugging
8 | * purposes.
9 | */
10 | #pragma once
11 | #ifndef CUDA_KAT_ON_DEVICE_STRINGSTREAM_CUH_
12 | #define CUDA_KAT_ON_DEVICE_STRINGSTREAM_CUH_
13 | 
14 | #include
15 | 
16 | #include
17 | 
18 | // Necessary for printf()'ing in kernel code
19 | #include
20 | 
21 | ///@cond
22 | #include
23 | ///@endcond
24 | 
25 | namespace kat {
26 | 
27 | namespace detail {
28 | 
29 | template <typename T>
30 | KAT_DEV T* safe_malloc(std::size_t size)
31 | {
32 | auto p = malloc(size);
33 | if (p == nullptr) {
34 | asm("trap;");
35 | }
36 | return static_cast<T*>(p);
37 | }
38 | 
39 | }
40 | 
41 | /**
42 | * An std::stringstream-like class into which one can add formatted
43 | * data using `my_stringstream << my_datum`. It won't accept std::ios
44 | * flags - since we can't depend on host-side-only IOS code - but it
45 | * will accept a bunch of strf equivalents. See:
46 | *
47 | * @url https://robhz786.github.io/strf/doc/quick_reference.html#format_functions
48 | *
49 | * for a list of format functions.
50 | *
51 | * @note This class owns its buffer.
52 | * @note nothing is dynamically allocated if the length is 0
53 | */
54 | class stringstream: public ::strf::basic_outbuf<char>
55 | {
56 | public:
57 | using char_type = char;
58 | // no traits_type - this is not implemented by strf
59 | // using int_type = int; // really
60 | // using off_type = std::off_t; // really?
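// A brief usage sketch (hypothetical device-side code, for illustration only):
//
//   kat::stringstream ss { 64 };            // 64-byte initial buffer
//   ss << "x = " << 12.34 << ", y = " << 7;
//   printf("%s\n", ss.c_str());             // the buffer grows via recycle() if needed
//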
61 | using pos_type = std::size_t;
62 | 
63 | protected:
64 | // Note: initial_buffer_size + 1 bytes must be allocated
65 | STRF_HD stringstream(char_type* initial_buffer, std::size_t initial_buffer_size) :
66 | buffer_size(initial_buffer_size),
67 | buffer(initial_buffer),
68 | strf::basic_outbuf<char>(initial_buffer, initial_buffer_size)
69 | {
70 | }
71 | 
72 | public:
73 | STRF_HD stringstream(std::size_t initial_buffer_size);
74 | 
75 | STRF_HD stringstream(stringstream&& other) : strf::basic_outbuf<char>(other.buffer, other.buffer_size)
76 | {
77 | // Note: Nothing to free() here - our own buffer member has not been
78 | // initialized yet, so there is nothing of ours to release; we simply
79 | // take over the other stream's buffer.
80 | buffer = other.buffer;
81 | buffer_size = other.buffer_size;
82 | other.buffer = nullptr;
83 | other.buffer_size = 0;
84 | }
85 | 
86 | STRF_HD stringstream(const stringstream& other) : stringstream(other.buffer_size)
87 | {
88 | if (buffer != nullptr) { memcpy(buffer, other.buffer, sizeof(char_type) * (buffer_size + 1)); }
89 | }
90 | 
91 | STRF_HD ~stringstream()
92 | {
93 | if (buffer != nullptr) {
94 | free(buffer);
95 | }
96 | }
97 | 
98 | STRF_HD void recycle() override;
99 | 
100 | KAT_DEV void clear()
101 | {
102 | set_pos(buffer);
103 | flush();
104 | }
105 | 
106 | KAT_DEV void flush() {
107 | if (buffer != nullptr) {
108 | *pos() = '\0';
109 | }
110 | }
111 | 
112 | // TODO: We should be able to produce an std-string-like proxy supporting a c_str() method, rather
113 | // than providing a c_str() directly.
114 | 
115 | KAT_DEV const char* c_str()
116 | {
117 | flush();
118 | return buffer;
119 | }
120 | 
121 | KAT_DEV pos_type tellp() const { return pos() - buffer; }
122 | KAT_DEV bool empty() const { return tellp() == 0; }
123 | // std::stringstream's don't have this
124 | KAT_DEV stringstream& seekp(pos_type pos) { set_pos(buffer + pos); return *this; }
125 | 
126 | KAT_DEV std::size_t capacity() const { return buffer_size; } // perhaps there's something else we can use instead?
127 | 
128 | // To implement (maybe):
129 | //
130 | // seekp
131 | // tellp
132 | // put
133 | // write
134 | // swap <- NO.
135 | //
136 | // good
137 | // eof
138 | // fail
139 | // bad
140 | // operator!
141 | // operator bool
142 | // rdstate
143 | // setstate
144 | // copyfmt
145 | // fill
146 | // exceptions <- No exception support; but might still implement this
147 | // imbue <- No locale support
148 | // tie
149 | // narrow <- No locale support
150 | // widen <- No locale support
151 | //
152 | // flags
153 | // setf
154 | // unsetf
155 | // precision
156 | // width
157 | // imbue <- No locale support
158 | // getloc <- No locale support
159 | // xalloc, iword, pword <- Not relevant on the device side, I think.
160 | // register_callback <- ??
161 | // sync_with_stdio <- No
162 | //
163 | 
164 | 
165 | 
166 | protected:
167 | // TODO: Write and use a device-side unique_ptr class and use kat::unique_ptr
168 | // instead of these two variables
169 | std::size_t buffer_size; // not including space for a trailing '\0'.
170 | char_type* buffer;
171 | };
172 | 
173 | #ifdef __CUDA_ARCH__
174 | 
175 | STRF_HD stringstream::stringstream(std::size_t initial_buffer_size)
176 | : stringstream(
177 | initial_buffer_size == 0 ? nullptr : detail::safe_malloc<char_type>(initial_buffer_size + 1),
178 | initial_buffer_size)
179 | {
180 | }
181 | 
182 | KAT_DEV void stringstream::recycle()
183 | {
184 | std::size_t used_size = (buffer_size == 0) ? 0 : (this->pos() - buffer);
185 | // a postcondition of recycle() is that at least so much free space is available.
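// (Worked example of the growth computation below, under the assumption of a
// 64-byte buffer with 60 bytes used: the new size is the maximum of 128 - i.e.
// doubling - and 60 + strf::min_size_after_recycle(), so doubling wins unless
// strf's promised minimum free space forces a larger allocation.)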
186 | auto new_buffer_size = builtins::maximum(
187 | buffer_size * 2,
188 | used_size + strf::min_size_after_recycle());
189 | auto new_buff = detail::safe_malloc<char_type>(new_buffer_size + 1);
190 | if (buffer != nullptr) {
191 | memcpy(new_buff, buffer, sizeof(char_type) * used_size);
192 | free(buffer);
193 | }
194 | this->set_pos(new_buff + used_size);
195 | this->set_end(new_buff + new_buffer_size);
196 | buffer = new_buff; buffer_size = new_buffer_size; // remember the new capacity, or the next doubling would start from the stale size
197 | }
198 | #endif
199 | 
200 | 
201 | template <typename T>
202 | KAT_DEV stringstream& operator<<(stringstream& out, const T& arg)
203 | {
204 | if (out.capacity() == 0) {
205 | // We should not need to do the following. However, for some reason, make_printer(...).print_to(out)
206 | // will fail on empty (nullptr) buffers; so we might end up "recycle()ing" more than once for the same
207 | // streaming operation.
208 | out.recycle();
209 | }
210 | 
211 | // TODO:
212 | // 1. Can `no_preview` be made constant?
213 | // 2. Can't we target a specific overload rather than play with ranks?
214 | auto no_preview = ::strf::print_preview{};
215 | ::strf::make_printer(
216 | ::strf::rank<5>(),
217 | // strf::rank is a method for controlling matching within the overload set:
218 | // rank objects have no members, it's only about their type. Higher rank objects can
219 | // match lower-rank objects (i.e. match functions in the overload sets expecting lower-rank
220 | // objects), which means they have access to more of the overload sets. If we create
221 | // a lower-rank object here we will only be able to match a few overload set members.
222 | ::strf::pack(),
223 | // not modifying any facets such as digit grouping or digit separator
224 | no_preview,
225 | // Don't know what this means actually
226 | arg
227 | ).print_to(out);
228 | 
229 | // Note: This function doesn't actually rely on out being a stringstream; any
230 | // ostream-like class would do. But for now, we don't have any ostreams other
231 | // than the stringstream, so we'll leave it this way. Later, we could either
232 | // have an intermediate class, or wrap basic_outbuf with an ostream class
233 | // without a buffer, or just call basic_outbuf an ostream
234 | 
235 | return out;
236 | }
237 | 
238 | } // namespace kat
239 | #endif // CUDA_KAT_ON_DEVICE_STRINGSTREAM_CUH_
240 | 
241 | 
-------------------------------------------------------------------------------- /src/kat/on_device/time.cuh: --------------------------------------------------------------------------------
1 | /**
2 | * @file on_device/time.cuh
3 | *
4 | * @brief CUDA device-side functions having to do with timing and the hardware clock.
5 | */
6 | 
7 | #ifndef CUDA_KAT_ON_DEVICE_TIME_CUH_
8 | #define CUDA_KAT_ON_DEVICE_TIME_CUH_
9 | 
10 | #include
11 | 
12 | ///@cond
13 | #include
14 | ///@endcond
15 | 
16 | namespace kat {
17 | 
18 | enum class sleep_resolution { clock_cycles, nanoseconds };
19 | 
20 | using clock_value_t = long long int;
21 | 
22 | static_assert(std::is_same< decltype(clock64()), clock_value_t>::value, "Unexpected clock function result type");
23 | // CUDA uses a signed type for clock values - for some unknown reason; See the declaration of clock64()
24 | 
25 | ///@cond
26 | namespace detail {
27 | 
28 | template <sleep_resolution Resolution>
29 | struct sleep_unit;
30 | 
31 | template<> struct sleep_unit<sleep_resolution::clock_cycles> { using type = clock_value_t; };
32 | template<> struct sleep_unit<sleep_resolution::nanoseconds> { using type = unsigned int; };
33 | // Why unsigned int? See the declaration of nanosleep()...
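// A usage sketch for the sleep<>() functions declared below (hypothetical
// device-side code, for illustration only):
//
//   kat::sleep<kat::sleep_resolution::clock_cycles>(1000);
//       // busy-wait for at least ~1000 SM clock cycles
//   kat::sleep<kat::sleep_resolution::nanoseconds>(1000);
//       // only available on compute capability 7.0 (Volta) and later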
34 | 
35 | } // namespace detail
36 | ///@endcond
37 | 
38 | template <sleep_resolution Resolution>
39 | using sleep_unit_t = typename detail::sleep_unit<Resolution>::type;
40 | 
41 | 
42 | /**
43 | * @brief Have the calling thread busy-sleep for (at least) a certain
44 | * number of clock cycles.
45 | *
46 | * @note In 2017, a typical GPU clock cycle is around 1 ns (i.e. 1 GHz frequency).
47 | *
48 | */
49 | template <sleep_resolution Resolution>
50 | KAT_DEV void sleep(sleep_unit_t<Resolution> duration) = delete;
51 | 
52 | template<>
53 | KAT_DEV void sleep<sleep_resolution::clock_cycles>(
54 | sleep_unit_t<sleep_resolution::clock_cycles> num_cycles)
55 | {
56 | // The clock64() function returns an SM-specific clock ticks value,
57 | // which occasionally gets reset. Even if it were not reset, it would
58 | // only wrap around in 300 years or so since it began ticking, which is
59 | // why there's no need to check for wrap-around.
60 | // Also, it seems this code is not optimized-away despite not having
61 | // any obvious side effects.
62 | clock_value_t start = clock64();
63 | clock_value_t cycles_elapsed;
64 | do { cycles_elapsed = clock64() - start; }
65 | while (cycles_elapsed < num_cycles);
66 | }
67 | 
68 | #if __CUDA_ARCH__ >= 700
69 | 
70 | template<>
71 | KAT_DEV void sleep<sleep_resolution::nanoseconds>(
72 | sleep_unit_t<sleep_resolution::nanoseconds> num_nanoseconds)
73 | {
74 | __nanosleep(num_nanoseconds);
75 | }
76 | 
77 | #endif // __CUDA_ARCH__ >= 700
78 | 
79 | } // namespace kat
80 | 
81 | #endif // CUDA_KAT_ON_DEVICE_TIME_CUH_
82 | 
-------------------------------------------------------------------------------- /src/kat/reference_wrapper.hpp: --------------------------------------------------------------------------------
1 | /**
2 | * @file kat/reference_wrapper.hpp
3 | *
4 | * @brief This file implements `kat::reference_wrapper`, an equivalent of
5 | * C++11's `std::reference_wrapper` which may be used both in host-side and
6 | * CUDA-device-side code.
7 | */
8 | 
9 | //
10 | // Original code Copyright (c) Electronic Arts Inc. All rights reserved
11 | // Modifications Copyright (c) 2020 Eyal Rozenberg.
12 | //
13 | // Redistribution and use in source and binary forms, with or without
14 | // modification, are permitted provided that the following conditions are met:
15 | //
16 | // 1. Redistributions of source code must retain the above copyright notice, this
17 | // list of conditions and the following disclaimer.
18 | //
19 | // 2. Redistributions in binary form must reproduce the above copyright notice,
20 | // this list of conditions and the following disclaimer in the documentation
21 | // and/or other materials provided with the distribution.
22 | //
23 | // 3. Neither the name of the copyright holder nor the names of its
24 | // contributors may be used to endorse or promote products derived from
25 | // this software without specific prior written permission.
26 | //
27 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 | // DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
31 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
33 | // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
34 | // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
35 | // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 | //
38 | // Note: Retrieved from https://github.com/electronicarts/EASTL/ , master branch,
39 | // on 2020-03-11.
40 | 
41 | 
42 | #ifndef CUDA_KAT_REFERENCE_WRAPPER_HPP_
43 | #define CUDA_KAT_REFERENCE_WRAPPER_HPP_
44 | 
45 | #include
46 | #include
47 | #include
48 | 
49 | namespace kat {
50 | 
51 | /// reference_wrapper
52 | template <typename T>
53 | class reference_wrapper
54 | {
55 | public:
56 | typedef T type;
57 | 
58 | KAT_HD reference_wrapper(T&) noexcept;
59 | KAT_HD reference_wrapper(T&&) = delete;
60 | KAT_HD reference_wrapper(const reference_wrapper& x) noexcept;
61 | 
62 | KAT_HD reference_wrapper& operator=(const reference_wrapper& x) noexcept;
63 | 
64 | KAT_HD operator T& () const noexcept;
65 | KAT_HD T& get() const noexcept;
66 | 
67 | template <typename... ArgTypes>
68 | KAT_HD typename std::result_of<T&(ArgTypes&&...)>::type operator() (ArgTypes&&...) const;
69 | 
70 | private:
71 | T* val;
72 | };
73 | 
74 | template <typename T>
75 | KAT_HD reference_wrapper<T>::reference_wrapper(T &v) noexcept
76 | // Originally, EASTL has:
77 | //
78 | // : val(addressof(v))
79 | //
80 | // here. But we can't use std::addressof, since it is not accessible in device-side code;
81 | // and we don't have the utility functions implemented in device-and-host versions.
82 | // So - we'll just inline an implementation of std::addressof() here instead
83 | : val(
84 | reinterpret_cast<T*>(
85 | &const_cast<char&>(
86 | reinterpret_cast<const volatile char&>(v)
87 | )
88 | )
89 | )
90 | {}
91 | 
92 | template <typename T>
93 | KAT_HD reference_wrapper<T>::reference_wrapper(const reference_wrapper& other) noexcept
94 | : val(other.val)
95 | {}
96 | 
97 | template <typename T>
98 | KAT_HD reference_wrapper<T>& reference_wrapper<T>::operator=(const reference_wrapper& other) noexcept
99 | {
100 | val = other.val;
101 | return *this;
102 | }
103 | 
104 | template <typename T>
105 | KAT_HD reference_wrapper<T>::operator T&() const noexcept
106 | {
107 | return *val;
108 | }
109 | 
110 | template <typename T>
111 | KAT_HD T& reference_wrapper<T>::get() const noexcept
112 | {
113 | return *val;
114 | }
115 | 
116 | template <typename T>
117 | template <typename... ArgTypes>
118 | KAT_HD typename std::result_of<T&(ArgTypes&&...)>::type reference_wrapper<T>::operator() (ArgTypes&&...
args) const
119 | {
120 | // return std::invoke(*val, std::forward<ArgTypes>(args)...);
121 | return (*val)(std::forward<ArgTypes>(args)...);
122 | }
123 | 
124 | // reference_wrapper-specific utilities
125 | template <typename T>
126 | KAT_HD reference_wrapper<T> ref(T& t) noexcept
127 | {
128 | return kat::reference_wrapper<T>(t);
129 | }
130 | 
131 | template <typename T>
132 | KAT_HD void ref(const T&&) = delete;
133 | 
134 | template <typename T>
135 | KAT_HD reference_wrapper<T> ref(reference_wrapper<T> t) noexcept
136 | {
137 | return kat::ref(t.get());
138 | }
139 | 
140 | template <typename T>
141 | KAT_HD reference_wrapper<const T> cref(const T& t) noexcept
142 | {
143 | return kat::reference_wrapper<const T>(t);
144 | }
145 | 
146 | template <typename T>
147 | KAT_HD void cref(const T&&) = delete;
148 | 
149 | template <typename T>
150 | KAT_HD reference_wrapper<const T> cref(reference_wrapper<T> t) noexcept
151 | {
152 | return kat::cref(t.get());
153 | }
154 | 
155 | 
156 | // reference_wrapper-specific type traits
157 | template <typename T>
158 | struct is_reference_wrapper_helper
159 | : public std::false_type {};
160 | 
161 | template <typename T>
162 | struct is_reference_wrapper_helper< kat::reference_wrapper<T> >
163 | : public std::true_type {};
164 | 
165 | template <typename T>
166 | struct is_reference_wrapper
167 | : public kat::is_reference_wrapper_helper<typename std::remove_cv<T>::type> {};
168 | 
169 | 
170 | // Helper which adds a reference to a type when given a reference_wrapper of that type.
171 | template <typename T>
172 | struct remove_reference_wrapper
173 | { typedef T type; };
174 | 
175 | template <typename T>
176 | struct remove_reference_wrapper< kat::reference_wrapper<T> >
177 | { typedef T& type; };
178 | 
179 | template <typename T>
180 | struct remove_reference_wrapper< const kat::reference_wrapper<T> >
181 | { typedef T& type; };
182 | 
183 | /*
184 | // reference_wrapper specializations of invoke
185 | // These have to come after reference_wrapper is defined, but reference_wrapper needs to have a
186 | // definition of invoke, so these specializations need to come after everything else has been defined.
187 | template <typename R, typename C, typename T, typename... Args>
188 | auto invoke_impl(R (C::*func)(Args...), T&& obj, Args&&... args) ->
189 | typename std::enable_if<is_reference_wrapper<typename std::decay<T>::type>::value,
190 | decltype((obj.get().*func)(std::forward<Args>(args)...))>::type
191 | {
192 | return (obj.get().*func)(std::forward<Args>(args)...);
193 | }
194 | 
195 | template <typename M, typename C, typename T>
196 | auto invoke_impl(M(C::*member), T&& obj) ->
197 | typename std::enable_if<is_reference_wrapper<typename std::decay<T>::type>::value,
198 | decltype(obj.get().*member)>::type
199 | {
200 | return obj.get().*member;
201 | }
202 | */
203 | 
204 | } // namespace kat
205 | 
206 | #endif // CUDA_KAT_REFERENCE_WRAPPER_HPP_
207 | 
-------------------------------------------------------------------------------- /src/kat/utility.hpp: --------------------------------------------------------------------------------
1 | /**
2 | * @file kat/utility.hpp
3 | *
4 | * @brief An adaptation for host-and-device use of some
5 | * of the standard C++ library's `<utility>` code.
6 | */
7 | #pragma once
8 | #ifndef CUDA_KAT_UTILITY_HPP_
9 | #define CUDA_KAT_UTILITY_HPP_
10 | 
11 | #include
12 | 
13 | #include
14 | #include <utility> // Mainly so that KAT code can use our header as a drop-in for itself
15 | 
16 | ///@cond
17 | #include
18 | ///@endcond
19 | #include
20 | 
21 | 
22 | namespace kat {
23 | 
24 | #ifdef KAT_DEFINE_MOVE_AND_FORWARD
25 | template <typename T>
26 | constexpr KAT_FHD typename std::remove_reference<T>::type&& move(T&& v) noexcept
27 | {
28 | return static_cast<typename std::remove_reference<T>::type&&>(v);
29 | }
30 | 
31 | template <typename T>
32 | constexpr KAT_FHD T&& forward(typename std::remove_reference<T>::type& v) noexcept
33 | {
34 | return static_cast<T&&>(v);
35 | }
36 | 
37 | template <typename T>
38 | constexpr KAT_FHD T&& forward(typename std::remove_reference<T>::type&& v) noexcept
39 | {
40 | return static_cast<T&&>(v);
41 | }
42 | #endif
43 | 
44 | #if __cplusplus >= 201402L
45 | template <typename T, typename U = T>
46 | constexpr KAT_FHD auto exchange (T& x, U&& new_value) // TODO: A noexcept clause?
47 | {
48 | #ifndef KAT_DEFINE_MOVE_AND_FORWARD
49 | using std::move;
50 | using std::forward;
51 | #endif
52 | auto old_value = move(x);
53 | x = forward<U>(new_value);
54 | return old_value;
55 | }
56 | #endif // __cplusplus >= 201402L
57 | 
58 | /**
59 | * @brief Swap two values on the device-side, in-place.
60 | *
61 | * @note A (CUDA, or any other) compiler will often not actually
62 | * emit any code when this function is used. Instead, it will use
63 | * one argument instead of the other in later code, i.e. "swap"
64 | * them in its own internal figuring.
65 | *
66 | * @note Is this enough, without the multiple specializations for std::swap?
67 | * @todo How does EASTL swap work? Should I incorporate its specializations?
68 | *
69 | * @note Some kat types overload this default implementation.
70 | *
71 | */
72 | template <typename T>
73 | KAT_FHD CONSTEXPR_SINCE_CPP_14 void swap( T& a, T& b )
74 | noexcept(
75 | std::is_nothrow_move_constructible<T>::value &&
76 | std::is_nothrow_move_assignable<T>::value
77 | )
78 | {
79 | #ifndef KAT_DEFINE_MOVE_AND_FORWARD
80 | using std::move;
81 | #endif
82 | T tmp ( move(a) );
83 | a = move(b);
84 | b = move(tmp);
85 | }
86 | 
87 | namespace detail {
88 | 
89 | template <typename T>
90 | struct addr_impl_ref
91 | {
92 | T& v_;
93 | 
94 | KAT_FHD addr_impl_ref( T& v ): v_( v ) {}
95 | KAT_FHD operator T& () const { return v_; }
96 | 
97 | private:
98 | KAT_FHD addr_impl_ref & operator=(const addr_impl_ref &);
99 | };
100 | 
101 | template <typename T>
102 | struct addressof_impl
103 | {
104 | static KAT_FHD T* f( T& v, long ) {
105 | return reinterpret_cast<T*>(
106 | &const_cast<char&>(reinterpret_cast<const volatile char&>(v)));
107 | }
108 | 
109 | static KAT_FHD T* f( T* v, int ) { return v; }
110 | };
111 | 
112 | } // namespace detail
113 | 
114 | /**
115 | * @brief Obtains the actual address of the object or function arg, even in presence of overloaded `operator&()`
116 | *
117 | * @note In the standard library, this function is somehow in @ref `<memory>`.
118 | *
119 | * @{
120 | */
121 | template <typename T>
122 | KAT_FHD T* addressof( T& v ) {
123 | // Note the complex implementation details are due to some objects
124 | // overloading their & operator
125 | return detail::addressof_impl<T>::f( detail::addr_impl_ref<T>( v ), 0 );
126 | }
127 | 
128 | /** @} */
129 | template <typename T>
130 | KAT_FHD const T* addressof(const T&&) = delete;
131 | 
132 | } // namespace kat
133 | 
134 | #endif // CUDA_KAT_UTILITY_HPP_
135 | 
-------------------------------------------------------------------------------- /tests/CMakeLists.txt: --------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.8.2)
2 | 
3 | 
4 | ###############
5 | ## Modules ##
6 | ###############
7 | 
8 | # Standard CMake modules
9 | 
10 | # Custom modules
11 | 
12 | include(DocTest)
13 | 
14 | ################
15 | ## Packages ##
16 | ################
17 | 
18 | find_package(CUDA 8.0 REQUIRED)
19 | find_package(cuda-api-wrappers 0.3.0 REQUIRED)
20 | find_package(cuda-nvtx REQUIRED) # Actually, it's sort-of required by cuda-api-wrappers
21 | 
22 | include(CMakeDependentOption)
23 | CMAKE_DEPENDENT_OPTION(BUILD_PRINTING_RELATED_TESTS "Build (strf-based) printing-related tests" ON "BUILD_TESTS" ON)
24 | set(KEEP_PTX FALSE CACHE BOOL "Keep kernel PTX files for build targets")
25 | 
26 | # This overcomes some linking issues I've encountered... I'm sure there's a better solution
27 | set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_NVTX_LIBRARY} ${CUDA_cudadevrt_LIBRARY})
28 | 
29 | 
30 | #############
31 | ## Tests ##
32 | #############
33 | 
34 | cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS_TMP Auto)
35 | set(CUDA_ARCH_FLAGS ${CUDA_ARCH_FLAGS_TMP} CACHE STRING "CUDA gencode parameters")
36 | string(REPLACE ";" " " CUDA_ARCH_FLAGS_STR "${CUDA_ARCH_FLAGS}")
37 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS_STR}")
38 | 
39 | #add_test(
40 | # # Use some per-module/project prefix so that it is easier to run only tests for this module
41 | # NAME ${PROJECT_NAME}.failtest
42 | # COMMAND failtest ${TEST_RUNNER_PARAMS}
43 | #)
44 | ##target_set_warnings(${TEST_MAIN} ENABLE ALL AS_ERROR ALL DISABLE Annoying) # Set warnings (if needed).
45 | #set_tests_properties(
46 | # ${PROJECT_NAME}.failtest
47 | # PROPERTIES
48 | # WILL_FAIL TRUE # We expect this test to fail
49 | #)
50 | 
51 | add_library(test_utils util/random.cu)
52 | set_target_properties(
53 | test_utils
54 | PROPERTIES
55 | CXX_STANDARD 14
56 | CXX_STANDARD_REQUIRED YES
57 | CXX_EXTENSIONS NO
58 | )
59 | 
60 | set(tests
61 | array
62 | shared_memory
63 | math
64 | shuffle
65 | atomics
66 | constexpr_math
67 | time
68 | c_string
69 | span
70 | miscellany
71 | builtins
72 | grid_collaboration
73 | block_collaboration
74 | warp_collaboration
75 | tuple
76 | sequence_ops
77 | )
78 | 
79 | if (BUILD_PRINTING_RELATED_TESTS)
80 | list(APPEND tests printing)
81 | endif()
82 | 
83 | foreach(TEST_TARGET ${tests})
84 | add_executable(${TEST_TARGET} "${TEST_TARGET}.cu")
85 | target_compile_options(${TEST_TARGET} PRIVATE "--expt-relaxed-constexpr")
86 | target_compile_options(${TEST_TARGET} PRIVATE "--expt-extended-lambda")
87 | target_link_libraries(${TEST_TARGET} PRIVATE cuda-kat cuda-api-wrappers::cuda-api-wrappers doctest ${CUDA_LIBRARIES} test_utils)
88 | # I don't see why the following line should even be necessary. Depending on the libraries should be enough to get us their include dirs!
89 | target_include_directories(${TEST_TARGET} PRIVATE ${PROJECT_SOURCE_DIR}/src)
90 | 
91 | set_target_properties(
92 | ${TEST_TARGET}
93 | PROPERTIES
94 | CXX_STANDARD 14
95 | CXX_STANDARD_REQUIRED YES
96 | CXX_EXTENSIONS NO
97 | )
98 | add_test(
99 | NAME ${PROJECT_NAME}.${TEST_TARGET}
100 | COMMAND ${TEST_TARGET} ${TEST_RUNNER_PARAMS}
101 | )
102 | if (KEEP_PTX)
103 | target_compile_options(${TEST_TARGET} PRIVATE "--keep")
104 | endif()
105 | endforeach(TEST_TARGET)
106 | 
107 | if (BUILD_PRINTING_RELATED_TESTS)
108 | target_link_libraries(printing PRIVATE strf::strf-header-only)
109 | target_compile_options(printing PRIVATE --ptxas-options --suppress-stack-size-warning)
110 | endif()
111 | 
112 | # TODO: Something about code coverage perhaps?
-------------------------------------------------------------------------------- /tests/common.cuh: --------------------------------------------------------------------------------
1 | #ifndef CUDA_KAT_TESTS_COMMON_CUH
2 | #define CUDA_KAT_TESTS_COMMON_CUH
3 | 
4 | #include "util/prettyprint.hpp"
5 | #include "util/type_name.hpp"
6 | #include "util/random.hpp"
7 | #include "util/miscellany.cuh"
8 | #include "util/macro.h"
9 | #include "util/printing.hpp"
10 | 
11 | 
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include
20 | #include
21 | 
22 | #endif // CUDA_KAT_TESTS_COMMON_CUH
-------------------------------------------------------------------------------- /tests/shared_memory.cu: --------------------------------------------------------------------------------
1 | #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
2 | 
3 | #include "common.cuh"
4 | 
5 | #include
6 | #include
7 | 
8 | using namespace kat;
9 | 
10 | using shmem_size_t = shared_memory::size_t;
11 | 
12 | 
13 | struct sizes_t {
14 | shmem_size_t dynamic;
15 | shmem_size_t static_;
16 | shmem_size_t total;
17 | };
18 | 
19 | namespace kernels {
20 | 
21 | 
22 | template <shmem_size_t StaticSize>
23 | __global__ void determine_sizes(sizes_t* results)
24 | {
25 | static __shared__ char arr[StaticSize];
26 | arr[0] = 0;
27 | arr[1] = arr[0];
28 | results->dynamic = shared_memory::dynamic::size();
29 | results->static_ = shared_memory::static_::size();
30 | results->total = shared_memory::size();
31 | }
32 | 
33 | template <>
34 | __global__ void determine_sizes<0>(sizes_t* results)
35 | {
36 | results->dynamic = shared_memory::dynamic::size();
37 | results->static_ = shared_memory::static_::size();
38 | results->total = shared_memory::size();
39 | }
40 | 
41 | template <typename I>
42 | __global__ void check_overlap(shmem_size_t num_elements_per_warp, shmem_size_t* num_overlaps_encountered_by_warp)
43 | {
44 | auto warp_shared_mem = shared_memory::dynamic::warp_specific::contiguous<I>(num_elements_per_warp);
45 | 
46 | // Note: The rest of this kernel will use as little as possible kat functionality, so as not
47 | // to mix up testing different parts of the library. The price is some idiosyncrasy. Also,
48 | // we'll let just a single thread of each warp act, so as not to worry about intra-warp collaboration.
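// (Layout sketch, under the contiguous partitioning used above: with n =
// num_elements_per_warp, warp 0 owns elements [0, n), warp 1 owns [n, 2n),
// and so on. If any two warps' stretches overlapped, some element would end
// up with a count other than 1 after the increments below.)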
49 | 
50 | auto am_first_in_warp = (threadIdx.x % warp_size == 0);
51 | if (not am_first_in_warp) { return; }
52 | 
53 | // clear the warp's shared memory
54 | for(shmem_size_t i = 0; i < num_elements_per_warp; i++) { warp_shared_mem[i] = I{0}; }
55 | __syncthreads();
56 | 
57 | // touch every I-element in this warp's shared memory, in a way in which overlaps between warps'
58 | // shared memory stretches would be detected
59 | 
60 | for(shmem_size_t i = 0; i < num_elements_per_warp; i++) {
61 | atomic::increment(&(warp_shared_mem[i]));
62 | }
63 | __syncthreads();
64 | 
65 | // This could have been an std::count_if
66 | 
67 | shmem_size_t num_overlaps_encountered { 0 };
68 | for(shmem_size_t i = 0; i < num_elements_per_warp; i++) {
69 | if (warp_shared_mem[i] != (I{1})) { num_overlaps_encountered++; }
70 | }
71 | auto warp_index = threadIdx.x / warp_size;
72 | num_overlaps_encountered_by_warp[warp_index] = num_overlaps_encountered;
73 | }
74 | 
75 | 
76 | } // namespace kernels
77 | 
78 | 
79 | TEST_SUITE("shared_memory") {
80 | 
81 | TEST_CASE("correctly determining static and dynamic sizes")
82 | {
83 | constexpr const shmem_size_t allocation_quantum { 256 };
84 | // It seems that static shared memory is allocated in quanta; and that the dynamic shared memory
85 | // can fill in the gap in the last quantum if necessary
86 | 
87 | constexpr const shmem_size_t dynamic_shmem_sizes[] = { 0, 1, allocation_quantum, allocation_quantum+1 };
88 | constexpr const shmem_size_t used_static_shmem_sizes[] = { 1, allocation_quantum - 1, allocation_quantum, allocation_quantum + 1 };
89 | 
90 | // Target architecture Shared memory allocation unit size
91 | // sm_2x 128 bytes
92 | // sm_3x, sm_5x, sm_6x, sm_7x 256 bytes
93 | 
94 | 
95 | auto device { cuda::device::current::get() };
96 | auto device_side_results { cuda::memory::device::make_unique<sizes_t>(device) };
97 | for (auto dynamic_shared_mem_size : dynamic_shmem_sizes) {
98 | // TODO: The following should really be a "for constexpr" - but that doesn't exist yet
99 | for (auto i = 0; i < array_length(used_static_shmem_sizes); i++) {
100 | auto launch_config { cuda::make_launch_config(1, 1, dynamic_shared_mem_size) };
101 | sizes_t host_side_results;
102 | switch(i) {
103 | case 0: cuda::launch(kernels::determine_sizes<used_static_shmem_sizes[0]>, launch_config, device_side_results.get()); break;
104 | case 1: cuda::launch(kernels::determine_sizes<used_static_shmem_sizes[1]>, launch_config, device_side_results.get()); break;
105 | case 2: cuda::launch(kernels::determine_sizes<used_static_shmem_sizes[2]>, launch_config, device_side_results.get()); break;
106 | case 3: cuda::launch(kernels::determine_sizes<used_static_shmem_sizes[3]>, launch_config, device_side_results.get()); break;
107 | }
108 | auto static_shared_mem_size { used_static_shmem_sizes[i] };
109 | auto aligned_total_size { round_up(static_shared_mem_size + dynamic_shared_mem_size, allocation_quantum) };
110 | cuda::memory::copy(&host_side_results, device_side_results.get(), sizeof(sizes_t));
111 | CHECK(host_side_results.dynamic == dynamic_shared_mem_size);
112 | // TODO: Figure out the exact rule for how much static shared memory is actually allocated. Apparently
113 | // it depends on the existence of other kernels (???)
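// (Worked example of the round_up() computation above, assuming the 256-byte
// quantum: static = 255 with dynamic = 1 rounds up to a 256-byte total,
// while static = 257 with dynamic = 1 rounds up to 512.)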
114 | // CHECK(host_side_results.static_ == aligned_total_size - dynamic_shared_mem_size);
115 | // CHECK(host_side_results.total == aligned_total_size);
116 | }
117 | }
118 | }
119 | 
120 | TEST_CASE_TEMPLATE("allocations of per-warp shared memory do not intersect", I, int32_t, int64_t)
121 | {
122 | cuda::device_t device { cuda::device::current::get() };
123 | auto max_shared_mem = device.properties().sharedMemPerBlock;
124 | auto num_warps = device.properties().max_warps_per_block();
125 | shmem_size_t shared_mem_per_warp = max_shared_mem / num_warps;
126 | shmem_size_t num_shmem_elements_per_warp = shared_mem_per_warp / sizeof(I);
127 | auto block_size = num_warps * warp_size;
128 | auto launch_config { cuda::make_launch_config(1, block_size, num_shmem_elements_per_warp * sizeof(I) * num_warps) };
129 | auto device_side_results { cuda::memory::device::make_unique<shmem_size_t[]>(device, num_warps) };
130 | auto host_side_results { std::unique_ptr<shmem_size_t[]>(new shmem_size_t[num_warps]) };
131 | cuda::launch(kernels::check_overlap<I>, launch_config, num_shmem_elements_per_warp, device_side_results.get());
132 | cuda::memory::copy(host_side_results.get(), device_side_results.get(), sizeof(shmem_size_t) * num_warps);
133 | auto num_overlaps_found = std::accumulate(host_side_results.get(), host_side_results.get() + num_warps, 0);
134 | CHECK(num_overlaps_found == 0);
135 | }
136 | 
137 | } // TEST_SUITE("shared_memory")
-------------------------------------------------------------------------------- /tests/time.cu: --------------------------------------------------------------------------------
1 | #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
2 | // #include "common.cuh"
3 | //#include "util/prettyprint.hpp"
4 | #include "util/type_name.hpp"
5 | //#include "util/random.hpp"
6 | //#include "util/miscellany.cuh"
7 | //#include "util/macro.h"
8 | #include
9 | #include
10 | 
11 | #include
12 | #include
13 | 
14 | #include
15 | 
16 | constexpr const auto num_grid_blocks { 2 };
17 | constexpr const auto block_size { kat::warp_size + 1 };
18 | 
19 | constexpr const auto sleep_elongation_multiplicative_factor { 10000 };
20 | // we want each sleep command to take a not-insignificant amount of time
21 | 
22 | constexpr const auto sleep_elongation_additive_factor { 10 };
23 | // we want each sleep command to take a not-insignificant amount of time
24 | 
25 | 
26 | namespace kernels {
27 | 
28 | template <kat::sleep_resolution Resolution>
29 | __global__ void measure_time_and_sleep(
30 | kat::clock_value_t* __restrict__ times_before_sleep,
31 | kat::clock_value_t* __restrict__ times_after_sleep,
32 | std::size_t total_num_threads
33 | )
34 | {
35 | auto global_thread_id = threadIdx.x + blockIdx.x * blockDim.x;
36 | if (global_thread_id >= total_num_threads) { return; }
37 | auto time_before_sleep = clock64();
38 | auto sleep_duration =
39 | (global_thread_id + sleep_elongation_additive_factor ) * sleep_elongation_multiplicative_factor;
40 | if (Resolution == kat::sleep_resolution::nanoseconds) {
41 | #if __CUDA_ARCH__ >= 700
42 | kat::sleep<kat::sleep_resolution::nanoseconds>(sleep_duration);
43 | #else
44 | // we won't break the compilation; it's up to the host-side test code to not run this.
45 | asm("trap;");
46 | #endif
47 | }
48 | else {
49 | kat::sleep<kat::sleep_resolution::clock_cycles>(sleep_duration);
50 | }
51 | kat::collaborative::block::barrier();
52 | auto time_after_sleep = clock64();
53 | times_before_sleep[global_thread_id] = time_before_sleep;
54 | times_after_sleep[global_thread_id] = time_after_sleep;
55 | // thread_printf("Have slept for %u units.
Time before sleep = %20lld, after = %20lld",
56 | // (unsigned) sleep_duration, time_before_sleep, time_after_sleep);
57 | }
58 | 
59 | } // namespace kernels
60 | 
61 | 
62 | template <typename T, T Value>
63 | struct value_as_type {
64 | static constexpr const T value { Value };
65 | };
66 | 
67 | TEST_SUITE("time") {
68 | 
69 | TEST_CASE_TEMPLATE("measure_time_and_sleep", ResolutionValueAsType,
70 | value_as_type<kat::sleep_resolution, kat::sleep_resolution::clock_cycles>,
71 | value_as_type<kat::sleep_resolution, kat::sleep_resolution::nanoseconds>)
72 | {
73 | constexpr const kat::sleep_resolution resolution { ResolutionValueAsType::value };
74 | 
75 | auto device { cuda::device::current::get() };
76 | // TODO: Test shuffles with non-full warps.
77 | if ((device.properties().compute_architecture().major < 7) and
78 | (resolution == kat::sleep_resolution::nanoseconds))
79 | {
80 | // nanosecond-resolution sleep is only supported starting from Volta/Turing
81 | return;
82 | }
83 | device.reset();
84 | auto launch_config { cuda::make_launch_config(num_grid_blocks, block_size) };
85 | std::size_t total_num_threads = launch_config.grid_dimensions.volume() * launch_config.block_dimensions.volume();
86 | auto times_before_sleep = cuda::memory::device::make_unique<kat::clock_value_t[]>(device, total_num_threads);
87 | auto times_after_sleep = cuda::memory::device::make_unique<kat::clock_value_t[]>(device, total_num_threads);
88 | auto kernel = ::kernels::measure_time_and_sleep<resolution>;
89 | cuda::launch(kernel, launch_config,
90 | times_before_sleep.get(), times_after_sleep.get(), total_num_threads);
91 | cuda::outstanding_error::ensure_none();
92 | auto host_times_before_sleep = std::make_unique<kat::clock_value_t[]>(total_num_threads);
93 | auto host_times_after_sleep = std::make_unique<kat::clock_value_t[]>(total_num_threads);
94 | cuda::memory::copy(host_times_before_sleep.get(), times_before_sleep.get(), total_num_threads * sizeof(kat::clock_value_t));
95 | cuda::memory::copy(host_times_after_sleep.get(), times_after_sleep.get(), total_num_threads * sizeof(kat::clock_value_t));
96 | 
97 | device.synchronize();
98 | 
99 | for(cuda::grid::block_dimension_t block_id = 0; block_id < num_grid_blocks; block_id++) {
100 | 
101 | std::vector<kat::clock_value_t> block_times_before_sleep {
102 | host_times_before_sleep.get() + block_id * block_size,
103 | host_times_before_sleep.get() + (block_id+1) * block_size
104 | };
105 | std::vector<kat::clock_value_t> block_times_after_sleep {
106 | host_times_after_sleep.get() + block_id * block_size,
107 | host_times_after_sleep.get() + (block_id+1) * block_size
108 | };
109 | 
110 | // std::cout << "Resolution: "
111 | // << (resolution == kat::sleep_resolution::clock_cycles ? "clock_cycles" : "")
112 | // << (resolution == kat::sleep_resolution::nanoseconds ?
"nanoseconds" : "") 113 | // << std::endl; 114 | 115 | for(cuda::grid::dimension_t thread_index = 0; thread_index < block_size; ++thread_index) { 116 | CHECK(block_times_before_sleep[thread_index] < block_times_after_sleep[thread_index]); 117 | // std::cout 118 | // << "Block " << std::setw(4) << block_id << ", Thread " << std::setw(4) << thread_index << ": " 119 | // << "Before sleep: " << std::setw(20) << tbs[thread_index] << ' ' 120 | // << "After sleep: " << std::setw(20) << tas[thread_index] << std::endl; 121 | } 122 | 123 | auto max_time_before_sleep = *std::max_element(block_times_before_sleep.begin(), block_times_before_sleep.end()); 124 | auto min_time_after_sleep = *std::min_element(block_times_after_sleep.begin(), block_times_after_sleep.end()); 125 | CHECK_LT(max_time_before_sleep, min_time_after_sleep); 126 | 127 | // std::cout 128 | // << " Max time before sleep: " << std::setw(20) << max_time_before_sleep 129 | // << " Min time after sleep: " << std::setw(20) << min_time_after_sleep << std::endl; 130 | } 131 | } 132 | 133 | } // TEST_SUITE("time") 134 | -------------------------------------------------------------------------------- /tests/util/cpu_builtin_equivalents.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_KAT_TEST_UTIL_CPU_BUILTIN_EQUIVALENTS_HPP_ 2 | #define CUDA_KAT_TEST_UTIL_CPU_BUILTIN_EQUIVALENTS_HPP_ 3 | 4 | #include 5 | 6 | 7 | template 8 | constexpr inline I absolute_value(I x) 9 | { 10 | static_assert(std::is_integral::value, "Only to be used for integral types"); 11 | return x > 0 ? x : I(-x); 12 | } 13 | 14 | template <> constexpr inline float absolute_value(float x) { return std::abs(x); } 15 | template <> constexpr inline double absolute_value(double x) { return std::abs(x); } 16 | 17 | namespace detail { 18 | template 19 | constexpr inline std::make_unsigned_t absolute_difference(std::false_type, I x, I y) 20 | { 21 | // unsigned case 22 | return x < y ? y-x : x-y; 23 | } 24 | 25 | template 26 | constexpr inline std::make_unsigned_t absolute_difference(std::true_type, I x, I y) 27 | { 28 | // signed case 29 | 30 | auto have_same_sign = (x > 0) == (y > 0); 31 | if (have_same_sign) { 32 | return x < y ? y-x : x-y; 33 | } 34 | using uint_t = std::make_unsigned_t; 35 | return x < y ? 
		uint_t(-x) + uint_t(y) :
		uint_t(x) + uint_t(-y);
}

} // namespace detail

// This may be a poor implementation, don't use it elsewhere
template <typename I>
constexpr inline std::make_unsigned_t<I> absolute_difference(I x, I y)
{
	static_assert(std::is_integral<I>::value, "Only to be used for integral types");
	using is_signed = std::integral_constant<bool, std::is_signed<I>::value>;
	return detail::absolute_difference(is_signed{}, x, y);
}



template <typename I> int population_count(I x)
{
	static_assert(std::is_integral<I>::value, "Only integral types are supported");
	static_assert(sizeof(I) <= sizeof(unsigned long long), "Unexpectedly large type");

	using native_popc_type =
		typename std::conditional<
			sizeof(I) <= sizeof(unsigned),
			unsigned,
			unsigned long long
		>::type;
	return population_count(static_cast<native_popc_type>(x));
}

template <typename I> int population_count(I x);

template<> inline int population_count(unsigned x)           { return __builtin_popcount(x); }
template<> inline int population_count(unsigned long x)      { return __builtin_popcountl(x); }
template<> inline int population_count(unsigned long long x) { return __builtin_popcountll(x); }

template <typename I> inline I bit_reverse(I x)
{
	static_assert(std::is_integral<I>::value and sizeof(I) <= 8, "bit_reverse is only available for integers with 64 bits or less");
	switch(sizeof(I)) {
	case 1:  return bit_reverse(reinterpret_cast<uint8_t&>(x));
	case 2:  return bit_reverse(reinterpret_cast<uint16_t&>(x));
	case 4:  return bit_reverse(reinterpret_cast<uint32_t&>(x));
	default: return bit_reverse(reinterpret_cast<uint64_t&>(x));
	}
}

template <>
inline uint8_t bit_reverse(uint8_t x)
{
	static unsigned char lookup[16] = {
		0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
		0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf,
	};

	// Reverse top half, reverse lower half, and swap them.
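	// For example: x = 0b0110'0010 has lookup[0b0010] = 0b0100 and
	// lookup[0b0110] = 0b0110, so the result is (0b0100 << 4) | 0b0110
	// = 0b0100'0110 - the bits of x in reverse order.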
93 | return (lookup[x & 0b1111] << 4) | lookup[x >> 4]; 94 | } 95 | 96 | template <> 97 | inline uint16_t bit_reverse(uint16_t x) 98 | { 99 | return (bit_reverse(x & 0xFF) << 8) | bit_reverse(x >> 8); 100 | } 101 | 102 | 103 | template <> 104 | inline uint32_t bit_reverse(uint32_t x) 105 | { 106 | return (bit_reverse(x & 0xFFFF) << 16) | bit_reverse(x >> 16); 107 | } 108 | 109 | template <> 110 | inline uint64_t bit_reverse(uint64_t x) 111 | { 112 | return (uint64_t{bit_reverse(x & 0xFFFFFFFF)} << 32) | bit_reverse(x >> 32); 113 | } 114 | 115 | 116 | 117 | #endif // CUDA_KAT_TEST_UTIL_CPU_BUILTIN_EQUIVALENTS_HPP_ 118 | -------------------------------------------------------------------------------- /tests/util/macro.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTS_UTIL_MACRO_H_ 2 | #define TESTS_UTIL_MACRO_H_ 3 | 4 | 5 | #if defined(__GNUC__) && __GNUC__ >= 4 6 | #ifndef UNLIKELY 7 | #define LIKELY(x) (__builtin_expect((x), 1)) 8 | #define UNLIKELY(x) (__builtin_expect((x), 0)) 9 | #endif /* UNLIKELY */ 10 | #else /* defined(__GNUC__) && __GNUC__ >= 4 */ 11 | #ifndef UNLIKELY 12 | #define LIKELY(x) (x) 13 | #define UNLIKELY(x) (x) 14 | #endif /* UNLIKELY */ 15 | #endif /* defined(__GNUC__) && __GNUC__ >= 4 */ 16 | 17 | #ifndef UNUSED 18 | #define UNUSED(x) (void) x 19 | #endif 20 | 21 | #define EXPAND(_x) _x 22 | #define QUOTE(_q) #_q 23 | #define STRINGIZE(_q) #_q 24 | 25 | #ifndef CONCATENATE 26 | #define CONCATENATE( s1, s2 ) s1 ## s2 27 | #define EXPAND_THEN_CONCATENATE( s1, s2 ) CONCATENATE( s1, s2 ) 28 | #endif /* CONCATENATE */ 29 | 30 | #define AS_SINGLE_ARGUMENT(...) __VA_ARGS__ 31 | 32 | /** 33 | * This macro expands into a different identifier in every expansion. 34 | * Note that you _can_ clash with an invocation of UNIQUE_IDENTIFIER 35 | * by manually using the same identifier elsewhere; or by carefully 36 | * choosing another prefix etc. 37 | */ 38 | #ifdef __COUNTER__ 39 | #define UNIQUE_IDENTIFIER(prefix) EXPAND_THEN_CONCATENATE(prefix, __COUNTER__) 40 | #else 41 | #define UNIQUE_IDENTIFIER(prefix) EXPAND_THEN_CONCATENATE(prefix, __LINE__) 42 | #endif /* COUNTER */ 43 | 44 | #define COUNT_THIS_LINE static_assert(__COUNTER__ + 1, ""); 45 | #define START_COUNTING_LINES(count_name) enum { EXPAND_THEN_CONCATENATE(count_name,_start) = __COUNTER__ }; 46 | #define FINISH_COUNTING_LINES(count_name) enum { count_name = __COUNTER__ - EXPAND_THEN_CONCATENATE(count_name,_start) - 1 }; 47 | 48 | 49 | ///** 50 | // * This macro expands into a different identifier in every expansion. 51 | // * Note that you _can_ clash with an invocation of UNIQUE_IDENTIFIER 52 | // * by manually using the same identifier elsewhere; or by carefully 53 | // * choosing another prefix etc. 


/**
 * Map macro - applying an arbitrary macro to multiple arguments;
 * based on the discussion and William Swanson's suggestion here:
 * http://stackoverflow.com/q/6707148/1593077
 *
 * Usage example:
 *
 *   #define DO_SOMETHING(x) char const *x##_string = #x;
 *   MAP(DO_SOMETHING, foo, bar, baz)
 *
 * will expand to
 *
 *   char const *foo_string = "foo";
 *   char const *bar_string = "bar";
 *   char const *baz_string = "baz";
 *
 */

#define EVAL0(...) __VA_ARGS__
#define EVAL1(...) EVAL0 (EVAL0 (EVAL0 (__VA_ARGS__)))
#define EVAL2(...) EVAL1 (EVAL1 (EVAL1 (__VA_ARGS__)))
#define EVAL3(...) EVAL2 (EVAL2 (EVAL2 (__VA_ARGS__)))
#define EVAL4(...) EVAL3 (EVAL3 (EVAL3 (__VA_ARGS__)))
#define EVAL(...)  EVAL4 (EVAL4 (EVAL4 (__VA_ARGS__)))

#define MAP_END(...)
#define MAP_OUT

#define MAP_GET_END() 0, MAP_END
#define MAP_NEXT0(test, next, ...) next MAP_OUT
#define MAP_NEXT1(test, next) MAP_NEXT0 (test, next, 0)
#define MAP_NEXT(test, next)  MAP_NEXT1 (MAP_GET_END test, next)

/**
 * Use the third of these macros to apply a unary macro to all other arguments
 * passed, e.g.
 *
 *   #define MY_UNARY(x) call_foo(x, 123)
 *   MAP(MY_UNARY, 456, 789);
 *
 * will expand to
 *
 *   call_foo(456, 123);
 *   call_foo(789, 123);
 *
 */
#define MAP0(f, x, peek, ...) f(x) MAP_NEXT (peek, MAP1) (f, peek, __VA_ARGS__)
#define MAP1(f, x, peek, ...) f(x) MAP_NEXT (peek, MAP0) (f, peek, __VA_ARGS__)
#define MAP(f, ...) EVAL (MAP1 (f, __VA_ARGS__, (), 0))

/**
 * Same as MAP/MAP1/MAP0, but used for macros with pairs of arguments, and
 * specifying the first one
 */
#define MAP_BINARY0(f, fixed_arg, x, peek, ...) f(fixed_arg, x) MAP_NEXT (peek, MAP_BINARY1) (f, fixed_arg, peek, __VA_ARGS__)
#define MAP_BINARY1(f, fixed_arg, x, peek, ...) f(fixed_arg, x) MAP_NEXT (peek, MAP_BINARY0) (f, fixed_arg, peek, __VA_ARGS__)
#define MAP_BINARY(f, fixed_arg, ...) EVAL (MAP_BINARY1 (f, fixed_arg, __VA_ARGS__, (), 0))

/**
 * Same as MAP/MAP1/MAP0, but used for macros with triplets of arguments, and
 * specifying the first and second ones
 */
#define MAP_TRINARY0(f, first_fixed_arg, second_fixed_arg, x, peek, ...) f(first_fixed_arg, second_fixed_arg, x) MAP_NEXT (peek, MAP_TRINARY1) (f, first_fixed_arg, second_fixed_arg, peek, __VA_ARGS__)
#define MAP_TRINARY1(f, first_fixed_arg, second_fixed_arg, x, peek, ...) f(first_fixed_arg, second_fixed_arg, x) MAP_NEXT (peek, MAP_TRINARY0) (f, first_fixed_arg, second_fixed_arg, peek, __VA_ARGS__)
#define MAP_TRINARY(f, first_fixed_arg, second_fixed_arg, ...) EVAL (MAP_TRINARY1 (f, first_fixed_arg, second_fixed_arg, __VA_ARGS__, (), 0))

/**
 * Compile a different piece of code based on compile-time evaluation of a condition;
 * the condition must evaluate to 1 or to 0, exactly, or this will fail.
 *
 * Usage:
 *
 *   IF_ELSE( GCC_VERSION > 4 )(code in case condition holds)(code in case condition fails)
 */

#define IF_ELSE(condition) _IF_ ## condition
#define _IF_1(...) __VA_ARGS__ _IF_1_ELSE
#define _IF_0(...) _IF_0_ELSE

#define _IF_1_ELSE(...)
#define _IF_0_ELSE(...) __VA_ARGS__

/**
 * Use this macro to instantiate tests for all integer types.
 */
#define INTEGER_TYPES \
	char, short, int, long, long long, \
	unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long

// These:
//
//   signed char, signed short, signed int, signed long, signed long long
//
// are the same as:
//
//   char, short, int, long, long long
//
// (strictly speaking, char is a distinct type from signed char, but it behaves
// like either signed char or unsigned char, both of which are listed), and
// these should be covered by the native types:
//
//   int8_t, int16_t, int32_t, int64_t,
//   uint8_t, uint16_t, uint32_t, uint64_t
//
// so the above should be sufficient



/**
 * Use this macro to instantiate tests for all floating-point types.
 */
#define FLOAT_TYPES float, double

#define ARRAY_TYPES_BY_SIZE \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array

#define debug_print(x) do { std::cout << STRINGIZE(x) << " = " << x << std::endl; } while(0)



#endif // TESTS_UTIL_MACRO_H_
--------------------------------------------------------------------------------
/tests/util/miscellany.cuh:
--------------------------------------------------------------------------------
#ifndef CUDA_KAT_TEST_MISC_UTILITIES_CUH_
#define CUDA_KAT_TEST_MISC_UTILITIES_CUH_

#include <doctest.h>
#include <cuda/api_wrappers.hpp>

#include <algorithm>
#include <iterator>
#include <type_traits>
#include <climits>
#include <cstddef>
#include <cstdint>

using fake_bool = int8_t; // so as not to have trouble with std::vector<bool>
static_assert(sizeof(bool) == sizeof(fake_bool), "unexpected size mismatch");


template <typename I>
constexpr inline I round_up(I x, I quantum) { return (x % quantum) ? (x + (quantum - (x % quantum))) : x; }

template <typename I>
constexpr inline I round_down(I x, I quantum) { return x - x % quantum; }

template <typename T, std::size_t Length>
constexpr inline std::size_t array_length(const T(&ref)[Length]) { return Length; }

// Should be constexpr - but only beginning in C++20
template <class InputIt>
bool inline all_of(InputIt first, InputIt last)
{
	static_assert(std::is_same<typename std::iterator_traits<InputIt>::value_type, bool>::value, "This function is intended for boolean-valued sequences only");
	return std::all_of(first, last, [](bool b) { return b; });
}

// Should be constexpr - but only beginning in C++20
template <typename Container>
bool all_of(const Container& c)
{
	static_assert(std::is_same<typename Container::value_type, bool>::value, "This function is intended for boolean-valued sequences only");
	return std::all_of(std::cbegin(c), std::cend(c), [](bool b) { return b; });
}

// Code for is_iterator lifted from:
// https://stackoverflow.com/a/12032923/1593077
template <typename T, typename = void>
struct is_iterator
{
	static constexpr bool value = false;
};

template <typename T>
struct is_iterator<T, typename std::enable_if<!std::is_same<typename std::iterator_traits<T>::value_type, void>::value>::type>
{
	static constexpr bool value = true;
};
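
// For instance (a hypothetical illustration, not part of the original):
//
//   static_assert(    is_iterator<int*>::value, "");
//   static_assert(not is_iterator<void*>::value, "");  // its value_type would be void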

/**
 * Use these next few types to make assertions regarding each member
 * of a template parameter pack, e.g.
 *
 *   static_assert(all_true<(Numbers == 0 || Numbers == 1)...>::value, "");
 *
 */
template <bool...> struct bool_pack;
template <bool... v>
using all_true = std::is_same<bool_pack<true, v...>, bool_pack<v..., true>>;

template <typename T>
constexpr inline std::size_t size_in_bits() { return sizeof(T) * CHAR_BIT; }
template <typename T>
constexpr inline std::size_t size_in_bits(const T&) { return sizeof(T) * CHAR_BIT; }


/**
 * Divides the left-hand-side by the right-hand-side, rounding up
 * to an integral result, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3.
 *
 * @param dividend the number to divide
 * @param divisor the number by which to divide
 * @return the least integer which is greater-or-equal to the exact
 * (non-integral) quotient dividend/divisor.
 *
 * @note sensitive to overflow, i.e. if dividend > std::numeric_limits<S>::max() - divisor,
 * the result will be incorrect
 */
template <typename S, typename T>
constexpr inline S div_rounding_up(const S& dividend, const T& divisor) {
	return (dividend + divisor - 1) / divisor;
	/*
	std::div_t div_result = std::div(dividend, divisor);
	return div_result.quot + !(!div_result.rem);
	*/
}

// C++14 version of [[maybe_unused]] ...
template <typename T>
inline void ignore(T &&) { }

namespace doctest {

const char* current_test_name() { return doctest::detail::g_cs->currentTest->m_name; }

} // namespace doctest

// #ifdef __GNUC__
template <typename T>
[[gnu::warning("Artificial warning to print a type name - please ignore")]]
inline void print_type() noexcept { return; }

template <typename T>
[[gnu::warning("Artificial warning to print a type name - please ignore")]]
inline void print_type_of(T&& x) noexcept { return; }
// #endif

namespace kernels {

template <typename T, typename Size>
__global__ void fill(T* buffer, T value, Size length)
{
	// essentially, grid-level fill
	Size num_grid_threads = blockDim.x * gridDim.x;
	for(Size pos = threadIdx.x + blockIdx.x * blockDim.x;
		pos < length;
		pos += num_grid_threads)
	{
		buffer[pos] = value;
	}
}

} // namespace kernels

cuda::launch_configuration_t
make_busy_config(cuda::device_t& device) {
	auto prop = device.properties();
	auto sm_busy_factor = 2;
	auto num_blocks = prop.multiProcessorCount * sm_busy_factor;
	auto block_busy_factor = 4; // probably not the right number
	auto num_threads_per_block = cuda::warp_size * block_busy_factor;
	return cuda::make_launch_config(num_blocks, num_threads_per_block);
}

inline constexpr cuda::launch_configuration_t single_thread_launch_config() noexcept
{
	return { cuda::grid::dimensions_t::point(), cuda::grid::dimensions_t::point() };
}

// Poor man's addressof
template <typename T>
T* addressof(T& arg)
{
	return reinterpret_cast<T*>(
		&const_cast<char&>(reinterpret_cast<const volatile char&>(arg)));
}
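
// Note: the cast chain above mirrors the classic std::addressof trick - going
// through a char reference obtains the real address even when T has an
// overloaded operator&, and the const_cast lets it work for cv-qualified T.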


#endif /* CUDA_KAT_TEST_MISC_UTILITIES_CUH_ */
--------------------------------------------------------------------------------
/tests/util/poor_mans_constexpr_string.hpp:
--------------------------------------------------------------------------------
#ifndef CUDA_KAT_TEST_POOR_MANS_CONSTEXPR_STRING_HPP_
#define CUDA_KAT_TEST_POOR_MANS_CONSTEXPR_STRING_HPP_

#include <cstddef>
#include <stdexcept>
#include <ostream>

///@cond
#include <kat/detail/execution_space_specifiers.hpp>
///@endcond

# if __cplusplus < 201103
#  error "C++11 or later required"
# elif __cplusplus < 201402
#  define CONSTEXPR14_TN
# else
#  define CONSTEXPR14_TN constexpr
# endif

namespace util {

class constexpr_string
{
	const char* const  p_;
	const std::size_t  sz_;

public:
	typedef const char* const_iterator;

	template <std::size_t N>
	constexpr KAT_FHD constexpr_string(const char(&a)[N]) noexcept
	: p_(a)
	, sz_(N-1)
	{}

	constexpr KAT_FHD constexpr_string(const char* p, std::size_t N) noexcept
	: p_(p)
	, sz_(N)
	{}

	constexpr KAT_FHD const char* data() const noexcept {return p_;}
	constexpr KAT_FHD std::size_t size() const noexcept {return sz_;}

	constexpr KAT_FHD const_iterator begin() const noexcept {return p_;}
	constexpr KAT_FHD const_iterator end()   const noexcept {return p_ + sz_;}

	constexpr KAT_FHD char operator[](std::size_t n) const
	{
		return n < sz_ ? p_[n] :
#ifdef __CUDA_ARCH__
			0;
#else
			throw std::out_of_range("constexpr_string");
#endif
	}
};

KAT_FHD
std::ostream&
operator<<(std::ostream& os, constexpr_string const& s)
{
	return os.write(s.data(), s.size());
}

} // namespace util
#include 

#endif // CUDA_KAT_TEST_POOR_MANS_CONSTEXPR_STRING_HPP_
--------------------------------------------------------------------------------
/tests/util/printing.hpp:
--------------------------------------------------------------------------------
#ifndef CUDA_KAT_TEST_UTILS_PRINTING_HPP_
#define CUDA_KAT_TEST_UTILS_PRINTING_HPP_

#include <cuda/api_wrappers.hpp>
#include <ostream>
#include <iomanip>
#include <string>
#include <cstdint>

namespace detail {
template <typename ToBeStreamed>
struct promoted_for_streaming { using type = ToBeStreamed; };
template<> struct promoted_for_streaming<char>          { using type = short;          };
template<> struct promoted_for_streaming<signed char>   { using type = signed short;   };
template<> struct promoted_for_streaming<unsigned char> { using type = unsigned short; };

} // namespace detail
/*
 * iostream streams have a tendency to try to outsmart you w.r.t. char or
 * unsigned char data - they assume you're really passing ISO-8859-1 code
 * points rather than integral values, and will print accordingly. Using this
 * generic promoter, you can avoid that.
 */
template <typename ToBeStreamed>
typename detail::promoted_for_streaming<ToBeStreamed>::type promote_for_streaming(const ToBeStreamed& tbs)
{
	return static_cast<typename detail::promoted_for_streaming<ToBeStreamed>::type>(tbs);
}
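
// For instance (a hypothetical example, not part of the original):
//
//   uint8_t x = 65;
//   std::cout << x;                        // prints the character 'A'
//   std::cout << promote_for_streaming(x); // prints the number 65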

inline const char* ordinal_suffix(int n)
{
	static const char suffixes [4][5] = {"th", "st", "nd", "rd"};
	auto ord = n % 100;
	if (ord / 10 == 1) { ord = 0; }
	ord = ord % 10;
	return suffixes[ord > 3 ? 0 : ord];
}

// cuda-api-wrappers-related utilities

template <typename N>
inline std::string xth(N n) { return std::to_string(n) + ordinal_suffix(n); }

std::ostream& operator<<(std::ostream& os, cuda::grid::dimensions_t dims)
{
	return os << '(' << dims.x << "," << dims.y << "," << dims.z << ')';
}

std::ostream& operator<<(std::ostream& os, cuda::launch_configuration_t lc)
{
	return os
		<< "grid x block dimensions = " << lc.grid_dimensions << " x " << lc.block_dimensions << ", "
		<< lc.dynamic_shared_memory_size << " bytes dynamic shared memory" << '\n';
}

#ifdef __SIZEOF_INT128__

// always printed in hex!

std::ostream& operator<<(std::ostream& os, __uint128_t x)
{
	return os << "uint128_t{0x" << std::hex << uint64_t(x >> 64)
		<< std::setw(16) << std::setfill('0') << uint64_t(x & ~uint64_t{0})
		<< std::setfill(' ') << std::dec << '}';
}

std::ostream& operator<<(std::ostream& os, __int128_t x)
{
	auto sign = x < 0 ? '-' : ' ';
	auto magnitude = x < 0 ? -x : x;
	return os << "int128_t{" << sign << "0x" << std::hex << uint64_t(magnitude >> 64)
		<< std::setw(16) << std::setfill('0') << uint64_t(magnitude & ~uint64_t{0})
		<< std::setfill(' ') << std::dec << '}';
}
#endif


#endif // CUDA_KAT_TEST_UTILS_PRINTING_HPP_
--------------------------------------------------------------------------------
/tests/util/random.cu:
--------------------------------------------------------------------------------

#include "random.hpp"

namespace util {
namespace random {
std::random_device device; // Note this is a callable object.
std::default_random_engine engine(device());
} // namespace random
} // namespace util


--------------------------------------------------------------------------------
/tests/util/random.hpp:
--------------------------------------------------------------------------------
#pragma once
#ifndef CUDA_KAT_TEST_UTILITIES_RANDOM_H_
#define CUDA_KAT_TEST_UTILITIES_RANDOM_H_

/************************************************************
 *
 * Simplistic and non-thread-safe random number generation
 * convenience utility - based on the C++ standard library.
 *
 * If you need to do something serious with random numbers,
 * don't use this; if you just want a bunch of random-looking
 * numbers quick & dirty, do use it.
 *
 ************************************************************/

#include <random>
#include <algorithm>
#include <iterator>
#include <type_traits>
#include <unordered_set>
#include <stdexcept>
#include <cstddef>

namespace util {

namespace random {


extern std::random_device device; // Note this is a callable object.
extern std::default_random_engine engine;

using result_t = decltype(engine)::result_type;
using seed_t = result_t;

template <typename T>
using uniform_distribution = std::conditional_t<
	std::is_floating_point<T>::value,
	std::uniform_real_distribution<T>,
	std::uniform_int_distribution<T>
>;



/*
// TODO: Does the distribution object actually remain constant? I wonder.
// Should I return an rvalue reference?
template <typename Distribution>
inline typename Distribution::result_type sample_from(Distribution& distribution) {
	return distribution(engine);
}
*/

template <typename Distribution, typename Engine = std::default_random_engine>
inline typename Distribution::result_type sample_from(
	Distribution&  distribution,
	Engine&        engine = util::random::engine)
{
	return distribution(engine);
}

inline void seed(const seed_t& seed_value)
{
	engine.seed(seed_value);
}

/* In your code, do something like:

	const int rangeMin = 1;
	const int rangeMax = 10;
	std::uniform_int_distribution<int> distribution(rangeMin, rangeMax);
	// util::random::seed(std::time(0)); // seed with the current time
	auto a = util::random::sample_from(distribution);
	cout << "A random integer between " << rangeMin << " and " << rangeMax << " for you: "
		<< util::random::sample_from(distribution) << '\n';

*/

// Some more examples of distributions:
//std::uniform_int_distribution<unsigned> uint_dist;          // by default range [0, MAX]
//std::uniform_int_distribution<unsigned> uint_dist10(0,10);  // range [0,10]
//std::normal_distribution<double> normal_dist(mean, stddeviation);  // N(mean, stddeviation)

template <typename ForwardIt, typename Distribution, typename Engine = std::default_random_engine>
constexpr inline void generate(
	ForwardIt      first,
	ForwardIt      last,
	Distribution&  distribution,
	Engine&        engine = util::random::engine)
{
	// If we could rely on having C++17, we could generate in parallel...
	std::generate(first, last, [&distribution, &engine]() {
		return static_cast<typename std::iterator_traits<ForwardIt>::value_type>(sample_from(distribution, engine));
	});
}
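
/* A hypothetical usage sketch (not part of the original):

	std::vector<float> values(1024);
	util::random::uniform_distribution<float> distribution { 0.0f, 1.0f };
	util::random::generate(values.begin(), values.end(), distribution);

*/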

template <typename ForwardIt, typename Size, typename Distribution, typename Engine = std::default_random_engine>
constexpr inline void generate_n(
	ForwardIt      first,
	Size           count,
	Distribution&  distribution,
	Engine&        engine = util::random::engine)
{
	// static_assert(is_iterator<ForwardIt>::value == true, "The 'first' parameter is not of an iterator type");
	// If we could rely on having C++17, we could generate in parallel...
	return generate(first, first + count, distribution, engine);
}

template <typename Inserter, typename Size, typename Distribution, typename Engine = std::default_random_engine>
constexpr inline void insertion_generate_n(
	Inserter       inserter,
	Size           count,
	Distribution&  distribution,
	Engine&        engine = util::random::engine)
{
	for(size_t i = 0; i < count; i++) {
		*(inserter++) = sample_from(distribution, engine);
	}
}

template <typename Inserter, typename Size, typename Distribution, typename Engine = std::default_random_engine>
constexpr inline void insertion_generate_n(
	Inserter        inserter,
	Size            count,
	Distribution&&  distribution,
	Engine&         engine = util::random::engine)
{
	insertion_generate_n(inserter, count, distribution, engine);
}

template <typename RandomAccessIterator, typename Size, typename Engine = std::default_random_engine>
constexpr inline std::unordered_set<typename std::iterator_traits<RandomAccessIterator>::value_type>
sample_subset(
	RandomAccessIterator  begin,
	RandomAccessIterator  end,
	Size                  subset_size,
	Engine&               engine = util::random::engine)
{
	std::unordered_set<typename std::iterator_traits<RandomAccessIterator>::value_type> sampled_subset{};
	std::uniform_int_distribution<std::ptrdiff_t> distribution {0, (end - begin) - 1};
	while(sampled_subset.size() < subset_size) {
		auto sampled_element_index = util::random::sample_from(distribution, engine);
		sampled_subset.insert(*(begin + sampled_element_index));
	}
	return sampled_subset;
}
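
// Hypothetical usage (not part of the original): pick 3 distinct values
// out of a 5-element range:
//
//   int data[] { 10, 20, 30, 40, 50 };
//   auto chosen = util::random::sample_subset(std::begin(data), std::end(data), 3u);
//
// Note that unlike the overloads below, this overload does not guard against
// subset_size exceeding the number of available distinct elements - in that
// case it would loop forever.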

template <typename RandomAccessIterator, typename Size, typename Engine = std::default_random_engine>
constexpr inline std::unordered_set<typename std::iterator_traits<RandomAccessIterator>::value_type>
sample_subset(
	RandomAccessIterator  begin,
	Size                  domain_length,
	Size                  subset_size,
	Engine&               engine = util::random::engine)
{
	if (domain_length < subset_size) { throw std::invalid_argument("Can't sample a subset larger than the domain"); }
	std::unordered_set<typename std::iterator_traits<RandomAccessIterator>::value_type> sampled_subset{};
	if (domain_length == 0) {
		if (subset_size == 0) { return sampled_subset; }
		throw std::invalid_argument("Can't sample a subset larger than the domain");
	}
	std::uniform_int_distribution<Size> distribution {0, domain_length - 1};
	// TODO: If we need to sample more than half the domain, sample the elements _outside_ the set instead.
	while(sampled_subset.size() < subset_size) {
		auto sampled_element_index = util::random::sample_from(distribution, engine);
		sampled_subset.insert(*(begin + sampled_element_index));
	}
	return sampled_subset;
}

template <typename Size, typename Engine = std::default_random_engine>
constexpr inline std::unordered_set<Size>
sample_index_subset(
	Size     domain_length,
	Size     subset_size,
	Engine&  engine = util::random::engine)
{
	if (domain_length < subset_size) { throw std::invalid_argument("Can't sample a subset larger than the domain"); }
	std::unordered_set<Size> sampled_subset{};
	if (domain_length == 0) {
		if (subset_size == 0) { return sampled_subset; }
		throw std::invalid_argument("Can't sample a subset larger than the domain");
	}
	std::uniform_int_distribution<Size> distribution {0, domain_length - 1};
	// TODO: If we need to sample more than half the domain, sample the elements _outside_ the set instead.
	while(sampled_subset.size() < subset_size) {
		sampled_subset.insert(util::random::sample_from(distribution, engine));
	}
	return sampled_subset;
}

} // namespace random
} // namespace util

#endif /* CUDA_KAT_TEST_UTILITIES_RANDOM_H_ */

--------------------------------------------------------------------------------
/tests/util/type_name.hpp:
--------------------------------------------------------------------------------
#pragma once
#ifndef UTIL_TYPE_NAME_HPP_
#define UTIL_TYPE_NAME_HPP_

#include "poor_mans_constexpr_string.hpp"
#include <string>
#include <memory>
#include <tuple>
#ifndef _MSC_VER
#include <cxxabi.h>
#endif
#include <cstdlib>
#include <type_traits>
#include <typeinfo>


///@cond
#include 
///@endcond

namespace util {

template <typename T>
CONSTEXPR14_TN KAT_HD constexpr_string type_name()
{
#ifdef __clang__
	constexpr_string p = __PRETTY_FUNCTION__;
	return constexpr_string(p.data() + 31, p.size() - 31 - 1);
//#elif defined(__CUDA_ARCH__)
//	constexpr_string p = __PRETTY_FUNCTION__;
//	return constexpr_string(p.data(), p.size());
#elif defined(__CUDACC__)
	constexpr_string p = __PRETTY_FUNCTION__;
#	if __cplusplus < 201402 || defined(__CUDA_ARCH__)
	return constexpr_string(p.data() + 51, p.size() - 51 - 1); // 51 is the length of "util::constexpr_string util::type_name() [with T = "
#	else
	return constexpr_string(p.data() + 61, p.size() - 61 - 1); // 61 is the length of "constexpr util::constexpr_string util::type_name() [with T = "
#	endif
#elif defined(__GNUC__)
	constexpr_string p = __PRETTY_FUNCTION__;
#	if __cplusplus < 201402
	return constexpr_string(p.data() + 36, p.size() - 36 - 1);
#	else
	return constexpr_string(p.data() + 46, p.size() - 46 - 1);
#	endif
#elif defined(_MSC_VER)
	constexpr_string p = __FUNCSIG__;
	return constexpr_string(p.data() + 38, p.size() - 38 - 7);
#endif
}

/*template <typename T>
CONSTEXPR14_TN KAT_FHD constexpr_string type_name(T&&)
{
	return type_name<T>();
}

template <typename T>
CONSTEXPR14_TN KAT_FHD constexpr_string type_name(const T&)
{
	return type_name<T>();
}*/


/**
 * A function for obtaining the string name
 * of a type, using that actual type at compile-time.
 * (The function might have been constexpr, but I doubt
 * so much is acceptable at compile time.) This is an
 * alternative to using type_info.name() which also
 * preserves CV qualifiers (const, volatile, reference,
 * rvalue-reference)
 *
 * The code was copied from this StackOverflow answer:
 * http://stackoverflow.com/a/20170989/1593077
 * due to Howard Hinnant
 * ... with some slight modifications by Eyal Rozenberg
 */


template <typename T, bool WithCVCorrections = false>
std::string type_name_()
{
	typedef typename std::remove_reference<T>::type TR;

	std::unique_ptr<char, void(*)(void*)> own(
#ifndef _MSC_VER
		abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr),
#else
		nullptr,
#endif
		std::free
	);
	std::string r = (own != nullptr) ? own.get() : typeid(TR).name();
	if (WithCVCorrections) {
		if (std::is_const<TR>::value)
			r += " const";
		if (std::is_volatile<TR>::value)
			r += " volatile";
		if (std::is_lvalue_reference<T>::value)
			r += "&";
		else if (std::is_rvalue_reference<T>::value)
			r += "&&";
	}
	return r;
}

/**
 * This is a convenience function, so that instead of
 *
 *   util::type_name_<decltype(my_value)>()
 *
 * you could use:
 *
 *   util::type_name_of(my_value)
 *
 * @param v a value which is only passed to indicate a type
 * @return the string type name of typeof(v)
 */
template <typename T>
std::string type_name_of(const T& v) { return util::type_name_<T>(); }


template <typename... Ts>
auto type_names_() -> decltype(std::make_tuple(type_name_<Ts>()...))
{ return std::make_tuple(type_name_<Ts>()...); }


/**
 * Removes the trailing template parameter listing from a type name, e.g.
 *
 *   foo::bar<short, 4>
 *
 * becomes
 *
 *   foo::bar
 *
 * This is not such a useful function, as "int bar(double x)" will
 * become "int bar". So - fix it.
 *
 * @param type_name the name of a type, preferably obtained with
 * util::type_info
 * @return the template-less type name, or the original type name if
 * we could not find anything to remove (doesn't throw)
 */
inline std::string discard_template_parameters(const std::string& type_name)
{
	auto template_rbracket_pos = type_name.rfind('>');
	if (template_rbracket_pos == std::string::npos) {
		return type_name;
	}
	unsigned bracket_depth = 1;
	for (unsigned pos = template_rbracket_pos - 1; pos > 0; pos--) {
		switch(type_name[pos]) {
			case '>': bracket_depth++; break;
			case '<': bracket_depth--; break;
		}
		if (bracket_depth == 0) return type_name.substr(0, pos);
	}
	return type_name;
}
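
// For example (a hypothetical call, not in the original):
//
//   discard_template_parameters("std::vector<int>")  // yields "std::vector"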

} /* namespace util */

#include 

#endif /* UTIL_TYPE_NAME_HPP_ */
--------------------------------------------------------------------------------
/tests/util/woodruff_int128_t.hpp:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>

struct int128_t final
{
	int128_t() = default;
	constexpr int128_t(const int64_t high_, const uint64_t low_) : high(high_), low(low_) {}
	constexpr int128_t(const int64_t v) : high(v < 0 ? 0xfffffffffffffffflu : 0), low(v) {}

	explicit constexpr operator int64_t() const { return static_cast<int64_t>(low); }

	constexpr operator bool() const { return low || high; }

	int64_t  high;
	uint64_t low;
};

inline constexpr bool operator<(const int128_t l, const int128_t r)
{
	return l.high < r.high || (l.high == r.high && l.low < r.low);
}

inline constexpr bool operator<=(const int128_t l, const int128_t r)
{
	return l.high < r.high || (l.high == r.high && l.low <= r.low);
}

inline constexpr bool operator>(const int128_t l, const int128_t r)
{
	return l.high > r.high || (l.high == r.high && l.low > r.low);
}

inline constexpr bool operator>=(const int128_t l, const int128_t r)
{
	return l.high > r.high || (l.high == r.high && l.low >= r.low);
}

inline constexpr bool operator==(const int128_t l, const int128_t r)
{
	return l.low == r.low && l.high == r.high;
}

inline constexpr bool operator!=(const int128_t l, const int128_t r)
{
	return l.low != r.low || l.high != r.high;
}

inline constexpr int128_t operator+(const int128_t l, const int128_t r)
{
	int128_t result{l.high + r.high, l.low + r.low};
	if (result.low < l.low)
	{
		++result.high;
	}
	return result;
}

inline constexpr int128_t operator-(const int128_t l, const int128_t r)
{
	int128_t result{l.high - r.high, l.low - r.low};
	if (result.low > l.low)
	{
		--result.high;
	}
	return result;
}

inline constexpr int128_t operator*(const int128_t l, const int128_t r)
{
	int128_t result{static_cast<int64_t>((l.low >> 32) * (r.low >> 32)), (l.low & 0xffffffff) * (r.low & 0xffffffff)};
	{
		const uint64_t m12 = (l.low & 0xffffffff) * (r.low >> 32);
		{
			const uint64_t m12_l = (m12 & 0xffffffff) << 32;
			const uint64_t old_low = result.low;
			result.low += m12_l;
			if (result.low < old_low)
			{
				++result.high;
			}
			result.high += (m12 >> 32);

		}
	}
	{
		const uint64_t m21 = (l.low >> 32) * (r.low & 0xffffffff);
		{
			const uint64_t m21_l = (m21 & 0xffffffff) << 32;
			const uint64_t old_low = result.low;
			result.low += m21_l;
			if (result.low < old_low)
			{
				++result.high;
			}
			result.high += static_cast<int64_t>(m21 >> 32);
		}
	}
	result.high +=
		static_cast<int64_t>(
			(l.low & 0xffffffff) * (static_cast<uint64_t>(r.high) & 0xffffffff) +
			(static_cast<uint64_t>(l.high) & 0xffffffff) * (r.low & 0xffffffff) +
			(((l.low & 0xffffffff) * (static_cast<uint64_t>(r.high) >> 32)) << 32) +
			(((static_cast<uint64_t>(l.high) >> 32) * (r.low & 0xffffffff)) << 32) +
			(((l.low >> 32) * (static_cast<uint64_t>(r.high) & 0xffffffff)) << 32) +
			(((static_cast<uint64_t>(l.high) & 0xffffffff) * (r.low >> 32)) << 32));

	return result;
}
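
// (Explanatory note: the above is schoolbook long multiplication on 32-bit
// limbs - the partial products of the two low words feed `low`, with carries
// detected via unsigned wrap-around, and every partial product whose value
// lands at or above bit 64 is accumulated directly into `high`.)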

/*
inline constexpr int128_t operator/(const int128_t l, const int128_t r)
{
	//! \todo implement
	return l;
}

inline constexpr int128_t operator%(const int128_t l, const int128_t r)
{
	//! \todo implement
	return l;
}
*/

inline constexpr int128_t operator~(const int128_t v)
{
	return int128_t{~v.high, ~v.low};
}

inline constexpr int128_t operator&(const int128_t l, const int128_t r)
{
	return int128_t{l.high & r.high, l.low & r.low};
}

inline constexpr int128_t operator|(const int128_t l, const int128_t r)
{
	return int128_t{l.high | r.high, l.low | r.low};
}

inline constexpr int128_t operator^(const int128_t l, const int128_t r)
{
	return int128_t{l.high ^ r.high, l.low ^ r.low};
}

inline constexpr int128_t operator<<(const int128_t v, const unsigned s)
{
	if (s >= 64)
	{
		return {static_cast<int64_t>(v.low) << (s - 64), 0};
	}
	return {v.high << s | static_cast<int64_t>(v.low >> (64 - s)), v.low << s};
}

inline constexpr int128_t operator>>(const int128_t v, const unsigned s)
{
	if (s >= 64)
	{
		// arithmetic shift: the high word gets filled with the sign bit
		return {v.high >> 63, static_cast<uint64_t>(v.high >> (s - 64))};
	}
	return {v.high >> s, static_cast<uint64_t>(v.high) << (64 - s) | v.low >> s};
}

inline constexpr int128_t & operator++(int128_t & v)
{
	++v.low;
	if (!v.low)
	{
		++v.high;
	}
	return v;
}

inline constexpr int128_t & operator--(int128_t & v)
{
	if (!v.low)
	{
		--v.high;
	}
	--v.low;
	return v;
}

inline constexpr int128_t operator++(int128_t & v, int)
{
	int128_t r = v;
	++v;
	return r;
}

inline constexpr int128_t operator--(int128_t & v, int)
{
	int128_t r = v;
	--v;
	return r;
}

inline constexpr int128_t operator-(const int128_t v)
{
	int128_t result{~v.high, ~v.low};
	++result;
	return result;
}

inline constexpr int128_t & operator+=(int128_t & l, const int128_t r)
{
	const uint64_t low = l.low;
	l.low += r.low;
	l.high += r.high;
	if (l.low < low)
	{
		++l.high;
	}
	return l;
}

inline constexpr int128_t & operator-=(int128_t & l, const int128_t r)
{
	const uint64_t low = l.low;
	l.low -= r.low;
	l.high -= r.high;
	if (l.low > low)
	{
		--l.high;
	}
	return l;
}

inline constexpr int128_t & operator*=(int128_t & l, const int128_t r)
{
	l = l * r;
	return l;
}

/*
inline constexpr int128_t & operator/=(int128_t & l, const int128_t r)
{
	//! \todo implement
	return l;
}

inline constexpr int128_t & operator%=(int128_t & l, const int128_t r)
{
	//! \todo implement
	return l;
}
*/

inline constexpr int128_t & operator&=(int128_t & l, const int128_t r)
{
	l.high &= r.high;
	l.low  &= r.low;
	return l;
}

inline constexpr int128_t & operator|=(int128_t & l, const int128_t r)
{
	l.high |= r.high;
	l.low  |= r.low;
	return l;
}

inline constexpr int128_t & operator^=(int128_t & l, const int128_t r)
{
	l.high ^= r.high;
	l.low  ^= r.low;
	return l;
}

inline constexpr int128_t & operator<<=(int128_t & v, const unsigned s)
{
	if (s >= 64)
	{
		v.high = v.low << (s - 64);
		v.low = 0;
	}
	else
	{
		v.high = v.high << s | v.low >> (64 - s);
		v.low <<= s;
	}
	return v;
}

inline constexpr int128_t & operator>>=(int128_t & v, const unsigned s)
{
	if (s >= 64)
	{
		v.low = v.high >> (s - 64);
		v.high >>= 63; // fill with the sign bit
	}
	else
	{
		v.low = static_cast<uint64_t>(v.high) << (64 - s) | v.low >> s;
		v.high >>= s;
	}
	return v;
}
--------------------------------------------------------------------------------
/tests/util/woodruff_uint128_t.hpp:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>

struct uint128_t final
{
	uint128_t() = default;
	constexpr uint128_t(const uint64_t high_, const uint64_t low_) : high(high_), low(low_) {}
	constexpr uint128_t(const uint64_t v) : high(0), low(v) {}

	explicit constexpr operator uint64_t() const { return low; }

	constexpr operator bool() const { return low || high; }

	uint64_t high;
	uint64_t low;
};

inline constexpr bool operator<(const uint128_t l, const uint128_t r)
{
	return l.high < r.high || (l.high == r.high && l.low < r.low);
}

inline constexpr bool operator<=(const uint128_t l, const uint128_t r)
{
	return l.high < r.high || (l.high == r.high && l.low <= r.low);
}

inline constexpr bool operator>(const uint128_t l, const uint128_t r)
{
	return l.high > r.high || (l.high == r.high && l.low > r.low);
}

inline constexpr bool operator>=(const uint128_t l, const uint128_t r)
{
	return l.high > r.high || (l.high == r.high && l.low >= r.low);
}

inline constexpr bool operator==(const uint128_t l, const uint128_t r)
{
	return l.low == r.low && l.high == r.high;
}

inline constexpr bool operator!=(const uint128_t l, const uint128_t r)
{
	return l.low != r.low || l.high != r.high;
}

inline constexpr uint128_t operator+(const uint128_t l, const uint128_t r)
{
	uint128_t result{l.high + r.high, l.low + r.low};
	if (result.low < l.low)
	{
		++result.high;
	}
	return result;
}

inline constexpr uint128_t operator-(const uint128_t l, const uint128_t r)
{
	uint128_t result{l.high - r.high, l.low - r.low};
	if (result.low > l.low)
	{
		--result.high;
	}
	return result;
}

inline constexpr uint128_t operator*(const uint128_t l, const uint128_t r)
{
	uint128_t result{(l.low >> 32) * (r.low >> 32), (l.low & 0xffffffff) * (r.low & 0xffffffff)};
	{
		const uint64_t m12 = (l.low & 0xffffffff) * (r.low >> 32);
		{
			const uint64_t m12_l = (m12 & 0xffffffff) << 32;
			const uint64_t old_low = result.low;
			result.low += m12_l;
			if (result.low < old_low)
			{
				++result.high;
			}
			result.high += (m12 >> 32);

		}
	}
	{
		const uint64_t m21 = (l.low >> 32) * (r.low & 0xffffffff);
		{
			const uint64_t m21_l = (m21 & 0xffffffff) << 32;
			const uint64_t old_low = result.low;
			result.low += m21_l;
			if (result.low < old_low)
			{
				++result.high;
			}
			result.high += (m21 >> 32);
		}
	}
	result.high +=
		(l.low & 0xffffffff) * (r.high & 0xffffffff) +
		(l.high & 0xffffffff) * (r.low & 0xffffffff) +
		(((l.low & 0xffffffff) * (r.high >> 32)) << 32) +
		(((l.high >> 32) * (r.low & 0xffffffff)) << 32) +
		(((l.low >> 32) * (r.high & 0xffffffff)) << 32) +
		(((l.high & 0xffffffff) * (r.low >> 32)) << 32);

	return result;
}

/*
inline constexpr uint128_t operator/(const uint128_t l, const uint128_t r)
{
	//! \todo implement
	return {};
}

inline constexpr uint128_t operator%(const uint128_t l, const uint128_t r)
{
	//! \todo implement
	return {};
}
*/

inline constexpr uint128_t operator~(const uint128_t v)
{
	return uint128_t{~v.high, ~v.low};
}

inline constexpr uint128_t operator&(const uint128_t l, const uint128_t r)
{
	return uint128_t{l.high & r.high, l.low & r.low};
}

inline constexpr uint128_t operator|(const uint128_t l, const uint128_t r)
{
	return uint128_t{l.high | r.high, l.low | r.low};
}

inline constexpr uint128_t operator^(const uint128_t l, const uint128_t r)
{
	return uint128_t{l.high ^ r.high, l.low ^ r.low};
}

inline constexpr uint128_t operator<<(const uint128_t v, const unsigned s)
{
	if (s >= 64)
	{
		return {v.low << (s - 64), 0};
	}
	return {v.high << s | v.low >> (64 - s), v.low << s};
}

inline constexpr uint128_t operator>>(const uint128_t v, const unsigned s)
{
	if (s >= 64)
	{
		return {0, v.high >> (s - 64)};
	}
	return {v.high >> s, v.high << (64 - s) | v.low >> s};
}

inline constexpr uint128_t & operator++(uint128_t & v)
{
	++v.low;
	if (!v.low)
	{
		++v.high;
	}
	return v;
}

inline constexpr uint128_t & operator--(uint128_t & v)
{
	if (!v.low)
	{
		--v.high;
	}
	--v.low;
	return v;
}

inline constexpr uint128_t operator++(uint128_t & v, int)
{
	uint128_t r = v;
	++v;
	return r;
}

inline constexpr uint128_t operator--(uint128_t & v, int)
{
	uint128_t r = v;
	--v;
	return r;
}

inline constexpr uint128_t & operator+=(uint128_t & l, const uint128_t r)
{
	const uint64_t low = l.low;
	l.low += r.low;
	l.high += r.high;
	if (l.low < low)
	{
		++l.high;
	}
	return l;
}

inline constexpr uint128_t & operator-=(uint128_t & l, const uint128_t r)
{
	const uint64_t low = l.low;
	l.low -= r.low;
	l.high -= r.high;
	if (l.low > low)
	{
		--l.high;
	}
	return l;
}

inline constexpr uint128_t & operator*=(uint128_t & l, const uint128_t r)
{
	l = l * r;
	return l;
}
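
// (A note on the unimplemented division operators below: one workable approach
// would be binary long division - left-shift the divisor up towards the
// dividend's magnitude, then walk back down, subtracting and setting quotient
// bits. Sketched only; not implemented or tested here.)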

/*
inline constexpr uint128_t & operator/=(uint128_t & l, const uint128_t r)
{
	//! \todo implement
	return l;
}

inline constexpr uint128_t & operator%=(uint128_t & l, const uint128_t r)
{
	//! \todo implement
	return l;
}
*/

inline constexpr uint128_t & operator&=(uint128_t & l, const uint128_t r)
{
	l.high &= r.high;
	l.low &= r.low;
	return l;
}

inline constexpr uint128_t & operator|=(uint128_t & l, const uint128_t r)
{
	l.high |= r.high;
	l.low |= r.low;
	return l;
}

inline constexpr uint128_t & operator^=(uint128_t & l, const uint128_t r)
{
	l.high ^= r.high;
	l.low ^= r.low;
	return l;
}

inline constexpr uint128_t & operator<<=(uint128_t & v, const unsigned s)
{
	if (s >= 64)
	{
		v.high = v.low << (s - 64);
		v.low = 0;
	}
	else
	{
		v.high = v.high << s | v.low >> (64 - s);
		v.low <<= s;
	}
	return v;
}

inline constexpr uint128_t & operator>>=(uint128_t & v, const unsigned s)
{
	if (s >= 64)
	{
		v.low = v.high >> (s - 64);
		v.high = 0;
	}
	else
	{
		v.low = v.high << (64 - s) | v.low >> s;
		v.high >>= s;
	}
	return v;
}
--------------------------------------------------------------------------------