├── .codedocs
├── .github
│   └── FUNDING.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmake
│   └── Modules
│       ├── CompileWithWarnings.cmake
│       ├── DocTest.cmake
│       └── Findcuda-nvtx.cmake
├── docs
│   └── cppreference-doxygen-web.tag.xml
├── doxygen.cfg
├── external
│   └── doctest
│       └── doctest.h
├── src
│   ├── cuda-kat.cuh
│   └── kat
│       ├── common.hpp
│       ├── containers
│       │   ├── array.hpp
│       │   ├── detail
│       │   │   └── normal_iterator.hpp
│       │   └── span.hpp
│       ├── detail
│       │   ├── constexpr_by_cpp_version.hpp
│       │   ├── execution_space_specifiers.hpp
│       │   ├── integer_sequence.hpp
│       │   ├── pointers.cuh
│       │   └── range_access.hpp
│       ├── on_device
│       │   ├── atomics.cuh
│       │   ├── builtins.cuh
│       │   ├── c_standard_library
│       │   │   └── string.cuh
│       │   ├── collaboration
│       │   │   ├── block.cuh
│       │   │   ├── grid.cuh
│       │   │   └── warp.cuh
│       │   ├── common.cuh
│       │   ├── constexpr_math.cuh
│       │   ├── detail
│       │   │   ├── atomics.cuh
│       │   │   ├── atomics
│       │   │   │   └── missing_in_cuda.cuh
│       │   │   ├── builtins.cuh
│       │   │   ├── itoa.cuh
│       │   │   └── shuffle.cuh
│       │   ├── grid_info.cuh
│       │   ├── math.cuh
│       │   ├── miscellany.cuh
│       │   ├── non-builtins.cuh
│       │   ├── ptx.cuh
│       │   ├── ptx
│       │   │   ├── detail
│       │   │   │   ├── define_macros.cuh
│       │   │   │   └── undefine_macros.cuh
│       │   │   ├── miscellany.cuh
│       │   │   ├── special_registers.cuh
│       │   │   └── video_instructions.cuh
│       │   ├── sequence_ops
│       │   │   ├── block.cuh
│       │   │   ├── common.cuh
│       │   │   ├── grid.cuh
│       │   │   └── warp.cuh
│       │   ├── shared_memory.cuh
│       │   ├── shared_memory
│       │   │   ├── basic.cuh
│       │   │   └── operations.cuh
│       │   ├── shuffle.cuh
│       │   ├── streams
│       │   │   ├── prefix_generators.cuh
│       │   │   ├── printfing_ostream.cuh
│       │   │   └── stringstream.cuh
│       │   └── time.cuh
│       ├── reference_wrapper.hpp
│       ├── tuple.hpp
│       └── utility.hpp
└── tests
    ├── CMakeLists.txt
    ├── array.cu
    ├── atomics.cu
    ├── block_collaboration.cu
    ├── builtins.cu
    ├── c_string.cu
    ├── common.cuh
    ├── constexpr_math.cu
    ├── grid_collaboration.cu
    ├── math.cu
    ├── miscellany.cu
    ├── printing.cu
    ├── sequence_ops.cu
    ├── shared_memory.cu
    ├── shuffle.cu
    ├── span.cu
    ├── time.cu
    ├── tuple.cu
    ├── util
    │   ├── cpu_builtin_equivalents.hpp
    │   ├── macro.h
    │   ├── miscellany.cuh
    │   ├── poor_mans_constexpr_string.hpp
    │   ├── prettyprint.hpp
    │   ├── printing.hpp
    │   ├── random.cu
    │   ├── random.hpp
    │   ├── type_name.hpp
    │   ├── woodruff_int128_t.hpp
    │   └── woodruff_uint128_t.hpp
    └── warp_collaboration.cu

/.codedocs:
--------------------------------------------------------------------------------
1 | # CodeDocs.xyz Configuration File
2 | #
3 | # Rename this example to '.codedocs' and put it in the root directory of your
4 | # repository. This file is optional, documentation will still be generated
5 | # without it using sensible defaults.
6 | 
7 | #---------------------------------------------------------------------------
8 | # CodeDocs Configuration
9 | #---------------------------------------------------------------------------
10 | 
11 | # Include the Doxygen configuration from another file.
12 | # The file must be a relative path with respect to the root of the repository.
13 | # If any of the options in this doxyfile include a path (ie, INPUT), these
14 | # paths will be considered relative to the root of the repository, not the
15 | # location of the DOXYFILE.
16 | 
17 | DOXYFILE = doxygen.cfg
18 | 
19 | # Specify external repository to link documentation with.
20 | # This is similar to Doxygen's TAGFILES option, but will automatically link to
21 | # tags of other repositories already using CodeDocs. List each repository to
22 | # link with by giving its location in the form of owner/repository.
23 | # For example:
24 | #   TAGLINKS = doxygen/doxygen CodeDocs/osg
25 | # Note: these repositories must already be built on CodeDocs.
26 | 
27 | TAGLINKS = 
28 | 
29 | #---------------------------------------------------------------------------
30 | # Doxygen Configuration
31 | #---------------------------------------------------------------------------
32 | 
33 | # Doxygen configuration may also be placed in this file.
34 | # Currently, the following Doxygen configuration options are available. Refer
35 | # to http://doxygen.org/manual/config.html for detailed explanation of the
36 | # options. To request support for more options, contact support@codedocs.xyz.
37 | #
38 | # ABBREVIATE_BRIEF =
39 | # ALIASES =
40 | # ALLEXTERNALS =
41 | # ALLOW_UNICODE_NAMES =
42 | # ALPHABETICAL_INDEX =
43 | # ALWAYS_DETAILED_SEC =
44 | # AUTOLINK_SUPPORT =
45 | # BRIEF_MEMBER_DESC =
46 | # BUILTIN_STL_SUPPORT =
47 | # CALLER_GRAPH =
48 | # CALL_GRAPH =
49 | # CASE_SENSE_NAMES =
50 | # CITE_BIB_FILES =
51 | # CLASS_DIAGRAMS =
52 | # CLASS_GRAPH =
53 | # COLLABORATION_GRAPH =
54 | # COLS_IN_ALPHA_INDEX =
55 | # CPP_CLI_SUPPORT =
56 | # DIAFILE_DIRS =
57 | # DIRECTORY_GRAPH =
58 | # DISABLE_INDEX =
59 | # DISTRIBUTE_GROUP_DOC =
60 | # DOTFILE_DIRS =
61 | # DOT_FONTNAME =
62 | # DOT_FONTSIZE =
63 | # DOT_GRAPH_MAX_NODES =
64 | # DOT_IMAGE_FORMAT =
65 | # DOT_TRANSPARENT =
66 | # DOXYFILE_ENCODING =
67 | # ENABLED_SECTIONS =
68 | # ENABLE_PREPROCESSING =
69 | # ENUM_VALUES_PER_LINE =
70 | # EXAMPLE_PATH =
71 | # EXAMPLE_PATTERNS =
72 | # EXAMPLE_RECURSIVE =
73 | # EXCLUDE =
74 | # EXCLUDE_PATTERNS =
75 | # EXCLUDE_SYMBOLS =
76 | # EXPAND_AS_DEFINED =
77 | # EXPAND_ONLY_PREDEF =
78 | # EXTENSION_MAPPING =
79 | # EXTERNAL_GROUPS =
80 | # EXTERNAL_PAGES =
81 | # EXTRACT_ALL =
82 | # EXTRACT_ANON_NSPACES =
83 | # EXTRACT_LOCAL_CLASSES =
84 | # EXTRACT_LOCAL_METHODS =
85 | # EXTRACT_PACKAGE =
86 | # EXTRACT_PRIVATE =
87 | # EXTRACT_STATIC =
88 | # EXT_LINKS_IN_WINDOW =
89 | # FILE_PATTERNS =
90 | # FORCE_LOCAL_INCLUDES =
91 | # FORMULA_FONTSIZE =
92 | # FORMULA_TRANSPARENT =
93 | # FULL_PATH_NAMES =
94 | # GENERATE_BUGLIST =
95 | # GENERATE_DEPRECATEDLIST =
96 | # GENERATE_LEGEND =
97 | # GENERATE_TESTLIST =
98 | # GENERATE_TODOLIST =
99 | # GENERATE_TREEVIEW =
100 | # GRAPHICAL_HIERARCHY =
101 | # GROUP_GRAPHS =
102 | # GROUP_NESTED_COMPOUNDS =
103 | # HIDE_COMPOUND_REFERENCE= =
104 | # HIDE_FRIEND_COMPOUNDS =
105 | # HIDE_IN_BODY_DOCS =
106 | # HIDE_SCOPE_NAMES =
107 | # HIDE_UNDOC_CLASSES =
108 | # HIDE_UNDOC_MEMBERS =
109 | # HIDE_UNDOC_RELATIONS =
110 | # HTML_COLORSTYLE_GAMMA =
111 | # HTML_COLORSTYLE_HUE =
112 | # HTML_COLORSTYLE_SAT =
113 | # HTML_DYNAMIC_SECTIONS =
114 | # HTML_EXTRA_FILES =
115 | # HTML_EXTRA_STYLESHEET =
116 | # HTML_FOOTER =
117 | # HTML_HEADER =
118 | # HTML_INDEX_NUM_ENTRIES =
119 | # HTML_STYLESHEET =
120 | # HTML_TIMESTAMP =
121 | # IDL_PROPERTY_SUPPORT =
122 | # IGNORE_PREFIX =
123 | # IMAGE_PATH =
124 | # INCLUDED_BY_GRAPH =
125 | # INCLUDE_FILE_PATTERNS =
126 | # INCLUDE_GRAPH =
127 | # INCLUDE_PATH =
128 | # INHERIT_DOCS =
129 | # INLINE_GROUPED_CLASSES =
130 | # INLINE_INFO =
131 | # INLINE_INHERITED_MEMB =
132 | # INLINE_SIMPLE_STRUCTS =
133 | # INLINE_SOURCES =
134 | # INPUT =
135 | # INPUT_ENCODING =
136 | # INTERACTIVE_SVG =
137 | # INTERNAL_DOCS =
138 | # JAVADOC_AUTOBRIEF =
139 | # LAYOUT_FILE =
140 | # MACRO_EXPANSION =
141 | # MARKDOWN_SUPPORT =
142 | # MAX_DOT_GRAPH_DEPTH =
143 | # MSCFILE_DIRS =
144 | # MULTILINE_CPP_IS_BRIEF =
145 | # OPTIMIZE_FOR_FORTRAN =
146 | # OPTIMIZE_OUTPUT_FOR_C =
147 | # OPTIMIZE_OUTPUT_JAVA =
148 | # OPTIMIZE_OUTPUT_VHDL =
149 | # OUTPUT_LANGUAGE =
150 | # PLANTUML_JAR_PATH =
151 | # PREDEFINED =
152 | # PROJECT_BRIEF =
153 | # PROJECT_LOGO =
154 | # PROJECT_NAME =
155 | # PROJECT_NUMBER =
156 | # QT_AUTOBRIEF =
157 | # RECURSIVE =
158 | # REFERENCED_BY_RELATION =
159 | # REFERENCES_LINK_SOURCE =
160 | # REFERENCES_RELATION =
161 | # REPEAT_BRIEF =
162 | # SEARCHENGINE =
163 | # SEARCH_INCLUDES =
164 | # SEPARATE_MEMBER_PAGES =
165 | # SHORT_NAMES =
166 | # SHOW_FILES =
167 | # SHOW_GROUPED_MEMB_INC =
168 | # SHOW_INCLUDE_FILES =
169 | # SHOW_NAMESPACES =
170 | # SHOW_USED_FILES =
171 | # SIP_SUPPORT =
172 | # SKIP_FUNCTION_MACROS =
173 | # SORT_BRIEF_DOCS =
174 | # SORT_BY_SCOPE_NAME =
175 | # SORT_GROUP_NAMES =
176 | # SORT_MEMBERS_CTORS_1ST =
177 | # SORT_MEMBER_DOCS =
178 | # SOURCE_BROWSER =
179 | # SOURCE_TOOLTIPS =
180 | # STRICT_PROTO_MATCHING =
181 | # STRIP_CODE_COMMENTS =
182 | # STRIP_FROM_INC_PATH =
183 | # STRIP_FROM_PATH =
184 | # SUBGROUPING =
185 | # TAB_SIZE =
186 | # TEMPLATE_RELATIONS =
187 | # TREEVIEW_WIDTH =
188 | # TYPEDEF_HIDES_STRUCT =
189 | # UML_LIMIT_NUM_FIELDS =
190 | # UML_LOOK =
191 | # USE_MDFILE_AS_MAINPAGE =
192 | # VERBATIM_HEADERS =
193 | #
194 | 
195 | 
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | patreon: einpoklum
4 | custom: https://paypal.me/eyalroz
5 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | 
3 | # Temporary, cache, swap files
4 | \#\#*
5 | *.swp
6 | *.bkp
7 | 
8 | # Files which "ask" to be hidden
9 | *~
10 | .*
11 | unused/
12 | 
13 | # Build artifacts
14 | *.a
15 | *.o
16 | *.so
17 | *.ptx
18 | bin/*
19 | lib/*
20 | build/*
21 | 
22 | # Core dumps
23 | core
24 | core.*
25 | core-*
26 | 
27 | # CMake & CTest-generated files
28 | CMakeCache.txt
29 | CMakeFiles/*
30 | cmake_install.cmake
31 | CMakeScripts/*
32 | CMakeTmp/*
33 | Makefile
34 | CTestTestfile.cmake
35 | 
36 | # Eclipse IDE-related files
37 | .project
38 | .cproject
39 | .settings
40 | 
41 | # CLion IDE-related files
42 | .idea/
43 | cmake-build-*/
44 | 
45 | # Patching
46 | *.diff
47 | *.rej
48 | *.orig
49 | 
50 | # Files/folders downloaded from other repositories as part of the build
51 | external/*
52 | third-party/*
53 | 
54 | # Miscellaneous
55 | tags
56 | log
57 | *.log
58 | *.v3breakpoints
59 | gmon.out
60 | .DS_Store
61 | 
62 | # Doxygen
63 | doxygen.log
64 | Doxyfile
65 | docs/
66 | 
67 | # Archives
68 | *.zip
69 | *.gz
70 | *.bz2
71 | *.tgz
72 | *.tar
73 | *.xz
74 | 
75 | 
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Note:
3 | #
4 | # cuda-kat is a _header-only_ library. You can't build it, and you don't need
5 | # to run CMake in order to use it or install it. Just add the `src/` directory
6 | # to your include path (or copy its contents to some `include/` directory).
7 | # cuda-kat only depends on having a C++11 compiler and the CUDA toolkit
8 | # installed.
9 | #
10 | # This file is provided mostly in order to build the library unit tests.
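#
# That said, if your project does use CMake, consuming the library after
# installation might look like the following sketch (the `my_kernels` target
# and `kernels.cu` file are hypothetical; the imported target name follows
# from the install() and export commands further down in this file):
#
#     find_package(cuda-kat)
#     add_executable(my_kernels kernels.cu)
#     target_link_libraries(my_kernels PRIVATE cuda-kat::cuda-kat)
#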
11 | 
12 | cmake_minimum_required(VERSION 3.8.2)
13 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
14 | 
15 | project(cuda-kat
16 | 	DESCRIPTION "CUDA kernel author's tools"
17 | 	VERSION 0.1.0
18 | 	HOMEPAGE_URL "https://github.com/eyalroz/cuda-kat"
19 | 	LANGUAGES CXX CUDA)
20 | 
21 | ###############
22 | ##  Modules  ##
23 | ###############
24 | 
25 | # Standard CMake modules
26 | 
27 | # Custom modules
28 | 
29 | ############################
30 | ##  Package dependencies  ##
31 | ############################
32 | 
33 | # cuda-kat can't use the standard library's string formatting and output stream code,
34 | # because most of it is host-side only; and it doesn't make sense to bundle a modified
35 | # half of the standard library just for that. Instead, we use the strf library
36 | # (available at: https://github.com/robhz786/strf )
37 | find_package(strf 0.10.4)
38 | 
39 | ###############
40 | ##  OPTIONS  ##
41 | ###############
42 | 
43 | #message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
44 | 
45 | set(BUILD_TESTS FALSE CACHE BOOL "Build tests for the library")
46 | 
47 | ###############
48 | ##  Targets  ##
49 | ###############
50 | 
51 | add_library(cuda-kat INTERFACE)
52 | target_include_directories(
53 | 	cuda-kat
54 | 	INTERFACE
55 | 	"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>"
56 | 	"$<INSTALL_INTERFACE:include>"
57 | )
58 | 
59 | 
60 | # TODO: Consider enabling the following command. It helps IDEs
61 | # notice the library's header files even if they're not currently
62 | # in use.
63 | #
64 | #target_sources(cuda-kat
65 | #	src/kat/on_device/time.cuh
66 | #	src/kat/on_device/shared_memory.cuh
67 | #	etc. etc.
68 | 
69 | 
70 | #############
71 | ##  Tests  ##
72 | #############
73 | 
74 | if(BUILD_TESTS)
75 | 	enable_testing()
76 | 	# set(TEST_RUNNER_PARAMS "--force-colors=true" CACHE STRING "Options to add to our test runners commands")
77 | 	add_subdirectory(tests)
78 | endif()
79 | 
80 | 
81 | ####################
82 | ##  Installation  ##
83 | ####################
84 | 
85 | include(GNUInstallDirs)
86 | 
87 | install(
88 | 	TARGETS cuda-kat
89 | 	EXPORT cuda-kat_export
90 | 	INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
91 | )
92 | 
93 | install(
94 | 	DIRECTORY src/kat
95 | 	DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
96 | 	FILES_MATCHING REGEX "\\.(h|hpp|cuh)$"
97 | )
98 | 
99 | install(
100 | 	EXPORT cuda-kat_export
101 | 	DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cuda-kat"
102 | 	NAMESPACE "cuda-kat::"
103 | 	FILE cuda-kat-config.cmake
104 | )
105 | 
106 | include(CMakePackageConfigHelpers)
107 | 
108 | write_basic_package_version_file(
109 | 	"cuda-kat-config-version.cmake"
110 | 	VERSION ${PROJECT_VERSION}
111 | 	COMPATIBILITY SameMinorVersion
112 | )
113 | 
114 | install(
115 | 	FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda-kat-config-version.cmake"
116 | 	DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cuda-kat"
117 | )
118 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2018, Eyal Rozenberg
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
31 | 
--------------------------------------------------------------------------------
/cmake/Modules/CompileWithWarnings.cmake:
--------------------------------------------------------------------------------
1 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
2 | 	set(WARNING_FLAGS "-Wall -Wextra -Wpedantic -Wno-missing-field-initializers")
3 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
4 | 	set(WARNING_FLAGS "-Wall -Wextra -Wpedantic -Wno-missing-field-initializers")
5 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
6 | 	set(WARNING_FLAGS "-w3 -wd1418,2259")
7 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
8 | 	set(WARNING_FLAGS "/W4")
9 | else ()
10 | 	message(WARNING "Unknown compiler - cannot set warning flags")
11 | endif()
12 | 
13 | if(WARNING_FLAGS)
14 | 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}")
15 | endif()
16 | 
--------------------------------------------------------------------------------
/cmake/Modules/DocTest.cmake:
--------------------------------------------------------------------------------
1 | add_library(doctest INTERFACE)
2 | set(DOCTEST_DIR "${PROJECT_SOURCE_DIR}/external/doctest/")
3 | target_sources(doctest INTERFACE ${DOCTEST_DIR}/doctest.h) # Is this needed?
4 | target_include_directories(doctest INTERFACE ${DOCTEST_DIR})
5 | 
--------------------------------------------------------------------------------
/cmake/Modules/Findcuda-nvtx.cmake:
--------------------------------------------------------------------------------
1 | find_library(CUDA_NVTX_LIBRARY
2 | 	NAMES nvToolsExt nvTools nvtoolsext nvtools nvtx NVTX
3 | 	PATHS ${CUDA_TOOLKIT_ROOT_DIR}
4 | 	PATH_SUFFIXES "lib64" "common/lib64" "common/lib" "lib"
5 | 	DOC "Location of the CUDA Toolkit Extension (NVTX) library"
6 | )
7 | mark_as_advanced(CUDA_NVTX_LIBRARY)
8 | 
--------------------------------------------------------------------------------
/src/cuda-kat.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file cuda-kat.cuh Shortcut for including all functionality of the cuda-kat library.
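 *
 * A minimal usage sketch (the kernel shown here is hypothetical, not part of
 * the library):
 *
 *   #include <cuda-kat.cuh>
 *
 *   __global__ void my_kernel()
 *   {
 *       auto global_thread_id = kat::linear_grid::grid_info::thread::global_id();
 *       // ... use any other facility from the kat:: namespace ...
 *   }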
3 |  */
4 | #ifndef CUDA_KAT_CUH_
5 | #define CUDA_KAT_CUH_
6 | 
7 | #include "kat/containers/array.hpp"
8 | #include "kat/on_device/c_standard_library/string.cuh"
9 | #include "kat/on_device/constexpr_math.cuh"
10 | #include "kat/on_device/grid_info.cuh"
11 | #include "kat/on_device/math.cuh"
12 | #include "kat/on_device/miscellany.cuh"
13 | #include "kat/on_device/non-builtins.cuh"
14 | #include "kat/on_device/printing.cuh"
15 | #include "kat/on_device/ptx.cuh"
16 | #include "kat/on_device/shared_memory.cuh"
17 | #include "kat/on_device/unaligned.cuh"
18 | #include "kat/on_device/atomics.cuh"
19 | #include "kat/on_device/builtins.cuh"
20 | #include "kat/on_device/shuffle.cuh"
21 | #include "kat/on_device/collaboration/warp.cuh"
22 | #include "kat/on_device/collaboration/block.cuh"
23 | #include "kat/on_device/collaboration/grid.cuh"
24 | #include "kat/on_device/sequence_ops/warp.cuh"
25 | #include "kat/on_device/sequence_ops/block.cuh"
26 | 
27 | #endif /* CUDA_KAT_CUH_ */
28 | 
--------------------------------------------------------------------------------
/src/kat/common.hpp:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file kat/common.hpp
3 |  *
4 |  * @brief Basic type and macro definitions used throughout the KAT library.
5 |  */
6 | #pragma once
7 | #ifndef CUDA_KAT_COMMON_HPP_
8 | #define CUDA_KAT_COMMON_HPP_
9 | 
10 | #include <cstddef> // for std::size_t
11 | 
12 | #include <type_traits>
13 | #include <kat/detail/constexpr_by_cpp_version.hpp>
14 | 
15 | namespace kat {
16 | 
17 | /**
18 |  * Used throughout the kat library for (non-negative) sizes and lengths
19 |  * of containers, memory regions and so on - on both the host and the device
20 |  * side.
21 |  *
22 |  * @note CUDA isn't explicit about this, but it also uses the standard library's
23 |  * size_t occasionally.
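 *
 * For instance, a device-side function signature might look like this
 * (a hypothetical example, not a declaration from the library):
 *
 *   KAT_FD void fill(int* buffer, kat::size_t length);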
25 |  */
26 | using size_t = std::size_t;
27 | 
28 | #if __cplusplus < 201703L
29 | 
30 | // Some C++17 type traits definable in C++11
31 | 
32 | template <typename...> struct conjunction : std::true_type {};
33 | template <typename B> struct conjunction<B> : B {};
34 | template <typename B, typename... Bs> struct conjunction<B, Bs...> : std::conditional<bool(B::value), conjunction<Bs...>, B>::type {};
35 | 
36 | template <typename...> struct disjunction : std::false_type {};
37 | template <typename B> struct disjunction<B> : B {};
38 | template <typename B, typename... Bs> struct disjunction<B, Bs...> : std::conditional<bool(B::value), B, disjunction<Bs...>>::type {};
39 | 
40 | template <bool B> using bool_constant = std::integral_constant<bool, B>;
41 | 
42 | template <typename B> struct negation : bool_constant<not bool(B::value)> {};
43 | 
44 | #else
45 | 
46 | template <typename... Bs> using conjunction = std::conjunction<Bs...>;
47 | template <typename... Bs> using disjunction = std::disjunction<Bs...>;
48 | template <bool B> using bool_constant = std::bool_constant<B>;
49 | template <typename B> using negation = std::negation<B>;
50 | 
51 | 
52 | #endif
53 | 
54 | 
55 | template <typename T, typename... Ts>
56 | using is_any_of = disjunction<std::is_same<T, Ts>...>;
57 | 
58 | /*
59 | template <typename T, typename U>
60 | struct is_any_of
61 | : std::is_same<T, U> {};
62 | 
63 | template <typename T, typename U1, typename U2>
64 | struct is_any_of
65 | : bool_constant<std::is_same<T, U1>::value or std::is_same<T, U2>::value> {};
66 | 
67 | template <typename T, typename U1, typename U2, typename U3>
68 | struct is_any_of
69 | : bool_constant<std::is_same<T, U1>::value or std::is_same<T, U2>::value or std::is_same<T, U3>::value> {};
70 | 
71 | template <typename T, typename U, typename... Us>
72 | struct is_any_of
73 | : bool_constant<std::is_same<T, U>::value or is_any_of<T, Us...>::value> {};
74 | */
75 | 
76 | } // namespace kat
77 | 
78 | #endif // CUDA_KAT_COMMON_HPP_
--------------------------------------------------------------------------------
/src/kat/detail/constexpr_by_cpp_version.hpp:
--------------------------------------------------------------------------------
1 | 
2 | #ifndef CUDA_KAT_CONSTEXPR_BY_CPP_VERSION_HPP_
3 | #define CUDA_KAT_CONSTEXPR_BY_CPP_VERSION_HPP_
4 | 
5 | ///@cond
6 | 
7 | #if __cplusplus < 201103L
8 | #error "C++11 or newer is required to use this header"
9 | #endif
10 | 
11 | #ifndef CONSTEXPR_SINCE_CPP_14
12 | #if __cplusplus >= 201402L
13 | #define CONSTEXPR_SINCE_CPP_14 constexpr
14 | #else
15 | #define CONSTEXPR_SINCE_CPP_14
16 | #endif
17 | #endif
18 | 
19 | #ifndef CONSTEXPR_SINCE_CPP_17
20 | #if __cplusplus >= 201703L
21 | #define CONSTEXPR_SINCE_CPP_17 constexpr
22 | #else
23 | #define CONSTEXPR_SINCE_CPP_17
24 | #endif
25 | #endif
26 | 
27 | ///@endcond
28 | 
29 | #endif // CUDA_KAT_CONSTEXPR_BY_CPP_VERSION_HPP_
--------------------------------------------------------------------------------
/src/kat/detail/execution_space_specifiers.hpp:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file execution_space_specifiers.hpp
3 |  *
4 |  * @brief Some functions need a specification of their appropriate execution space
5 |  * w.r.t. the CUDA device side vs. the host side, as well as of their inlining
6 |  * requirement. For brevity, we introduce shorthands for these.
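 *
 * For example, a function defined as follows (the function itself is
 * hypothetical) compiles as `__forceinline__ __host__ __device__` under nvcc,
 * and as plain `inline` in non-CUDA translation units:
 *
 *   template <typename T>
 *   KAT_FHD T twice(T x) { return x + x; }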
7 |  */
8 | 
9 | #ifndef EXECUTION_SPACE_SPECIFIERS_HPP_
10 | #define EXECUTION_SPACE_SPECIFIERS_HPP_
11 | 
12 | ///@cond
13 | 
14 | #ifdef __CUDACC__
15 | 
16 | #ifndef KAT_FD
17 | #define KAT_FD __forceinline__ __device__
18 | #endif
19 | 
20 | #ifndef KAT_FH
21 | #define KAT_FH __forceinline__ __host__
22 | #endif
23 | 
24 | #ifndef KAT_FHD
25 | #define KAT_FHD __forceinline__ __host__ __device__
26 | #endif
27 | 
28 | #ifndef KAT_ID
29 | #define KAT_ID inline __device__
30 | #endif
31 | 
32 | #ifndef KAT_IH
33 | #define KAT_IH inline __host__
34 | #endif
35 | 
36 | #ifndef KAT_IHD
37 | #define KAT_IHD inline __host__ __device__
38 | #endif
39 | 
40 | #ifndef KAT_HD
41 | #define KAT_HD __host__ __device__
42 | #endif
43 | 
44 | #ifndef KAT_DEV
45 | #define KAT_DEV __device__
46 | #endif
47 | 
48 | #ifndef KAT_HOST
49 | #define KAT_HOST __host__
50 | #endif
51 | 
52 | #else // __CUDACC__
53 | 
54 | #ifndef KAT_FD
55 | #define KAT_FD inline
56 | #endif
57 | 
58 | #ifndef KAT_FH
59 | #define KAT_FH inline
60 | #endif
61 | 
62 | #ifndef KAT_FHD
63 | #define KAT_FHD inline
64 | #endif
65 | 
66 | #ifndef KAT_ID
67 | #define KAT_ID inline
68 | #endif
69 | 
70 | #ifndef KAT_IH
71 | #define KAT_IH inline
72 | #endif
73 | 
74 | #ifndef KAT_IHD
75 | #define KAT_IHD inline
76 | #endif
77 | 
78 | #ifndef KAT_HD
79 | #define KAT_HD
80 | #endif
81 | 
82 | #ifndef KAT_DEV
83 | #define KAT_DEV
84 | #endif
85 | 
86 | #ifndef KAT_HOST
87 | #define KAT_HOST
88 | #endif
89 | 
90 | #endif // __CUDACC__
91 | 
92 | ///@endcond
93 | 
94 | 
95 | #endif // EXECUTION_SPACE_SPECIFIERS_HPP_
--------------------------------------------------------------------------------
/src/kat/detail/integer_sequence.hpp:
--------------------------------------------------------------------------------
1 | ///////////////////////////////////////////////////////////////////////////////
2 | //  Copyright (c) 2018 NVIDIA Corporation
3 | //  Copyright (c) 2015-2018 Bryce Adelstein Lelbach aka wash
4 | //  Copyright (c) 2020 Eyal Rozenberg
5 | //
6 | //  Distributed under the Boost Software License, Version 1.0. (See copy
7 | //  at http://www.boost.org/LICENSE_1_0.txt)
8 | ///////////////////////////////////////////////////////////////////////////////
9 | 
10 | /** @file integer_sequence.hpp
11 |  *
12 |  * @brief An implementation of the C++14 standard library's `integer_sequence`
13 |  * and associated helper aliases plus some extensions. Copied from NVIDIA's
14 |  * thrust library's file `integer_sequence.h`, 2020-03-11.
15 |  */
16 | 
17 | #ifndef CUDA_KAT_INTEGER_SEQUENCE_HPP_
18 | #define CUDA_KAT_INTEGER_SEQUENCE_HPP_
19 | 
20 | #include <kat/detail/execution_space_specifiers.hpp>
21 | 
22 | #include <cstddef>
23 | #include <utility>
24 | #include <type_traits>
25 | 
26 | namespace kat {
27 | 
28 | #if __cplusplus >= 201402L
29 | 
30 | // A compile-time sequence of integral constants of type T.
31 | template <typename T, T... Is>
32 | using integer_sequence = std::integer_sequence<T, Is...>;
33 | 
34 | // A compile-time sequence of size_t constants.
35 | template <std::size_t... Is>
36 | using index_sequence = std::index_sequence<Is...>;
37 | 
38 | // Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
39 | template <typename T, std::size_t N>
40 | using make_integer_sequence = std::make_integer_sequence<T, N>;
41 | 
42 | // Create a new index_sequence with elements 0, 1, 2, ..., N - 1.
43 | template <std::size_t N>
44 | using make_index_sequence = std::make_index_sequence<N>;
45 | 
46 | ///////////////////////////////////////////////////////////////////////////////
47 | 
48 | #else // Older than C++14.
49 | 
50 | // A compile-time sequence of integral constants of type T.
51 | template <typename T, T... Is>
52 | struct integer_sequence;
53 | 
54 | // A compile-time sequence of size_t constants.
55 | template <std::size_t... Is>
56 | using index_sequence = integer_sequence<std::size_t, Is...>;
57 | 
58 | ///////////////////////////////////////////////////////////////////////////////
59 | 
60 | namespace detail
61 | {
62 | 
63 | // Create a new integer_sequence containing the elements of Sequence0 followed
64 | // by the elements of Sequence1. Sequence0::size() is added to each element from
65 | // Sequence1 in the new sequence.
66 | template <typename Sequence0, typename Sequence1>
67 | struct merge_and_renumber_integer_sequences_impl;
68 | template <typename Sequence0, typename Sequence1>
69 | using merge_and_renumber_integer_sequences =
70 |   typename merge_and_renumber_integer_sequences_impl<
71 |     Sequence0, Sequence1
72 |   >::type;
73 | 
74 | // Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
75 | template <typename T, std::size_t N>
76 | struct make_integer_sequence_impl;
77 | 
78 | 
79 | } // namespace detail
80 | 
81 | ///////////////////////////////////////////////////////////////////////////////
82 | 
83 | // Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
84 | template <typename T, std::size_t N>
85 | using make_integer_sequence =
86 |   typename detail::make_integer_sequence_impl<T, N>::type;
87 | 
88 | // Create a new index_sequence with elements 0, 1, 2, ..., N - 1.
89 | template <std::size_t N>
90 | using make_index_sequence =
91 |   make_integer_sequence<std::size_t, N>;
92 | 
93 | ///////////////////////////////////////////////////////////////////////////////
94 | 
95 | template <typename T, T... Is>
96 | struct integer_sequence
97 | {
98 |   using type = integer_sequence;
99 |   using value_type = T;
100 |   using size_type = size_t;
101 | 
102 |   KAT_HD
103 |   static constexpr size_type size() noexcept
104 |   {
105 |     return sizeof...(Is);
106 |   }
107 | };
108 | ///////////////////////////////////////////////////////////////////////////////
109 | 
110 | namespace detail
111 | {
112 | 
113 | template <typename T, T... Is0, T... Is1>
114 | struct merge_and_renumber_integer_sequences_impl<
115 |   integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
116 | >
117 | {
118 |   using type = integer_sequence<T, Is0..., (sizeof...(Is0) + Is1)...>;
119 | };
120 | 
121 | ///////////////////////////////////////////////////////////////////////////////
122 | 
123 | template <typename T, std::size_t N>
124 | struct make_integer_sequence_impl
125 | {
126 |   using type = merge_and_renumber_integer_sequences<
127 |     make_integer_sequence<T, N / 2>
128 |   , make_integer_sequence<T, N - N / 2>
129 |   >;
130 | };
131 | 
132 | template <typename T>
133 | struct make_integer_sequence_impl<T, 0>
134 | {
135 |   using type = integer_sequence<T>;
136 | };
137 | 
138 | template <typename T>
139 | struct make_integer_sequence_impl<T, 1>
140 | {
141 |   using type = integer_sequence<T, 0>;
142 | };
143 | 
144 | } // namespace detail
145 | 
146 | #endif // __cplusplus >= 201402L
147 | 
148 | ///////////////////////////////////////////////////////////////////////////////
149 | 
150 | namespace detail
151 | {
152 | 
153 | // Create a new integer_sequence containing the elements of Sequence0 followed
154 | // by the elements of Sequence1. Sequence1::size() is added to each element from
155 | // Sequence0 in the new sequence.
156 | template <typename Sequence0, typename Sequence1>
157 | struct merge_and_renumber_reversed_integer_sequences_impl;
158 | template <typename Sequence0, typename Sequence1>
159 | using merge_and_renumber_reversed_integer_sequences =
160 |   typename merge_and_renumber_reversed_integer_sequences_impl<
161 |     Sequence0, Sequence1
162 |   >::type;
163 | 
164 | // Create a new integer_sequence with elements N - 1, N - 2, N - 3, ..., 0.
165 | template <typename T, std::size_t N>
166 | struct make_reversed_integer_sequence_impl;
167 | 
168 | // Add a new element to the front of an integer_sequence<>.
169 | template <typename T, T Value, typename Sequence>
170 | struct integer_sequence_push_front_impl;
171 | 
172 | // Add a new element to the back of an integer_sequence<>.
173 | template <typename T, T Value, typename Sequence>
174 | struct integer_sequence_push_back_impl;
175 | 
176 | }
177 | 
178 | ///////////////////////////////////////////////////////////////////////////////
179 | 
180 | // Create a new integer_sequence with elements N - 1, N - 2, N - 3, ..., 0.
181 | template <typename T, std::size_t N>
182 | using make_reversed_integer_sequence =
183 |   typename detail::make_reversed_integer_sequence_impl<T, N>::type;
184 | 
185 | // Create a new index_sequence with elements N - 1, N - 2, N - 3, ..., 0.
186 | template <std::size_t N>
187 | using make_reversed_index_sequence =
188 |   make_reversed_integer_sequence<std::size_t, N>;
189 | 
190 | // Add a new element to the front of an integer_sequence<>.
191 | template <typename T, T Value, typename Sequence>
192 | using integer_sequence_push_front =
193 |   typename detail::integer_sequence_push_front_impl<T, Value, Sequence>::type;
194 | 
195 | // Add a new element to the back of an integer_sequence<>.
196 | template <typename T, T Value, typename Sequence>
197 | using integer_sequence_push_back =
198 |   typename detail::integer_sequence_push_back_impl<T, Value, Sequence>::type;
199 | 
200 | ///////////////////////////////////////////////////////////////////////////////
201 | 
202 | namespace detail
203 | {
204 | 
205 | template <typename T, T... Is0, T... Is1>
206 | struct merge_and_renumber_reversed_integer_sequences_impl<
207 |   integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
208 | >
209 | {
210 |   using type = integer_sequence<T, (sizeof...(Is1) + Is0)..., Is1...>;
211 | };
212 | 
213 | ///////////////////////////////////////////////////////////////////////////////
214 | 
215 | template <typename T, std::size_t N>
216 | struct make_reversed_integer_sequence_impl
217 | {
218 |   using type = merge_and_renumber_reversed_integer_sequences<
219 |     make_reversed_integer_sequence<T, N / 2>
220 |   , make_reversed_integer_sequence<T, N - N / 2>
221 |   >;
222 | };
223 | 
224 | ///////////////////////////////////////////////////////////////////////////////
225 | 
226 | template <typename T>
227 | struct make_reversed_integer_sequence_impl<T, 0>
228 | {
229 |   using type = integer_sequence<T>;
230 | };
231 | 
232 | template <typename T>
233 | struct make_reversed_integer_sequence_impl<T, 1>
234 | {
235 |   using type = integer_sequence<T, 0>;
236 | };
237 | 
238 | ///////////////////////////////////////////////////////////////////////////////
239 | 
240 | template <typename T, T Value, T... Is>
241 | struct integer_sequence_push_front_impl<T, Value, integer_sequence<T, Is...> >
242 | {
243 |   using type = integer_sequence<T, Value, Is...>;
244 | };
245 | 
246 | ///////////////////////////////////////////////////////////////////////////////
247 | 
248 | template <typename T, T Value, T... Is>
249 | struct integer_sequence_push_back_impl<T, Value, integer_sequence<T, Is...> >
250 | {
251 |   using type = integer_sequence<T, Is..., Value>;
252 | };
253 | 
254 | ///////////////////////////////////////////////////////////////////////////////
255 | 
256 | } // namespace detail
257 | 
258 | } // namespace kat
259 | 
260 | #endif // CUDA_KAT_INTEGER_SEQUENCE_HPP_
261 | 
--------------------------------------------------------------------------------
/src/kat/detail/pointers.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #ifndef CUDA_KAT_POINTERS_CUH_
3 | #define CUDA_KAT_POINTERS_CUH_
4 | 
5 | #include <kat/common.hpp>
6 | 
7 | 
8 | ///@cond
9 | #include <cstdint>
10 | ///@endcond
11 | 
12 | namespace kat {
13 | namespace detail {
14 | 
15 | static constexpr auto obj_ptr_size { sizeof(void *) };
16 | static constexpr auto fun_ptr_size { sizeof(void (*)()) };
17 | //auto dat_mem_ptr_size = sizeof(generic_dat_mem_ptr_t);
18 | //auto mem_fun_size = sizeof(generic_mem_fun_ptr_t);
19 | 
20 | //auto max_ptr_size = std::max({ sizeof(generic_obj_ptr_t), sizeof(generic_fun_ptr_t), sizeof(generic_dat_mem_ptr_t), sizeof(generic_mem_fun_ptr_t) });
21 | //auto max_ptr_align = std::max({ alignof(generic_obj_ptr_t), alignof(generic_fun_ptr_t), alignof(generic_dat_mem_ptr_t), alignof(generic_mem_fun_ptr_t) });
22 | 
23 | static constexpr auto max_ptr_size { (obj_ptr_size > fun_ptr_size) ? obj_ptr_size : fun_ptr_size };
24 | 
25 | static_assert(max_ptr_size == sizeof(uint64_t), "Unexpected maximum pointer size");
26 | 
27 | using address_t = uint64_t;
28 | 
29 | //KAT_FHD address_t address_as_number (address_t address) { return address; }
30 | template <typename T>
31 | constexpr KAT_FHD address_t address_as_number (const T* address) noexcept { return reinterpret_cast<address_t>(address); }
32 | template <typename T>
33 | constexpr KAT_FHD T* address_as_pointer(address_t address) noexcept { return reinterpret_cast<T*>(address); }
34 | 
35 | template <typename T1, typename T2>
36 | KAT_FHD std::ptrdiff_t address_difference(T1* p1, T2* p2)
37 | {
38 | 	return address_as_number(p1) - address_as_number(p2);
39 | }
40 | 
41 | 
42 | // TODO: Code duplication with math.cuh
43 | template <typename I>
44 | constexpr KAT_FHD bool is_power_of_2(I val) { return (val & (val-1)) == 0; }
45 | 
46 | template <typename T>
47 | constexpr KAT_FHD address_t misalignment_extent(address_t address) noexcept
48 | {
49 | 	static_assert(is_power_of_2(sizeof(T)),"Invalid type for alignment");
50 | 	constexpr address_t mask = sizeof(T) - 1; // utilizing the fact that it's a power of 2
51 | 	return address & mask;
52 | }
53 | 
54 | /**
55 |  * Computes the number of bytes by which a pointer is misaligned.
56 |  *
57 |  * @tparam T The pointer-to element type; its size must be a power of 2.
58 |  *
59 |  * @param ptr The possibly-misaligned pointer
60 |  * @return the minimum number of bytes which, if deducted from ptr, produces
61 |  * a T-aligned pointer
62 |  */
63 | template <typename T, typename U>
64 | constexpr KAT_FHD address_t misalignment_extent(const U* ptr) noexcept
65 | {
66 | 	return misalignment_extent<T>(address_as_number(ptr));
67 | }
68 | 
69 | template <typename T, typename U>
70 | constexpr KAT_FHD bool is_aligned(const U* ptr) noexcept
71 | {
72 | 	return misalignment_extent<T>(reinterpret_cast<address_t>(ptr)) == 0;
73 | }
74 | 
75 | template <typename T>
76 | constexpr KAT_FHD bool is_aligned(address_t address) noexcept
77 | {
78 | 	return misalignment_extent<T>(address) == 0;
79 | }
80 | 
81 | template <typename T>
82 | constexpr KAT_FHD address_t align_down(address_t address) noexcept
83 | {
84 | 	return (address - misalignment_extent<T>(address));
85 | }
86 | 
87 | template <typename T>
88 | constexpr KAT_FHD address_t align_up(address_t address) noexcept
89 | {
90 | 	return address + (is_aligned<T>(address) ? 0 : (sizeof(T) - misalignment_extent<T>(address)));
91 | }
92 | 
93 | /**
94 |  * @tparam AlignBy a type whose size is a power of 2 (and thus has natural alignment)
95 |  * @param ptr A possibly-unaligned pointer
96 |  * @return A pointer to the closest aligned T in memory up to and including @p ptr
97 |  */
98 | template <typename AlignBy, typename T>
99 | KAT_FHD AlignBy* align_down(T* ptr) noexcept
100 | {
101 | 	// Note: The compiler _should_ optimize out the inefficiency of using
102 | 	// misalignment_extent rather than just applying a mask once.
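	// As a concrete illustration (hypothetical values): with AlignBy = uint4,
	// whose size is 16 bytes, a pointer whose address is 0x1003 is aligned down
	// to 0x1000 - i.e. the low log2(sizeof(AlignBy)) bits of the address are cleared.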
103 | 	address_t address = address_as_number(ptr);
104 | 	auto aligned_addr = align_down<AlignBy>(address);
105 | 	return (AlignBy*) address_as_pointer<AlignBy>(aligned_addr);
106 | }
107 | 
108 | template <typename AlignBy, typename T>
109 | KAT_FHD AlignBy* align_up(T* ptr) noexcept
110 | {
111 | 	address_t address = address_as_number(ptr);
112 | 	auto aligned_addr = align_up<AlignBy>(address);
113 | 	return (AlignBy*) address_as_pointer<AlignBy>(aligned_addr);
114 | }
115 | 
116 | template <typename T>
117 | KAT_FHD T* align_down(T* ptr) noexcept
118 | {
119 | 	return const_cast<T*>(align_down<T>(reinterpret_cast<const T*>(ptr)));
120 | }
121 | 
122 | template <typename T>
123 | KAT_FHD T* align_up(T* ptr) noexcept
124 | {
125 | 	return const_cast<T*>(align_up<T>(reinterpret_cast<const T*>(ptr)));
126 | }
127 | 
128 | 
129 | } // namespace detail
130 | } // namespace kat
131 | 
132 | #include
133 | 
134 | #endif // CUDA_KAT_POINTERS_CUH_
135 | 
--------------------------------------------------------------------------------
/src/kat/on_device/atomics.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/atomics.cuh
3 |  *
4 |  * @brief Type-generic wrappers for CUDA atomic operations
5 |  *
6 |  * CUDA's "primitive" atomic functions are non-generic C functions,
7 |  * defined only for some specific types - and sometimes only for some of the
8 |  * types of the same size for which semantics are identical. In this file
9 |  * are found type-generic variants of these same functions, with functionality
10 |  * extended as much as possible - either through recasting or using
11 |  * the compare-and-swap (compare-and-exchange) primitive to implement other
12 |  * functions for types not directly supported.
13 |  *
14 |  * Additionally, the wrapper used for emulating atomics on arbitrary types
15 |  * is made available here for the user to be able to do the same for
16 |  * arbitrary functions.
17 |  *
18 |  * @note nVIDIA makes a rather unfortunate and non-intuitive choice of parameter
19 |  * names for its atomic functions, which - at least for now, and for the sake of
20 |  * consistency - I adopt: they call a pointer an "address", and they call the
21 |  * new value "val" even if there is another value to consider (e.g. atomicCAS).
22 |  * Also, what's with the shorthand? Did you run out of disk space? :-(
23 |  */
24 | 
25 | #ifndef CUDA_KAT_ON_DEVICE_ATOMICS_CUH_
26 | #define CUDA_KAT_ON_DEVICE_ATOMICS_CUH_
27 | 
28 | #include <limits>
29 | 
30 | ///@cond
31 | #include <kat/on_device/common.cuh>
32 | ///@endcond
33 | 
34 | namespace kat {
35 | namespace atomic {
36 | 
37 | template <typename T> KAT_FD T add        (T* address, T val);
38 | template <typename T> KAT_FD T subtract   (T* address, T val);
39 | template <typename T> KAT_FD T exchange   (T* address, T val);
40 | template <typename T> KAT_FD T min        (T* address, T val);
41 | template <typename T> KAT_FD T max        (T* address, T val);
42 | template <typename T> KAT_FD T logical_and(T* address, T val);
43 | template <typename T> KAT_FD T logical_or (T* address, T val);
44 | template <typename T> KAT_FD T logical_not(T* address);
45 | template <typename T> KAT_FD T logical_xor(T* address, T val);
46 | template <typename T> KAT_FD T bitwise_or (T* address, T val);
47 | template <typename T> KAT_FD T bitwise_and(T* address, T val);
48 | template <typename T> KAT_FD T bitwise_xor(T* address, T val);
49 | template <typename T> KAT_FD T bitwise_not(T* address);
50 | template <typename T> KAT_FD T set_bit    (T* address, native_word_t bit_index);
51 | template <typename T> KAT_FD T unset_bit  (T* address, native_word_t bit_index);
52 | /**
53 |  * @brief Increment the value at @p address by 1 - but if it reaches or surpasses @p modulus, set it to 0.
54 |  *
55 |  * @note repeated invocations of this function will cycle through the range of values 0 ... @p modulus - 1; thus
56 |  * as long as the existing value is within that range, this is a simple incrementation modulo @p modulus.
57 |  */
58 | template <typename T> KAT_FD T increment  (T* address, T modulus = std::numeric_limits<T>::max());
59 | /**
60 |  * @brief Decrement the value at @p address by 1 - but if it reaches 0, or surpasses @p modulus, it is set
61 |  * to @p modulus - 1.
62 |  *
63 |  * @note repeated invocations of this function will cycle backwards through the range of values 0 ...
64 |  * @p modulus - 1; thus as long as the existing value is within that range, this is a simple decrementation
65 |  * modulo @p modulus.
66 |  */
67 | template <typename T> KAT_FD T decrement  (T* address, T modulus = std::numeric_limits<T>::max());
68 | 
69 | 
70 | // Note: We let this one take a const reference
71 | template <typename T> KAT_FD T compare_and_swap(
72 | 	T*      address,
73 | 	const T compare,
74 | 	const T val);
75 | 
76 | 
77 | /**
78 |  * Use atomic compare-and-swap to apply a unary function to some value,
79 |  * replacing it at its memory location with the result before anything
80 |  * else changes it.
81 |  *
82 |  * @return The new value which was stored in memory
83 |  */
84 | template <typename UnaryFunction, typename T>
85 | KAT_FD T apply_atomically(UnaryFunction f, T* address);
86 | 
87 | /**
88 |  * Use atomic compare-and-swap to apply a binary function to two values,
89 |  * replacing the first at its memory location with the result before anything
90 |  * else changes it.
91 |  *
92 |  * @return The new value which was stored in memory
93 |  */
94 | template <typename Function, typename T, typename... Ts>
95 | KAT_FD T apply_atomically(
96 | 	Function        f,
97 | 	T* __restrict__ address,
98 | 	const Ts...     xs);
99 | 
100 | 
101 | } // namespace atomic
102 | } // namespace kat
103 | 
104 | #include "detail/atomics.cuh"
105 | 
106 | #endif // CUDA_KAT_ON_DEVICE_ATOMICS_CUH_
--------------------------------------------------------------------------------
/src/kat/on_device/collaboration/grid.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/collaboration/grid.cuh
3 |  *
4 |  * @brief CUDA device computation grid-level primitives, i.e. those involving
5 |  * interaction of threads from different blocks in the grid
6 |  *
7 |  */
8 | 
9 | #pragma once
10 | #ifndef CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_GRID_CUH_
11 | #define CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_GRID_CUH_
12 | 
13 | #include "warp.cuh"
14 | 
15 | #include <kat/on_device/common.cuh>
16 | #include <kat/on_device/grid_info.cuh>
17 | #include <kat/on_device/math.cuh>
18 | 
19 | #include <type_traits>
20 | 
21 | 
22 | ///@cond
23 | #include <kat/detail/execution_space_specifiers.hpp>
24 | ///@endcond
25 | 
26 | namespace kat {
27 | namespace linear_grid {
28 | namespace collaborative {
29 | namespace grid {
30 | 
31 | // If we want to refer to other primitives, we'll make those references explicit;
32 | // but we do want to be able to say `warp::index()` without prefixing that with anything.
33 | 
34 | namespace grid   = kat::linear_grid::grid_info::grid;
35 | namespace block  = kat::linear_grid::grid_info::block;
36 | namespace warp   = kat::linear_grid::grid_info::warp;
37 | namespace thread = kat::linear_grid::grid_info::thread;
38 | namespace lane   = kat::linear_grid::grid_info::lane;
39 | 
40 | /**
41 |  * Have all kernel threads perform some action over the linear range
42 |  * of 0..length-1, at strides equal to the grid length, i.e. a thread
43 |  * with index i_t in block with index i_b, where block lengths are n_b
44 |  * and the grid has g blocks, will perform the action on elements
45 |  * i_b * n_b + i_t, then advance by g * n_b each time, and so on.
46 |  *
47 |  * Thus, if in the following chart the rectangles represent
48 |  * consecutive segments of n_b integers, the numbers
49 |  * indicate which blocks work on which elements in "grid stride":
50 |  *
51 |  *  -------------------------------------------------------
52 |  *  |   1   |  222  |  333  |   1   |  222  |  333  |   1   |
53 |  *  |  11   | 2   2 | 3   3 |  11   | 2   2 | 3   3 |  11   |
54 |  *  |   1   |     2 |     3 |   1   |     2 |     3 |   1   |
55 |  *  |   1   |  222  |    3  |   1   |  222  |    3  |   1   |
56 |  *  |   1   | 2     | 3   3 |   1   | 2     | 3   3 |   1   |
57 |  *  |  111  | 22222 |  333  |  111  | 22222 |  333  |  111  |
58 |  *  -------------------------------------------------------
59 |  *
60 |  * (the grid is 3 blocks' worth, so block 1 strides 3 blocks
61 |  * from one sequence of indices it processes to the next.)
62 |  * This is unlike `at_block_stride()`, for which instead
63 |  * of 1, 2, 3, 1, 2, 3, 1 we would have 1, 1, 1, 2, 2, 2, 3
64 |  * (or 1, 1, 2, 2, 3, 3, 4 if the grid has 4 blocks).
65 |  *
66 |  * @note assumes the number of grid threads is fixed (does that
67 |  * always hold? even with dynamic parallelism?)
68 |  *
69 |  * @param length The length of the range (of integers) on which to act
70 |  * @param f The callable to call for each element of the sequence.
71 |  */
72 | template <typename Function, typename Size = size_t>
73 | KAT_FD void at_grid_stride(Size length, const Function& f)
74 | {
75 | 	auto num_grid_threads = grid::num_threads();
76 | 	for(promoted_size_t<Size> pos = thread::global_id();
77 | 		pos < length;
78 | 		pos += num_grid_threads)
79 | 	{
80 | 		f(pos);
81 | 	}
82 | }
83 | 
84 | namespace warp_per_input_element {
85 | 
86 | /**
87 |  * A variant of the one-position-per-thread applicator,
88 |  * `collaborative::grid::at_grid_stride()`: Here each warp works on one
89 |  * input position, advancing by 'grid stride' in the sense of total
90 |  * warps in the grid.
91 |  *
92 |  * @note it is assumed the grid only has fully-active warps; any
93 |  * possibly-inactive threads are not given consideration.
94 |  *
95 |  * @note This version of `at_grid_stride` is specific to linear grids,
96 |  * even though the text of its code looks the same as that of
97 |  * @ref kat::grid_info::collaborative::warp::at_grid_stride .
98 |  *
99 |  * @param length The length of the range of positions on which to act
100 |  * @param f The callable for warps to apply to each position in the sequence
101 |  */
102 | template <typename Function, typename Size = size_t>
103 | KAT_FD void at_grid_stride(Size length, const Function& f)
104 | {
105 | 	auto num_warps_in_grid = grid_info::grid::num_warps();
106 | 	for(// _not_ the global thread index! - one element per warp
107 | 		promoted_size_t<Size> pos = grid_info::warp::global_id();
108 | 		pos < length;
109 | 		pos += num_warps_in_grid)
110 | 	{
111 | 		f(pos);
112 | 	}
113 | }
114 | 
115 | 
116 | } // namespace warp_per_input_element
117 | 
118 | 
119 | /**
120 |  * Have all grid threads perform some action over the linear range
121 |  * of 0..length-1, with each thread acting on a fixed number of items
122 |  * (the @p serialization_factor) at a stride of the block length,
123 |  * i.e. a thread with index i_t in
124 |  * block with index i_b, where block lengths are n_b,
125 |  * will perform the action on elements
126 |  *
127 |  *  n_b * i_b * serialization_factor + i_t,
128 |  *  n_b * (i_b * serialization_factor + 1) + i_t,
129 |  *  n_b * (i_b * serialization_factor + 2) + i_t,
130 |  *
131 |  * and so on. For lengths which are not divisible by n_b *
132 |  * serialization_factor, threads in the last block will
133 |  * work on fewer items.
134 |  *
135 |  * Thus, if in the following chart the rectangles represent
136 |  * consecutive segments of n_b integers, the numbers
137 |  * indicate which blocks work on which elements in "block stride":
138 |  *
139 |  *  -------------------------------------------------------
140 |  *  |   1   |   1   |  222  |  222  |  333  |  333  |    4  |
141 |  *  |  11   |  11   | 2   2 | 2   2 | 3   3 | 3   3 |   44  |
142 |  *  |   1   |   1   |     2 |     2 |     3 |     3 |  4 4  |
143 |  *  |   1   |   1   |  222  |  222  |    3  |    3  | 4  4  |
144 |  *  |   1   |   1   | 2     | 2     | 3   3 | 3   3 | 44444 |
145 |  *  |  111  |  111  | 22222 | 22222 |  333  |  333  |    4  |
146 |  *  -------------------------------------------------------
147 |  *
148 |  * (A block strides from one blocks' worth of indices to the next.)
149 |  * This is unlike `at_grid_stride()`, for which instead
150 |  * of 1, 1, 2, 2, 3, 3, 4 we would have 1, 2, 3, 1, 2, 3, 1 (if the
151 |  * grid has 3 blocks) or 1, 2, 3, 4, 1, 2 (if the grid has 4 blocks).
152 |  *
153 |  * @note Theoretically, the @p serialization_factor value could be
154 |  * computed by this function itself. This is avoided, assuming that's
155 |  * been taken care of before. Specifically, we assume that the
156 |  * @p serialization_factor is no higher than it absolutely
157 |  * must be.
158 |  *
159 |  * @note There's a block-level variant of this primitive, but there -
160 |  * each block applies f to the _same_ range of elements, rather than
161 |  * covering part of a larger range.
162 |  *
163 |  * @note This implementation does not handle cases of overflow of
164 |  * the Size type, e.g. if your Size is uint32_t and @p
165 |  * length is close to 2^32 - 1, the function may fail.
166 |  *
167 |  * @note There's a tricky tradeoff here between avoiding per-iteration
168 |  * checks for whether we're past the end, and avoiding too many
169 |  * initial checks. Two of the template parameters help us avoid
170 |  * this tradeoff in certain cases, by letting us skip explicit
171 |  * checks for some conditions.
172 |  *
173 |  *
174 |  * @param length The length of the range (of integers) on which to act
175 |  * @param serialization_factor the number of elements each thread is to
176 |  * handle (serially)
177 |  * @param f The callable to execute for each element of the sequence.
178 |  *
179 |  */
180 | template <
181 | 	typename Function,
182 | 	typename Size = size_t,
183 | 	bool AssumeLengthIsDivisibleByBlockSize = false,
184 | 	bool GridMayFullyCoverLength = true,
185 | 	typename SerializationFactor = unsigned>
186 | KAT_FD void at_block_stride(
187 | 	Size length,
188 | 	const Function& f,
189 | 	SerializationFactor serialization_factor)
190 | {
191 | 	auto block_length = block::length();
192 | 	auto num_elements_to_process_by_each_block = serialization_factor * block_length;
193 | 	Size block_start_pos = num_elements_to_process_by_each_block * block::index();
194 | 	Size pos = block_start_pos + thread::index();
195 | 	if (pos >= length) { return; }
196 | 	auto in_last_acting_block = (block_start_pos + num_elements_to_process_by_each_block >= length);
197 | 	// Note: Be careful about overflow in this last line, if block_start_pos is close
198 | 	// to the maximum value of Size.
199 | 
200 | 	if (in_last_acting_block) {
201 | 		#pragma unroll
202 | 		for(; pos < length; pos += block_length) {
203 | 			f(pos);
204 | 		}
205 | 		return;
206 | 	}
207 | 	// If we're not in the last block which needs to take any action, we assume that we'll perform
208 | 	// full iterations and don't need to check for overstepping any bounds
209 | 	#pragma unroll
210 | 	for(SerializationFactor i = 0; i < serialization_factor; i++, pos += block_length) {
211 | 		f(pos);
212 | 	}
213 | }
214 | 
215 | } // namespace grid
216 | } // namespace collaborative
217 | } // namespace linear_grid
218 | } // namespace kat
219 | 
220 | #endif // CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_GRID_CUH_
--------------------------------------------------------------------------------
/src/kat/on_device/common.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/common.cuh
3 |  *
4 |  * @brief Some basic type and constant definitions used by all
5 |  * device-side CUDA KAT code
6 |  *
7 |  */
8 | #pragma once
9 | #ifndef CUDA_KAT_ON_DEVICE_COMMON_CUH_
10 | #define CUDA_KAT_ON_DEVICE_COMMON_CUH_
11 | 
12 | #include <cuda_runtime.h>
13 | #include <climits> // for CHAR_BIT
14 | #include <type_traits>
15 | 
16 | ///@cond
17 | #include <kat/detail/execution_space_specifiers.hpp>
18 | ///@endcond
19 | 
20 | namespace kat {
21 | 
22 | /**
23 |  * CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
24 |  * In each of these, the number of blocks per grid is specified in this type.
25 |  *
26 |  * @note Theoretically, CUDA could split the type for blocks per grid and
27 |  * threads per block, but for now they're the same.
28 |  *
29 |  * @note All three dimensions in dim3 are of the same type as dim3::x
30 |  */
31 | using grid_dimension_t = decltype(dim3::x);
32 | 
33 | /**
34 |  * CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
35 |  * In each of these, the number of threads per block is specified in this type.
36 |  *
37 |  * @note Theoretically, CUDA could split the type for blocks per grid and
38 |  * threads per block, but for now they're the same.
39 |  */
40 | using grid_block_dimension_t = grid_dimension_t;
41 | 
42 | using native_word_t = unsigned; // TODO: Make this uint32_t perhaps?
43 | enum : native_word_t { warp_size = 32 };
44 | enum : native_word_t { log_warp_size = 5 };
45 | 
46 | /**
47 |  * @brief a size type no smaller than a native word.
48 |  *
49 |  * Sometimes, in device code, we only need our size type to cover a small
50 |  * range of values; but - it is still more effective to use a full native word,
51 |  * rather than to risk extra instructions to enforce the limits of
52 |  * sub-native-word values. And while it's true this might not help much,
53 |  * or be optimized away - let's be on the safe side anyway.
54 |  */
55 | template <typename Size>
56 | using promoted_size_t = typename std::common_type<Size, native_word_t>::type;
57 | 
58 | /**
59 |  * A mask with one bit for each lane in a warp. Used to indicate which threads
60 |  * meet a certain criterion or need to have some action applied to them.
61 |  *
62 |  * @todo: Consider using a 32-bit bit field
63 |  */
64 | using lane_mask_t = unsigned;
65 | 
66 | enum : lane_mask_t {
67 | 	full_warp_mask  = 0xFFFFFFFF, //!< Bits turned on for all lanes in the warp
68 | 	empty_warp_mask = 0x0,        //!< Bits turned off for all lanes in the warp
69 | };
70 | 
71 | 
72 | /**
73 |  * The number of bits in the representation of a value of type T.
74 |  *
75 |  * @note with this variant, you'll need to manually specify the type.
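 *
 * E.g. (assuming a platform on which int is 4 bytes, as on CUDA targets):
 *
 *   static_assert(kat::size_in_bits<int>() == 32, "");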
76 | */ 77 | template 78 | constexpr std::size_t size_in_bits() { return sizeof(T) * CHAR_BIT; } 79 | 80 | //constexpr KAT_FHD bool operator==(const dim3& lhs, const dim3& rhs) 81 | //{ 82 | // return lhs.x == rhs.x and lhs.y == rhs.y and lhs.z == rhs.z; 83 | //} 84 | 85 | /** 86 | * The number bits in the representation of a value of type T 87 | * 88 | * @note with this variant, the type will be deduced from the 89 | * object you pass. 90 | */ 91 | template 92 | constexpr std::size_t size_in_bits(const T&) { return sizeof(T) * CHAR_BIT; } 93 | 94 | 95 | /* 96 | namespace detail { 97 | 98 | ** 99 | * Use CUDA intrinsics where possible and relevant to reinterpret the bits 100 | * of values of different types 101 | * 102 | * @param x[in] the value to reinterpret. No references please! 103 | * @return the reinterpreted value 104 | * 105 | template 106 | KAT_FD Interpreted reinterpret( 107 | typename std::enable_if< 108 | !std::is_same< 109 | typename std::decay::type, // I actually just don't want references here 110 | typename std::decay::type>::value && // I actually just don't want references here 111 | sizeof(ToInterpret) == sizeof(Interpreted), ToInterpret>::type x) 112 | { 113 | return x; 114 | } 115 | 116 | template<> KAT_FD double reinterpret(long long int x) { return __longlong_as_double(x); } 117 | template<> KAT_FD long long int reinterpret(double x) { return __double_as_longlong(x); } 118 | 119 | template<> KAT_FD double reinterpret(unsigned long long int x) { return __longlong_as_double(x); } 120 | template<> KAT_FD unsigned long long int reinterpret(double x) { return __double_as_longlong(x); } 121 | 122 | template<> KAT_FD float reinterpret(int x) { return __int_as_float(x); } 123 | template<> KAT_FD int reinterpret(float x) { return __float_as_int(x); } 124 | 125 | } // namespace detail 126 | */ 127 | 128 | /** 129 | * @note Interpreted can be either a value or a reference type. 130 | * 131 | * @todo Would it be better to return a reference? 132 | */ 133 | template 134 | KAT_FHD Interpreted reinterpret(Original& x) 135 | { 136 | return reinterpret_cast(x); 137 | } 138 | 139 | } // namespace kat 140 | 141 | #endif // CUDA_KAT_ON_DEVICE_COMMON_CUH_ 142 | -------------------------------------------------------------------------------- /src/kat/on_device/detail/atomics/missing_in_cuda.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2019, Eyal Rozenberg 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 9 | * * Redistributions of source code must retain the above copyright notice, 10 | * this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of CWI Amsterdam nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | * POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | */ 31 | #pragma once 32 | #ifndef CUDA_CUDA_KAT_ON_DEVICE_ATOMICS_MISSING_FROM_CUDA_CUH_ 33 | #define CUDA_CUDA_KAT_ON_DEVICE_ATOMICS_MISSING_FROM_CUDA_CUH_ 34 | 35 | #include 36 | #include 37 | 38 | static_assert(sizeof (unsigned long int) == sizeof(unsigned long long int), "Unexpected size of unsigned long int"); 39 | 40 | // Annoyingly, CUDA - upto and including version 10.2 - provide atomic 41 | // operation wrappers for unsigned int and unsigned long long int, but 42 | // not for the in-between type of unsigned long int. Also, some atomic 43 | // operations are provided for ints, i.e. not just for unsigned types, 44 | // but not consistently, i.e. int yes, long long int no, despite being 45 | // provided for unsigned long long int. So - we have to fill the gap. 46 | // 47 | // TODO: On CUDA devices, sizeof(long) is 8, like sizeof(long long). However, 48 | // that's not true on Windows host-side code. Need to double check this discrepancy 49 | // doesn't mess this code's correctness up somehow. 50 | 51 | #define CUDA_KAT_DEFINE_MISSING_ATOMIC(arg_type, op) \ 52 | KAT_FD arg_type atomic ## op(arg_type *address, arg_type val) \ 53 | { \ 54 | static_assert(sizeof(long) == sizeof(int) or sizeof(long) == sizeof(long long int), "Unexpected sizeof(long)"); \ 55 | return (sizeof(arg_type) == sizeof(unsigned int)) ? 
56 | 		::atomic ## op(reinterpret_cast<unsigned int*>(address), reinterpret_cast<unsigned int&>(val)) : \
57 | 		::atomic ## op(reinterpret_cast<unsigned long long int*>(address), reinterpret_cast<unsigned long long int&>(val)); \
58 | }
59 | 
60 | #define CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(op) \
61 | CUDA_KAT_DEFINE_MISSING_ATOMIC(unsigned long, op) \
62 | CUDA_KAT_DEFINE_MISSING_ATOMIC(long, op) \
63 | CUDA_KAT_DEFINE_MISSING_ATOMIC(long long, op)
64 | 
65 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Add)
66 | #if __CUDA_ARCH__ >= 320
67 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(And)
68 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Or)
69 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Xor)
70 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Min)
71 | CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP(Max)
72 | #endif
73 | 
74 | #undef CUDA_KAT_DEFINE_MISSING_ATOMICS_FOR_OP
75 | #undef CUDA_KAT_DEFINE_MISSING_ATOMIC
76 | 
77 | #endif // CUDA_CUDA_KAT_ON_DEVICE_ATOMICS_MISSING_FROM_CUDA_CUH_
78 | 
-------------------------------------------------------------------------------- /src/kat/on_device/detail/itoa.cuh: --------------------------------------------------------------------------------
1 | #ifndef KAT_ON_DEVICE_DETAIL_ITOA_CUH_
2 | #define KAT_ON_DEVICE_DETAIL_ITOA_CUH_
3 | 
4 | #include 
5 | 
6 | namespace kat {
7 | namespace detail {
8 | 
9 | template <typename T> struct max_num_digits { };
10 | template <> struct max_num_digits<uint8_t>  { static constexpr const unsigned value { 3 }; };
11 | template <> struct max_num_digits<uint16_t> { static constexpr const unsigned value { 5 }; };
12 | template <> struct max_num_digits<uint32_t> { static constexpr const unsigned value { 10 }; };
13 | template <> struct max_num_digits<uint64_t> { static constexpr const unsigned value { 20 }; };
14 | 
15 | template <typename I>
16 | inline KAT_DEV unsigned integer_to_string_reversed(I value, char* buffer)
17 | {
18 | 	bool append_minus {
19 | #pragma push
20 | #pragma diag_suppress = unsigned_compare_with_zero
21 | 		std::is_signed<I>::value and (value < 0)
22 | #pragma pop
23 | 	};
24 | 	value = builtins::absolute_value(value);
25 | 
26 | 	char *reverse_ptr = buffer;
27 | 	do {
28 | 		*reverse_ptr++ = '0' + (value % 10);
29 | 		value /= 10;
30 | 	} while (value > 0);
31 | 
32 | 	if (append_minus) { *reverse_ptr++ = '-'; }
33 | 	return reverse_ptr - buffer;
34 | }
35 | 
36 | inline KAT_DEV char* copy_in_reverse(char* dst, const char* src, std::size_t length)
37 | {
38 | 	for(std::size_t i = 0; i < length; i++) {
39 | 		dst[i] = src[length - i - 1];
40 | 	}
41 | 	return dst;
42 | }
43 | 
44 | // This is not supposed to be optimal, just a straightforward short implementation
45 | template <bool WriteTermination = true, typename I>
46 | inline KAT_DEV unsigned integer_to_string(I value, char* buffer)
47 | {
48 | 	using unsigned_type = typename std::make_unsigned<I>::type;
49 | 	char reverse_buffer[max_num_digits<unsigned_type>::value];
50 | 	auto length = integer_to_string_reversed(value, reverse_buffer);
51 | 	copy_in_reverse(buffer, reverse_buffer, length);
52 | 	if (WriteTermination) { buffer[length] = '\0'; }
53 | 	return length;
54 | }
55 | 
56 | } // namespace detail
57 | } // namespace kat
58 | 
59 | 
60 | #endif // KAT_ON_DEVICE_DETAIL_ITOA_CUH_
61 | 
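
A sketch of how these itoa helpers compose, device-side (the buffer is sized via max_num_digits, plus one slot each for a possible minus sign and the terminating NUL; the function name is illustrative only):

    __device__ void print_int(int x)
    {
        char buf[kat::detail::max_num_digits<uint32_t>::value + 2];
        unsigned length = kat::detail::integer_to_string(x, buf); // also writes the '\0'
        printf("%s (%u characters)\n", buf, length);
    }
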
-------------------------------------------------------------------------------- /src/kat/on_device/math.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/math.cuh
3 |  *
4 |  * @brief Templatized mathematical function definitions for integer and floating-point types.
5 |  *
6 |  * CUDA has many mathematical primitives - which are already found in @ref `builtins.cuh`.
7 |  * However, they are often not defined for all types; and - some functions are missing
8 |  * (e.g. @ref `gcd()`) or can benefit from specialization (e.g. division by a power of 2).
9 |  * This file has the wider selection of functions, utilizing a primitive (from `builtins::`)
10 |  * when relevant, and a multi-instruction implementation otherwise.
11 |  *
12 |  * @note Including this file is sufficient for accessing all functions in
13 |  * @ref `constexpr_math.cuh`.
14 |  */
15 | #pragma once
16 | #ifndef CUDA_KAT_ON_DEVICE_MATH_CUH_
17 | #define CUDA_KAT_ON_DEVICE_MATH_CUH_
18 | 
19 | #include "common.cuh"
20 | #include "constexpr_math.cuh"
21 | #include 
22 | 
23 | 
24 | ///@cond
25 | #include 
26 | ///@endcond
27 | 
28 | #include 
29 | 
30 | namespace kat {
31 | 
32 | /**
33 |  * @brief compute the base-two logarithm of a number known to be a power of 2.
34 |  *
35 |  * @note Yes, this is trivial to do, but:
36 |  * 1. This says _what_ you're doing, not _how_ you do it (e.g. left-shifting
37 |  *    bits and such)
38 |  * 2. There's a device-side optimization here (which isn't constexpr)
39 |  *
40 |  * @param p an integral power of 2
41 |  * @return the exponent l such that 2^l equals p
42 |  */
43 | template <typename I>
44 | KAT_FD unsigned log2_of_power_of_2(I p)
45 | {
46 | 	static_assert(std::is_integral<I>::value, "Only supported for integers");
47 | 	// Remember 0 is _not_ a power of 2.
48 | 	return builtins::population_count(p - 1);
49 | }
50 | 
51 | /**
52 |  * A variant of `div_rounding_up` (which you can find in `constexpr_math.cuh`),
53 |  * which has (non-constexpr, unfortunately) optimizations based on the knowledge that
54 |  * the divisor is a power of 2
55 |  *
56 |  * @return @p dividend divided by @p divisor, rounded up to the nearest integer
57 |  */
58 | template <typename T, typename S>
59 | KAT_FD T div_by_power_of_2_rounding_up(const T& dividend, const S& divisor)
60 | {
61 | 	auto mask = divisor - 1; // Remember: 0 is _not_ a power of 2
62 | 	auto log_2_of_divisor = log2_of_power_of_2(divisor);
63 | 	auto correction_for_rounding_up = ((dividend & mask) != 0); // note the parentheses: & binds more weakly than !=
64 | 
65 | 	return (dividend >> log_2_of_divisor) + correction_for_rounding_up;
66 | }
67 | 
68 | 
69 | template <typename I, typename P>
70 | constexpr KAT_FD I div_by_power_of_2(I dividend, P power_of_2)
71 | {
72 | 	return dividend >> log2_of_power_of_2(power_of_2);
73 | }
74 | 
75 | 
76 | 
77 | #if __cplusplus < 201402L
78 | /**
79 |  * @brief compute the greatest common divisor (gcd) of two values.
80 |  *
81 |  * @param u One integral value (prefer making this the larger one)
82 |  * @param v Another integral value (prefer making this the smaller one)
83 |  * @return the largest T value d such that d divides @p u and d divides @p v.
84 |  */
85 | template <typename T>
86 | KAT_FD T gcd(T u, T v) // not constexpr here: C++11 constexpr functions can't contain loops
87 | {
88 | 	static_assert(std::is_integral<T>::value, "Only supported for integers");
89 | 	while (v != 0) {
90 | 		T r = u % v;
91 | 		u = v;
92 | 		v = r;
93 | 	}
94 | 	return u;
95 | }
96 | // ... while for C++14 and later, constexpr_math.cuh has a constexpr implementation, which we don't need to redo here
97 | #endif
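
Two of the functions above, traced on small inputs (a sketch; the wrapping function is hypothetical):

    KAT_FD void example()
    {
        auto q = kat::div_by_power_of_2_rounding_up(10u, 4u);
            // mask == 3; (10 >> 2) == 2; (10 & 3) != 0 adds 1; so q == 3
        auto g = kat::gcd(12, 18);
            // (u,v): (12,18) -> (18,12) -> (12,6) -> (6,0); so g == 6
    }
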
98 | 
99 | /**
100 |  * @brief compute the least common multiple (LCM) of two integer values
101 |  *
102 |  * @tparam I an integral (or integral-number-like) type
103 |  *
104 |  * @param u One of the numbers which must divide the result
105 |  * @param v Another one of the numbers which must divide the result
106 |  * @return The lowest positive I value which both @p u and @p v divide
107 |  */
108 | template <typename I>
109 | KAT_FD I lcm(I u, I v)
110 | {
111 | 	static_assert(std::is_integral<I>::value, "Only supported for integers at the moment");
112 | 	return (u / gcd(u,v)) * v;
113 | }
114 | 
115 | namespace detail {
116 | 
117 | 
118 | template <typename I> KAT_FD int count_leading_zeros(I x)
119 | {
120 | 	static_assert(std::is_integral<I>::value, "Only integral types are supported");
121 | 	static_assert(sizeof(I) <= sizeof(long long), "Unexpectedly large type");
122 | 
123 | 	using native_clz_type =
124 | 		typename std::conditional< sizeof(I) <= sizeof(int), int, long long >::type;
125 | 	enum : int { width_difference_in_bits = (sizeof(native_clz_type) - sizeof(I)) * CHAR_BIT };
126 | 	return builtins::count_leading_zeros(static_cast<native_clz_type>(x)) - width_difference_in_bits;
127 | }
128 | 
129 | }
130 | 
131 | /**
132 |  * @brief compute the (integral) base-two logarithm of a number
133 |  *
134 |  * @note Yes, this is trivial to do, but:
135 |  * 1. This says _what_ you're doing, not _how_ you do it (e.g. left-shifting
136 |  *    bits and such)
137 |  * 2. There's a device-side optimization here (which isn't constexpr)
138 |  *
139 |  * @param x a positive value
140 |  * @return floor(log2(x)), i.e. the greatest exponent l such that 2^l <= x
141 |  */
142 | template <typename I>
143 | KAT_FD unsigned log2(I x) {
144 | 	assert(x > 0);
145 | 	return I{CHAR_BIT * sizeof(I) - I{1} } - detail::count_leading_zeros(x);
146 | }
147 | 
148 | namespace detail {
149 | 
150 | template <typename T> KAT_FD T minimum(std::integral_constant<bool, false>, T x, T y)
151 | {
152 | 	return x < y ? x : y;
153 | }
154 | 
155 | template <typename T> KAT_FD T minimum(std::integral_constant<bool, true>, T x, T y)
156 | {
157 | 	return builtins::minimum(x, y);
158 | }
159 | 
160 | 
161 | template <typename T> KAT_FD T maximum(std::integral_constant<bool, false>, T x, T y)
162 | {
163 | 	return x > y ? x : y;
164 | }
165 | 
166 | template <typename T> KAT_FD T maximum(std::integral_constant<bool, true>, T x, T y)
167 | {
168 | 	return builtins::maximum(x, y);
169 | }
170 | 
171 | template <typename T> KAT_FD T absolute_value(std::integral_constant<bool, false>, T x)
172 | {
173 | 	return (std::is_unsigned<T>::value or x >= 0) ? x : -x;
174 | }
175 | 
176 | template <typename T> KAT_FD T absolute_value(std::integral_constant<bool, true>, T x)
177 | {
178 | 	return builtins::absolute_value(x);
179 | }
180 | 
181 | } // namespace detail
182 | 
183 | template <typename T> KAT_FD T minimum(T x, T y)
184 | {
185 | 	// TODO: Check at compile-time whether the builtin is instantiated or not - without duplicating the list of types here
186 | 	return detail::minimum(std::integral_constant<bool,
187 | 		std::is_same<T, int                >::value or
188 | 		std::is_same<T, unsigned int       >::value or
189 | 		std::is_same<T, long               >::value or
190 | 		std::is_same<T, unsigned long      >::value or
191 | 		std::is_same<T, long long          >::value or
192 | 		std::is_same<T, unsigned long long >::value or
193 | 		std::is_same<T, float              >::value or
194 | 		std::is_same<T, double             >::value>{},
195 | 		x, y);
196 | }
197 | 
198 | template <typename T> KAT_FD T maximum(T x, T y)
199 | {
200 | 	// TODO: Check at compile-time whether the builtin is instantiated or not - without duplicating the list of types here
201 | 	return detail::maximum(std::integral_constant<bool,
202 | 		std::is_same<T, int                >::value or
203 | 		std::is_same<T, unsigned int       >::value or
204 | 		std::is_same<T, long               >::value or
205 | 		std::is_same<T, unsigned long      >::value or
206 | 		std::is_same<T, long long          >::value or
207 | 		std::is_same<T, unsigned long long >::value or
208 | 		std::is_same<T, float              >::value or
209 | 		std::is_same<T, double             >::value>{},
210 | 		x, y);
211 | }
212 | 
213 | template <typename T> KAT_FD T absolute_value(T x)
214 | {
215 | 	// TODO: Check at compile-time whether the builtin is instantiated or not - without duplicating the list of types here
216 | 	return detail::absolute_value(std::integral_constant<bool,
217 | 		std::is_same<T, short      >::value or
218 | 		std::is_same<T, int        >::value or
219 | 		std::is_same<T, long       >::value or
220 | 		std::is_same<T, long long  >::value or
221 | 		std::is_same<T, float      >::value or
222 | 		std::is_same<T, double     >::value>{},
223 | 		x);
224 | }
225 | 
226 | } // namespace kat
227 | 
228 | #endif // CUDA_KAT_ON_DEVICE_MATH_CUH_
229 | 
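
The net effect of the dispatch above, sketched: types on the list are routed to the `builtins::` version, anything else falls back to a plain comparison (function name hypothetical):

    __device__ void example(float a, float b, char c, char d)
    {
        auto m1 = kat::minimum(a, b); // float is on the list - uses builtins::minimum
        auto m2 = kat::minimum(c, d); // char isn't - uses the comparison-based fallback
    }
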
-------------------------------------------------------------------------------- /src/kat/on_device/miscellany.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file kat/on_device/miscellany.cuh
3 |  *
4 |  * @brief Miscellaneous functions provided by cuda-kat which are not a good
5 |  * fit in any other header.
6 |  */
7 | #pragma once
8 | #ifndef CUDA_KAT_ON_DEVICE_MISCELLANY_CUH_
9 | #define CUDA_KAT_ON_DEVICE_MISCELLANY_CUH_
10 | 
11 | #include "common.cuh"
12 | #include 
13 | 
14 | #include 
15 | #include 
16 | #include 
17 | 
18 | ///@cond
19 | #include 
20 | ///@endcond
21 | 
22 | namespace kat {
23 | 
24 | namespace detail {
25 | 
26 | template <unsigned NumBytes, bool Signed> struct integer_type_struct;
27 | template <> struct integer_type_struct<1, false> { using type = std::uint8_t;  };
28 | template <> struct integer_type_struct<2, false> { using type = std::uint16_t; };
29 | template <> struct integer_type_struct<4, false> { using type = std::uint32_t; };
30 | template <> struct integer_type_struct<8, false> { using type = std::uint64_t; };
31 | template <> struct integer_type_struct<1, true>  { using type = std::int8_t;   };
32 | template <> struct integer_type_struct<2, true>  { using type = std::int16_t;  };
33 | template <> struct integer_type_struct<4, true>  { using type = std::int32_t;  };
34 | template <> struct integer_type_struct<8, true>  { using type = std::int64_t;  };
35 | 
36 | // TODO: Consider pushing these types upwards into kat:: proper.
37 | 
38 | /**
39 |  * A templating by size of the signed integer types
40 |  */
41 | template <unsigned NumBytes>
42 | using int_t = typename detail::integer_type_struct<NumBytes, true>::type;
43 | 
44 | /**
45 |  * A templating by size of the unsigned integer types
46 |  */
47 | template <unsigned NumBytes>
48 | using uint_t = typename detail::integer_type_struct<NumBytes, false>::type;
49 | 
50 | 
51 | /**
52 |  * @note Assumes num_elements_to_copy > 0 and the same misalignment of the source
53 |  * and destination w.r.t. native words.
54 |  */
55 | KAT_FD void copy(
56 | 	uint32_t*       __restrict__  destination,
57 | 	const uint32_t* __restrict__  source,
58 | 	std::size_t                   num_elements_to_copy)
59 | {
60 | 	while (num_elements_to_copy-- > 0) {
61 | 		*(destination++) = *(source++);
62 | 	}
63 | }
64 | 
65 | /**
66 |  * @note Assumes num_elements_to_copy > 0 and the same misalignment of the source
67 |  * and destination w.r.t. native words.
68 |  */
69 | KAT_FD void copy(
70 | 	uint16_t*       __restrict__  destination,
71 | 	const uint16_t* __restrict__  source,
72 | 	std::size_t                   num_elements_to_copy)
73 | {
74 | 	bool got_non_word_head = not is_aligned(destination);
75 | 	if (got_non_word_head) {
76 | 		*(destination++) = *(source++);
77 | 		num_elements_to_copy--;
78 | 	}
79 | 	auto num_words_to_copy =
80 | 		num_elements_to_copy / ((sizeof(native_word_t) / sizeof(uint16_t)));
81 | 		// ... so, half as many words as elements;
82 | 	detail::copy(
83 | 		reinterpret_cast<native_word_t*>(destination),
84 | 		reinterpret_cast<const native_word_t*>(source),
85 | 		num_words_to_copy
86 | 	);
87 | 	bool got_non_word_tail = not is_aligned(destination + num_elements_to_copy);
88 | 	if (got_non_word_tail) {
89 | 		destination[num_elements_to_copy - 1] = source[num_elements_to_copy - 1];
90 | 	}
91 | }
92 | 
93 | /**
94 |  * @note Assumes num_elements_to_copy > 0 and the same misalignment of the source
95 |  * and destination w.r.t. native words.
96 |  */
97 | KAT_FD void copy(
98 | 	uint8_t*       __restrict__  destination,
99 | 	const uint8_t* __restrict__  source,
100 | 	std::size_t                  num_elements_to_copy)
101 | {
102 | 	// TODO: Improve this implementation to use native-word copies as much as possible, just like the 2-byte case
103 | 	if (num_elements_to_copy > 0) {
104 | 		::memcpy(destination, source, num_elements_to_copy * sizeof(uint8_t));
105 | 	}
106 | }
107 | 
108 | } // namespace detail
109 | 
110 | /**
111 |  * Copies some data from one location to another - using the native register
112 |  * size for individual elements on CUDA GPUs, i.e. sizeof(int) = 4
113 |  *
114 |  * @note CUDA's own general-purpose memcpy() takes void pointers and uses a u8 (byte)
115 |  * LD-ST loop. See: @url https://godbolt.org/z/9ChTPM ; this one LD-ST's using the native
116 |  * register size, 4 bytes, when possible.
117 |  *
118 |  * @note this function assumes appropriate alignment.
119 |  *
120 |  * @note Instead of using this function, you're probably better off using a warp-level
121 |  * or block-level primitive for copying data.
122 |  *
123 |  * @param destination Destination of the copy. Must have at least
124 |  * sizeof(T) * {@p num_elements_to_copy} bytes allocated. Data must be self-aligned, i.e. the
125 |  * numeric value of this parameter must be divisible by sizeof(T).
126 |  * @param source The beginning of the memory region from which to copy.
127 |  * There must be sizeof(T) * {@p num_elements_to_copy} bytes readable starting with
128 |  * this address. Data must be self-aligned, i.e. the numeric value of this parameter
129 |  * must be divisible by sizeof(T).
130 |  * @param num_elements_to_copy the number of elements of data to copy - not their
131 |  * total size in bytes!
132 |  * @return the destination pointer
133 |  */
134 | template <bool AssumeSameAlignmentWithinWord = true, typename T>
135 | KAT_FD T* copy(
136 | 	T*        __restrict__  destination,
137 | 	const T*  __restrict__  source,
138 | 	std::size_t             num_elements_to_copy)
139 | {
140 | 	// This function uses the native word size explicitly in a few places, so:
141 | 	static_assert(sizeof(native_word_t) == sizeof(uint32_t), "unexpected size of native word");
142 | 
143 | 	if (not std::is_trivially_copyable<T>::value) {
144 | 		// Can't optimize, must use T::operator=
145 | 		for(std::size_t i = 0; i < num_elements_to_copy; i++) {
146 | 			destination[i] = source[i];
147 | 		}
148 | 		return destination;
149 | 	}
150 | 
151 | 	if (not AssumeSameAlignmentWithinWord) {
152 | 		auto source_misalignment_in_bytes = detail::misalignment_extent(source);
153 | 		auto destination_misalignment_in_bytes = detail::misalignment_extent(destination);
154 | 		if (source_misalignment_in_bytes != destination_misalignment_in_bytes) {
155 | 			// Since the alignments don't match, any read-and-write operation pair
156 | 			// will be unaligned - unless we work on individual bytes.
157 | 
158 | 			if (num_elements_to_copy > 0) {
159 | 				::memcpy(destination, source, num_elements_to_copy * sizeof(T));
160 | 			}
161 | 			return destination;
162 | 			// ... but actually the above claim is not true for the case of a 2-byte size_mod:
163 | 			// if the alignments are, say, 0 and 2, or 3 and 1, then we can at least use a
164 | 			// loop of 2-byte copies. TODO: Implement that.
165 | 		}
166 | 	}
167 | 
168 | 	if (num_elements_to_copy == 0) {
169 | 		return destination;
170 | 	}
171 | 
172 | 	constexpr const auto size_mod_in_bytes { sizeof(T) % sizeof(native_word_t) };
173 | 	constexpr const auto size_gcd_of_T_and_native_word {
174 | 		size_mod_in_bytes == 0 ? 4 : (size_mod_in_bytes == 2 ? 2 : 1)
175 | 	}; // i.e. the GCD of sizeof(T) and the 4-byte native word size
176 | 	using copy_unit_type = detail::uint_t<size_gcd_of_T_and_native_word>;
177 | 	auto num_copy_unit_elements_to_copy = num_elements_to_copy * sizeof(T) / sizeof(copy_unit_type);
178 | 
179 | 	detail::copy(
180 | 		reinterpret_cast<copy_unit_type*>(destination),
181 | 		reinterpret_cast<const copy_unit_type*>(source),
182 | 		num_copy_unit_elements_to_copy
183 | 	);
184 | 	return destination;
185 | }
186 | 
187 | /**
188 |  * @brief Return the number of full warps in a linear grid
189 |  * which would, overall, contain at least a given number of threads.
190 |  *
191 |  * @note This comes in handy more times than you might expect, even in device-side code.
192 |  *
193 |  * @note the reason this function is defined directly, rather than using
194 |  * the functions in math or constexpr_math, is that bit-counting is
195 |  * either slow at run-time on the GPU when you use the constexpr way of
196 |  * doing it, or not constexpr if you use the GPU-side population count
197 |  * instruction.
198 |  */
199 | template <typename I>
200 | constexpr KAT_FHD I num_warp_sizes_to_cover(I number_of_threads)
201 | {
202 | 	static_assert(std::is_integral<I>::value, "Number of threads specified using a non-integral type");
203 | 	enum : I { mask = (warp_size - 1) };
204 | 	enum : I { log_warp_size = 5 } ;
205 | 	return (number_of_threads >> log_warp_size) + ((number_of_threads & mask) != 0);
206 | }
207 | 
208 | } // namespace kat
209 | 
210 | #endif // CUDA_KAT_ON_DEVICE_MISCELLANY_CUH_
211 | 
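
A usage sketch for `kat::copy()`; `payload` is a hypothetical trivially-copyable type (sizeof == 8, so the copy proceeds in 4-byte native words):

    struct payload { int a; short b; short c; };

    __global__ void copy_chunks(payload* out, const payload* in, std::size_t chunk_size)
    {
        auto chunk_index = threadIdx.x + blockIdx.x * blockDim.x;
        kat::copy(out + chunk_index * chunk_size, in + chunk_index * chunk_size, chunk_size);
    }
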
-------------------------------------------------------------------------------- /src/kat/on_device/non-builtins.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/non-builtins.cuh
3 |  *
4 |  * @brief Templated, uniformly-named C++ functions wrapping what should
5 |  * have been single PTX instructions - but aren't (in a dedicated `non_builtins` namespace).
6 |  *
7 |  * There are several functions one would expect to compile into single PTX
8 |  * instructions (similar ones _do_ compile into single PTX instructions,
9 |  * and on the CPU, they themselves often translate to a single machine
10 |  * instruction) - but strangely, they do not. Implementations of such functions
11 |  * are found in this file rather than in @ref `on_device/builtins.cuh`; and they
12 |  * get a different namespace to avoid accidental confusion.
13 |  *
14 |  */
15 | #ifndef CUDA_KAT_ON_DEVICE_NON_BUILTINS_CUH_
16 | #define CUDA_KAT_ON_DEVICE_NON_BUILTINS_CUH_
17 | 
18 | #include 
19 | 
20 | 
21 | ///@cond
22 | #include 
23 | ///@endcond
24 | 
25 | namespace kat {
26 | namespace non_builtins {
27 | 
28 | /**
29 |  * @brief Determine the 1-based index of the first non-zero bit in the argument.
30 |  *
31 |  * @param x the value to be considered as a container of bits
32 |  * @return If @p x is 0, returns 0; otherwise, returns the 1-based index of the
33 |  * first non-zero bit in @p x
34 |  */
35 | template <typename I> KAT_FD int find_first_set(I x)
36 | {
37 | 	static_assert(std::is_integral<I>::value, "Only integral types are supported");
38 | 	static_assert(sizeof(I) <= sizeof(long long), "Unexpectedly large type");
39 | 
40 | 	using ffs_type = typename std::conditional< sizeof(I) <= sizeof(int), int, long long >::type;
41 | 	return find_first_set<ffs_type>(x);
42 | }
43 | template <> KAT_FD int find_first_set< int >(int x) { return __ffs(x); }
44 | template <> KAT_FD int find_first_set< long long >(long long x) { return __ffsll(x); }
45 | 
46 | /**
47 |  * @brief counts the number of initial zeros when considering the binary representation
48 |  * of a number from least to most significant digit
49 |  *
50 |  * @tparam FixSemanticsForZero the simpler implementation of this function uses the
51 |  * @ref `find_first_set()` builtin. Unfortunately, with it alone, an input with no
52 |  * bits set yields -1 rather than the type's full number of bits. Fixing this requires
53 |  * a couple of extra instructions. By default, we apply the fix; but one might be
54 |  * interested in just skipping it, taking -1 instead of 32 (= warp_size) for the no-1's case.
55 |  *
56 |  * @param x the number whose binary representation is to be counted
57 |  * @return the number of initial zero bits before the first 1; if x is 0, the full
58 |  * number of bits is returned (or -1, depending on @tparam FixSemanticsForZero).
59 |  */
60 | template <typename I, bool FixSemanticsForZero = true>
61 | KAT_FD int count_trailing_zeros(I x)
62 | {
63 | 	if (FixSemanticsForZero and x == 0) {
64 | 		return size_in_bits<I>();
65 | 	}
66 | 	return find_first_set(x) - 1;
67 | }
68 | 
69 | /**
70 |  * @brief counts the number of initial zeros when considering the binary representation
71 |  * of a number from most to least significant digit
72 |  * @param x the number whose representation is to be counted
73 |  * @return the counted number of 0 bits; if x is 0, the full number of bits is returned
74 |  */
75 | template <typename I> KAT_FD int count_leading_zeros(I x)
76 | {
77 | 	static_assert(std::is_integral<I>::value, "Only integral types are supported");
78 | 	static_assert(sizeof(I) <= sizeof(long long), "Unexpectedly large type");
79 | 
80 | 	using native_clz_type =
81 | 		typename std::conditional< sizeof(I) <= sizeof(int), int, long long >::type;
82 | 	enum : int { width_difference_in_bits = (sizeof(native_clz_type) - sizeof(I)) * CHAR_BIT };
83 | 	return builtins::count_leading_zeros(static_cast<native_clz_type>(x)) - width_difference_in_bits;
84 | }
85 | 
86 | } // namespace non_builtins
87 | } // namespace kat
88 | 
89 | #endif // CUDA_KAT_ON_DEVICE_NON_BUILTINS_CUH_
90 | 
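
The semantics above, on a concrete value (a sketch):

    __device__ void example()
    {
        int f = kat::non_builtins::find_first_set(0x8);       // == 4: 1-based index of the set bit
        int t = kat::non_builtins::count_trailing_zeros(0x8); // == 3
        int z = kat::non_builtins::count_trailing_zeros(0);   // == 32, thanks to the semantics fix
    }
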
-------------------------------------------------------------------------------- /src/kat/on_device/ptx.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/ptx.cuh
3 |  *
4 |  * @brief Wrapper functions for single PTX instructions --- using inline PTX
5 |  * assembly --- which are not already available in the official CUDA includes
6 |  *
7 |  * CUDA provides many "intrinsics" functions, which wrap single PTX instructions,
8 |  * e.g. `__ldg` or `__funnelshift_l` from `sm_32_intrinsics.h`. But - CUDA
9 |  * doesn't provide such functions for all of the PTX instruction set. The
10 |  * files included from this master-include contain such single-line assembly
11 |  * wrapper functions for different categories of missing PTX instructions.
12 |  *
13 |  * @note Unlike @ref `on_device/builtins.cuh`, functions here are not
14 |  * templated, and do not necessarily have the same name for different
15 |  * parameter types. `on_device/builtins.cuh` functions do _use_ PTX wrapper
16 |  * functions as their implementation.
17 |  */
18 | 
19 | #pragma once
20 | #ifndef CUDA_KAT_ON_DEVICE_PTX_CUH_
21 | #define CUDA_KAT_ON_DEVICE_PTX_CUH_
22 | 
23 | #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 300)
24 | #error "This code can only target devices of compute capability 3.0 or higher."
25 | #endif
26 | 
27 | namespace kat {
28 | 
29 | /**
30 |  * @brief Code exposing CUDA's PTX intermediate representation instructions
31 |  * to C++ code.
32 |  *
33 |  * With CUDA, device-side code is compiled from a C++-like language to an
34 |  * intermediate representation (IR), which is not supported directly by any
35 |  * GPU, but from which it is easy to compile code for each concrete GPU
36 |  * architecture.
37 |  *
38 |  * Occasionally, a developer wants to use a specific PTX instruction - e.g.
39 |  * to optimize some code. CUDA's headers expose some of the opcodes for these
40 |  * instructions - but not all of them. Also, the exposed instructions are
41 |  * not templated on the arguments - while PTX instructions _are_ thus
42 |  * templated. These two gaps are filled by this library.
43 |  */
44 | namespace ptx { }
45 | 
46 | } // namespace kat
47 | 
48 | #include "ptx/special_registers.cuh"
49 | #include "ptx/miscellany.cuh"
50 | #include "ptx/video_instructions.cuh"
51 | 
52 | #endif // CUDA_KAT_ON_DEVICE_PTX_CUH_
53 | 
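
With the master-include in place, device code can invoke the wrappers directly; e.g. (a sketch - `bfind()` is defined in ptx/miscellany.cuh below):

    __device__ uint32_t index_of_highest_set_bit(uint32_t x)
    {
        return kat::ptx::bfind(x); // 0xFFFFFFFF if x has no set bits
    }
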
-------------------------------------------------------------------------------- /src/kat/on_device/ptx/detail/define_macros.cuh: --------------------------------------------------------------------------------
1 | /*
2 |  * Notes:
3 |  *
4 |  * - Prefer including ../ptx.cuh rather than this file directly
5 |  * - Including this file "pollutes" the rest of your code with preprocessor
6 |  *   macros you may not want. To get rid of them, include
7 |  *   detail/undefine_macros.cuh afterwards
8 |  */
9 | 
10 | #ifndef PTX_UTILITY_MACROS_DEFINED
11 | #define PTX_UTILITY_MACROS_DEFINED
12 | 
13 | #include <stdint.h> // for uintXX_t types
14 | 
15 | #define PTX_STRINGIFY(_q) #_q
16 | 
17 | // Mnemonic: "h" for half, "r" for regular, "l" for long, "f" and "d" for float and double
18 | #define SIZE_CONSTRAINT_s16 "h"
19 | #define SIZE_CONSTRAINT_u16 "h"
20 | #define SIZE_CONSTRAINT_s32 "r"
21 | #define SIZE_CONSTRAINT_u32 "r"
22 | #define SIZE_CONSTRAINT_s64 "l"
23 | #define SIZE_CONSTRAINT_u64 "l"
24 | #define SIZE_CONSTRAINT_f32 "f"
25 | #define SIZE_CONSTRAINT_f64 "d"
26 | 
27 | /*
28 |  * In PTX inline assembly, every variable name must be preceded by a string indicating its size.
29 |  * Why, you might ask - if the variable _has_ a size, which the compiler knows? Just because.
30 |  * This maps your PTX-style type to its appropriate size indicator string
31 |  */
32 | #define SIZE_CONSTRAINT(ptx_value_type) SIZE_CONSTRAINT_ ## ptx_value_type
33 | 
34 | /*
35 |  * Always use this as (part of) the
36 |  * constraint string for pointer arguments to PTX inline assembly instructions
37 |  * (see http://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints)
38 |  */
39 | #if defined(_WIN64) || defined(__LP64__)
40 | #define PTR_SIZE_CONSTRAINT SIZE_CONSTRAINT(u64)
41 | #else
42 | #define PTR_SIZE_CONSTRAINT SIZE_CONSTRAINT(u32)
43 | #endif
44 | 
45 | #define CPP_TYPE_BY_PTX_TYPE_s16 int16_t
46 | #define CPP_TYPE_BY_PTX_TYPE_s32 int32_t
47 | #define CPP_TYPE_BY_PTX_TYPE_s64 int64_t
48 | #define CPP_TYPE_BY_PTX_TYPE_u16 uint16_t
49 | #define CPP_TYPE_BY_PTX_TYPE_u32 uint32_t
50 | #define CPP_TYPE_BY_PTX_TYPE_u64 uint64_t
51 | #define CPP_TYPE_BY_PTX_TYPE_f32 float
52 | #define CPP_TYPE_BY_PTX_TYPE_f64 double
53 | 
54 | /*
55 |  * In our PTX wrappers, we need to declare function parameters and local variables
56 |  * based on PTX-style types; this is a mechanism for obtaining the corresponding C++
57 |  * type (at the preprocessor level).
58 |  */
59 | #define CPP_TYPE_BY_PTX_TYPE(ptx_value_type) CPP_TYPE_BY_PTX_TYPE_ ## ptx_value_type
60 | 
61 | #define MAKE_UNSIGNED_s16 u16
62 | #define MAKE_UNSIGNED_s32 u32
63 | #define MAKE_UNSIGNED_s64 u64
64 | #define MAKE_UNSIGNED_u16 u16
65 | #define MAKE_UNSIGNED_u32 u32
66 | #define MAKE_UNSIGNED_u64 u64
67 | 
68 | /*
69 |  * This converts specifiers of signed PTX types into their unsigned equivalents,
70 |  * textually.
71 |  */
72 | #define MAKE_UNSIGNED(ptx_value_type) MAKE_UNSIGNED_ ## ptx_value_type
73 | 
74 | #endif // PTX_UTILITY_MACROS_DEFINED
75 | 
-------------------------------------------------------------------------------- /src/kat/on_device/ptx/detail/undefine_macros.cuh: --------------------------------------------------------------------------------
1 | /*
2 |  * Notes:
3 |  *
4 |  * - Try not to use this file directly, but rather just ../ptx.cuh
5 |  * - If you are using this file directly - include it after having
6 |  *   included define_macros.cuh - and any other include files which
7 |  *   use these utilities.
8 |  */
9 | 
10 | #ifdef PTX_UTILITY_MACROS_DEFINED
11 | 
12 | #undef PTR_SIZE_CONSTRAINT
13 | #undef CPP_TYPE_BY_PTX_TYPE
14 | #undef CPP_TYPE_BY_PTX_TYPE_s16
15 | #undef CPP_TYPE_BY_PTX_TYPE_s32
16 | #undef CPP_TYPE_BY_PTX_TYPE_s64
17 | #undef CPP_TYPE_BY_PTX_TYPE_u16
18 | #undef CPP_TYPE_BY_PTX_TYPE_u32
19 | #undef CPP_TYPE_BY_PTX_TYPE_u64
20 | #undef CPP_TYPE_BY_PTX_TYPE_f32
21 | #undef CPP_TYPE_BY_PTX_TYPE_f64
22 | #undef SIZE_CONSTRAINT
23 | #undef SIZE_CONSTRAINT_s16
24 | #undef SIZE_CONSTRAINT_s32
25 | #undef SIZE_CONSTRAINT_s64
26 | #undef SIZE_CONSTRAINT_u16
27 | #undef SIZE_CONSTRAINT_u32
28 | #undef SIZE_CONSTRAINT_u64
29 | #undef SIZE_CONSTRAINT_f32
30 | #undef SIZE_CONSTRAINT_f64
31 | #undef MAKE_UNSIGNED
32 | #undef MAKE_UNSIGNED_s16
33 | #undef MAKE_UNSIGNED_s32
34 | #undef MAKE_UNSIGNED_s64
35 | #undef MAKE_UNSIGNED_u16
36 | #undef MAKE_UNSIGNED_u32
37 | #undef MAKE_UNSIGNED_u64
38 | 
39 | #undef PTX_STRINGIFY
40 | 
41 | #undef PTX_UTILITY_MACROS_DEFINED
42 | 
43 | #endif // PTX_UTILITY_MACROS_DEFINED
-------------------------------------------------------------------------------- /src/kat/on_device/ptx/miscellany.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file ptx/miscellany.cuh Non-templated wrappers for PTX instructions, which nVIDIA
3 |  * does not provide wrappers for through the CUDA intrinsics headers.
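 *
 * All of the wrappers below follow one pattern, built from the macros in
 * detail/define_macros.cuh; as a minimal sketch of that pattern (using the real
 * PTX instruction `brev`, bit reversal, purely for illustration):
 * @code
 * KAT_FD uint32_t bit_reverse(uint32_t x)
 * {
 *     uint32_t result;
 *     asm("brev.b32 %0, %1;" : "=" SIZE_CONSTRAINT(u32) (result) : SIZE_CONSTRAINT(u32) (x));
 *     return result;
 * }
 * @endcode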
4 | */ 5 | #pragma once 6 | #ifndef CUDA_KAT_ON_DEVICE_PTX_MISCELLANY_CUH_ 7 | #define CUDA_KAT_ON_DEVICE_PTX_MISCELLANY_CUH_ 8 | 9 | #include "detail/define_macros.cuh" 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | ///@cond 18 | #include 19 | ///@endcond 20 | 21 | namespace kat { 22 | 23 | namespace ptx { 24 | 25 | /** 26 | * @brief Aborts execution (of the entire kernel grid) and generates an interrupt to the host CPU. 27 | */ 28 | KAT_FD void trap() 29 | { 30 | asm("trap;"); 31 | } 32 | 33 | /** 34 | * Ends execution of the current thread of this kernel/grid 35 | */ 36 | KAT_FD void exit() 37 | { 38 | asm("exit;"); 39 | } 40 | 41 | /** 42 | * See relevant section 43 | * of the CUDA PTX reference for details on these instructions. 44 | */ 45 | #define DEFINE_IS_IN_MEMORY_SPACE(_which_space) \ 46 | KAT_FD int32_t is_in_ ## _which_space ## _memory (const void *ptr) \ 47 | { \ 48 | int32_t result; \ 49 | asm ("{\n\t" \ 50 | ".reg .pred p;\n\t" \ 51 | "isspacep." PTX_STRINGIFY(_which_space) " p, %1;\n\t" \ 52 | "selp.b32 %0, 1, 0, p;\n\t" \ 53 | "}" \ 54 | : "=r"(result) : PTR_SIZE_CONSTRAINT(ptr)); \ 55 | return result; \ 56 | } 57 | 58 | DEFINE_IS_IN_MEMORY_SPACE(const) // is_in_const_memory 59 | DEFINE_IS_IN_MEMORY_SPACE(global) // is_in_global_memory 60 | DEFINE_IS_IN_MEMORY_SPACE(local) // is_in_local_memory 61 | DEFINE_IS_IN_MEMORY_SPACE(shared) // is_in_shared_memory 62 | 63 | #undef DEFINE_IS_IN_MEMORY_SPACE 64 | 65 | /* 66 | * @brief Find the last non-sign bit in a signed or an unsigned integer value 67 | * 68 | * @note See relevant section 69 | * of the CUDA PTX reference for details on this instruction. 70 | * 71 | * @param val the value in which to find non-sign bits 72 | * @return the bit index (counting from least significant bit being 0) of the first 73 | * bit which is 0 if @p val is positive, or of the first bit which is 1 if @p val is negative. If @p val has only 74 | * sign bits (i.e. if it's 0 or if its type is signed and its bits are all 1) - the value 0xFFFFFFFF (-1) is returned 75 | */ 76 | 77 | #define DEFINE_BFIND(ptx_type) \ 78 | KAT_FD uint32_t \ 79 | bfind(CPP_TYPE_BY_PTX_TYPE(ptx_type) val) \ 80 | { \ 81 | uint32_t ret; \ 82 | asm ( \ 83 | "bfind." PTX_STRINGIFY(ptx_type) " %0, %1;" \ 84 | : "=r"(ret) : SIZE_CONSTRAINT(ptx_type) (val)); \ 85 | return ret; \ 86 | } 87 | 88 | DEFINE_BFIND(s32) // bfind 89 | DEFINE_BFIND(s64) // bfind 90 | DEFINE_BFIND(u32) // bfind 91 | DEFINE_BFIND(u64) // bfind 92 | 93 | #undef DEFINE_BFIND 94 | 95 | #define DEFINE_PRMT_WITH_MODE(selection_mode_name, selection_mode) \ 96 | KAT_FD uint32_t prmt_ ## selection_mode_name (uint32_t first, uint32_t second, uint32_t control_bits) \ 97 | { \ 98 | uint32_t result; \ 99 | asm("prmt.b32." 
PTX_STRINGIFY(selection_mode) " %0, %1, %2, %3;" \
100 | 		: "=r"(result) : "r"(first), "r"(second), "r"(control_bits)); \
101 | 	return result; \
102 | }
103 | 
104 | /*
105 |  * See:
106 |  * @url http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
107 |  * for information about these instructions
108 |  */
109 | DEFINE_PRMT_WITH_MODE( forward_4_extract,  f4e  ) // prmt_forward_4_extract
110 | DEFINE_PRMT_WITH_MODE( backward_4_extract, b4e  ) // prmt_backward_4_extract
111 | DEFINE_PRMT_WITH_MODE( replicate_8,        rc8  ) // prmt_replicate_8
112 | DEFINE_PRMT_WITH_MODE( replicate_16,       rc16 ) // prmt_replicate_16
113 | DEFINE_PRMT_WITH_MODE( edge_clamp_left,    ecl  ) // prmt_edge_clamp_left
114 | DEFINE_PRMT_WITH_MODE( edge_clamp_right,   ecr  ) // prmt_edge_clamp_right
115 | 
116 | 
117 | /**
118 |  * @brief See: the relevant section
119 |  * of the CUDA PTX reference for an explanation of what this does exactly
120 |  *
121 |  * @param first a first value from which to potentially use bytes
122 |  * @param second a second value from which to potentially use bytes
123 |  * @param byte_selectors a packing of 4 selector structures; each selector structure
124 |  * is 3 bits specifying which of the input bytes are to be used (as there are 8
125 |  * bytes overall in @p first and @p second ), and another bit specifying if it's an
126 |  * actual copy of a byte, or instead whether the sign of the byte (interpreted as
127 |  * an int8_t) should be replicated to fill the target byte.
128 |  * @return the four bytes of first and/or second, or replicated signs thereof, indicated by the byte selectors
129 |  *
130 |  * @note Only the lower 16 bits of byte_selectors are used.
131 |  * @note "prmt" stands for "permute"
132 |  */
133 | KAT_FD uint32_t prmt(uint32_t first, uint32_t second, uint32_t byte_selectors)
134 | {
135 | 	uint32_t result;
136 | 	asm("prmt.b32 %0, %1, %2, %3;"
137 | 		: "=r"(result) : "r"(first), "r"(second), "r"(byte_selectors));
138 | 	return result;
139 | }
140 | 
141 | 
142 | /**
143 |  * @brief Extracts the bits with 0-based indices start_pos...start_pos+length-1, counting
144 |  * from least to most significant, from a bit field. Has sign-extension semantics
145 |  * for signed inputs, which are a bit tricky - see the PTX ISA guide:
146 |  *
147 |  * http://docs.nvidia.com/cuda/parallel-thread-execution/index.html
148 |  *
149 |  * TODO: CUB 1.5.2's BFE wrapper seems kind of fishy. Why does Duane Merrill not use PTX for extraction from 64-bit fields?
150 |  * I'll take a different route.
151 |  */
152 | #define DEFINE_BFE(ptx_type) \
153 | KAT_FD CPP_TYPE_BY_PTX_TYPE(ptx_type) \
154 | bfe( \
155 | 	CPP_TYPE_BY_PTX_TYPE(ptx_type)  bits, \
156 | 	uint32_t                        start_position, \
157 | 	uint32_t                        num_bits) \
158 | { \
159 | 	CPP_TYPE_BY_PTX_TYPE(ptx_type)  extracted_bits; \
160 | 	asm ( \
161 | 		"bfe." 
PTX_STRINGIFY(ptx_type) " %0, %1, %2, %3;" \ 162 | : "=" SIZE_CONSTRAINT(ptx_type) (extracted_bits) \ 163 | : SIZE_CONSTRAINT(ptx_type) (bits) \ 164 | , "r" (start_position) \ 165 | , "r" (num_bits) \ 166 | );\ 167 | return extracted_bits; \ 168 | } 169 | 170 | DEFINE_BFE(s32) // bfe 171 | DEFINE_BFE(s64) // bfe 172 | DEFINE_BFE(u32) // bfe 173 | DEFINE_BFE(u64) // bfe 174 | 175 | #undef DEFINE_BFE 176 | 177 | KAT_FD uint32_t 178 | bfi( 179 | uint32_t bits_to_insert, 180 | uint32_t existing_bit_field, 181 | uint32_t start_position, 182 | uint32_t num_bits) 183 | { 184 | uint32_t ret; 185 | asm ( 186 | "bfi.b32 %0, %1, %2, %3, %4;" 187 | : "=r"(ret) 188 | : "r"(bits_to_insert) 189 | , "r"(existing_bit_field) 190 | , "r"(start_position) 191 | , "r"(num_bits) 192 | ); 193 | return ret; 194 | } 195 | 196 | KAT_FD uint64_t 197 | bfi( 198 | uint64_t bits_to_insert, 199 | uint64_t existing_bit_field, 200 | uint32_t start_position, 201 | uint32_t num_bits) 202 | { 203 | uint64_t ret; 204 | asm ( 205 | "bfi.b64 %0, %1, %2, %3, %4;" 206 | : "=l"(ret) 207 | : "l"(bits_to_insert) 208 | , "l"(existing_bit_field) 209 | , "r"(start_position) 210 | , "r"(num_bits) 211 | ); 212 | return ret; 213 | } 214 | 215 | /** 216 | * @brief Adds the absolute difference of two values to a base value 217 | * 218 | * @param x value from which to subtract @p y 219 | * @param y value to subtract from @p x 220 | * @param addend base value to which to add `|x-y|` 221 | * 222 | * @return `addend + |x - y|` 223 | */ 224 | #define DEFINE_SAD(ptx_type_1, unsigned_ptx_type_1) \ 225 | KAT_FD CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) sad( \ 226 | CPP_TYPE_BY_PTX_TYPE(ptx_type_1) x, \ 227 | CPP_TYPE_BY_PTX_TYPE(ptx_type_1) y, \ 228 | CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) addend) \ 229 | { \ 230 | CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) result; \ 231 | asm ( \ 232 | "sad." PTX_STRINGIFY(ptx_type_1) " %0, %1, %2, %3;" \ 233 | : "=" SIZE_CONSTRAINT(unsigned_ptx_type_1) (result) \ 234 | : SIZE_CONSTRAINT(ptx_type_1) (x) \ 235 | , SIZE_CONSTRAINT(ptx_type_1) (y) \ 236 | , SIZE_CONSTRAINT(unsigned_ptx_type_1) (addend) \ 237 | );\ 238 | return result; \ 239 | } 240 | 241 | #define DEFINE_SAD_(x) DEFINE_SAD(x, MAKE_UNSIGNED(x)); 242 | DEFINE_SAD_(u16); 243 | DEFINE_SAD_(u32); 244 | DEFINE_SAD_(u64); 245 | DEFINE_SAD_(s16); 246 | DEFINE_SAD_(s32); 247 | DEFINE_SAD_(s64); 248 | 249 | #undef DEFINE_SAD_ 250 | #undef DEFINE_SAD 251 | 252 | } // namespace ptx 253 | } // namespace kat 254 | 255 | 256 | #include "detail/undefine_macros.cuh" 257 | 258 | #endif // CUDA_KAT_ON_DEVICE_PTX_MISCELLANY_CUH_ 259 | 260 | -------------------------------------------------------------------------------- /src/kat/on_device/ptx/special_registers.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file kat/on_device/ptx/special_registers.cuh 3 | * 4 | * @brief PTX instruction wrapper functions for accessing special on-GPU-core registers. 5 | */ 6 | #pragma once 7 | #ifndef CUDA_KAT_PTX_SPECIAL_REGISTERS_CUH_ 8 | #define CUDA_KAT_PTX_SPECIAL_REGISTERS_CUH_ 9 | 10 | #include "detail/define_macros.cuh" 11 | 12 | namespace kat { 13 | namespace ptx { 14 | 15 | /** 16 | * @brief Wrappers for instructions obtaining the value of one of the special hardware registers on nVIDIA GPUs. 17 | * 18 | * See the relevant section 19 | * of the PTX instruction set guide for more details. 
20 |  */
21 | namespace special_registers {
22 | 
23 | 
24 | #define DEFINE_SPECIAL_REGISTER_GETTER(special_register_name, ptx_value_type) \
25 | KAT_FD CPP_TYPE_BY_PTX_TYPE(ptx_value_type) special_register_name() \
26 | { \
27 | 	CPP_TYPE_BY_PTX_TYPE(ptx_value_type) ret; \
28 | 	asm volatile ("mov." PTX_STRINGIFY(ptx_value_type) " %0, %%" PTX_STRINGIFY(special_register_name) ";" : "=" SIZE_CONSTRAINT(ptx_value_type) (ret)); \
29 | 	return ret; \
30 | } \
31 | 
32 | DEFINE_SPECIAL_REGISTER_GETTER( laneid,             u32); // PTX 1.3
33 | DEFINE_SPECIAL_REGISTER_GETTER( gridid,             u64); // PTX 3.0
34 | DEFINE_SPECIAL_REGISTER_GETTER( smid,               u32); // PTX 1.3
35 | DEFINE_SPECIAL_REGISTER_GETTER( nsmid,              u32); // PTX 2.0
36 | DEFINE_SPECIAL_REGISTER_GETTER( clock,              u32); // PTX 1.0
37 | DEFINE_SPECIAL_REGISTER_GETTER( clock_hi,           u32); // PTX 5.0
38 | DEFINE_SPECIAL_REGISTER_GETTER( clock64,            u64); // PTX 2.0
39 | DEFINE_SPECIAL_REGISTER_GETTER( globaltimer_hi,     u32); // PTX 3.1
40 | DEFINE_SPECIAL_REGISTER_GETTER( globaltimer_lo,     u32); // PTX 3.1
41 | DEFINE_SPECIAL_REGISTER_GETTER( globaltimer,        u64); // PTX 3.1
42 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_lt,        u32); // PTX 2.0
43 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_le,        u32); // PTX 2.0
44 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_eq,        u32); // PTX 2.0
45 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_ge,        u32); // PTX 2.0
46 | DEFINE_SPECIAL_REGISTER_GETTER( lanemask_gt,        u32); // PTX 2.0
47 | DEFINE_SPECIAL_REGISTER_GETTER( dynamic_smem_size,  u32); // PTX 4.1
48 | DEFINE_SPECIAL_REGISTER_GETTER( total_smem_size,    u32); // PTX 4.1
49 | 
50 | #undef DEFINE_SPECIAL_REGISTER_GETTER
51 | 
52 | 
53 | /*
54 |  * Not defining getters for:
55 |  *
56 |  *   %tid                      - available as threadIdx
57 |  *   %ntid                     - available as blockDim
58 |  *   %warpid                   - not interesting
59 |  *   %nwarpid                  - not interesting
60 |  *   %ctaid                    - available as blockIdx
61 |  *   %nctaid                   - available as gridDim
62 |  *   %pm0, ..., %pm7           - not interesting, for now (performance monitoring)
63 |  *   %pm0_64, ..., %pm7_64     - not interesting, for now (performance monitoring)
64 |  *   %envreg0, ..., %envreg31  - not interesting, for now
65 |  */
66 | 
67 | 
68 | } // namespace special_registers
69 | 
70 | } // namespace ptx
71 | 
72 | } // namespace kat
73 | 
74 | #include "detail/undefine_macros.cuh"
75 | 
76 | 
77 | #endif // CUDA_KAT_PTX_SPECIAL_REGISTERS_CUH_
78 | 
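
A sketch of the kind of idiom these getters enable (function name illustrative only):

    __device__ unsigned num_preceding_active_lanes(unsigned active_mask)
    {
        // Count the active lanes in this warp whose index is lower than ours
        return __popc(active_mask & kat::ptx::special_registers::lanemask_lt());
    }
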
-------------------------------------------------------------------------------- /src/kat/on_device/ptx/video_instructions.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file ptx/video_instructions.cuh Non-templated wrappers for PTX "video"
3 |  * instructions, which nVIDIA does not provide wrappers for through the CUDA
4 |  * intrinsics headers
5 |  *
6 |  * "Video" instructions are not really about video (although they're probably used
7 |  * for video somehow). Essentially they're instructions which combine another
8 |  * operation, and another operand, after the main one; additionally, they offer
9 |  * variants with all sorts of saturation, wraparound, sign-extension and similar
10 |  * bells and whistles.
11 |  *
12 |  * These instructions (at least, the "scalar" ones) are:
13 |  *
14 |  *
15 |  *   vadd     - addition
16 |  *   vsub     - subtraction
17 |  *   vabsdiff - absolute difference
18 |  *   vmin     - minimum
19 |  *   vmax     - maximum
20 |  *   vshl     - shift left
21 |  *   vshr     - shift right
22 |  *   vmad     - multiply-and-add
23 |  *   vset     - comparison
24 |  *
25 |  * For now, we won't implement most of these instructions, and even for the ones
26 |  * we do implement - we'll only choose some of the variants.
27 |  */
28 | #pragma once
29 | #ifndef CUDA_KAT_PTX_VIDEO_INSTRUCTIONS_CUH_
30 | #define CUDA_KAT_PTX_VIDEO_INSTRUCTIONS_CUH_
31 | 
32 | #include "detail/define_macros.cuh"
33 | #include 
34 | #include 
35 | 
36 | 
37 | ///@cond
38 | #include 
39 | ///@endcond
40 | 
41 | namespace kat {
42 | namespace ptx {
43 | 
44 | /**
45 |  * @brief bit shift, then apply a binary operator.
46 |  *
47 |  */
48 | #define DEFINE_SHIFT_AND_OP(direction, second_op) \
49 | KAT_FD uint32_t \
50 | vsh##direction##_##second_op ( \
51 | 	uint32_t x, \
52 | 	uint32_t shift_amount, \
53 | 	uint32_t extra_operand) \
54 | { \
55 | 	uint32_t ret; \
56 | 	asm ("vsh" PTX_STRINGIFY(direction) ".u32.u32.u32.clamp." PTX_STRINGIFY(second_op) " %0, %1, %2, %3;" \
57 | 		: "=r"(ret) \
58 | 		: "r"(x) \
59 | 		, "r"(shift_amount) \
60 | 		, "r"(extra_operand) \
61 | 	); \
62 | 	return ret; \
63 | }
64 | 
65 | DEFINE_SHIFT_AND_OP(l,add) // vshl_add
66 | DEFINE_SHIFT_AND_OP(l,min) // vshl_min
67 | DEFINE_SHIFT_AND_OP(l,max) // vshl_max
68 | DEFINE_SHIFT_AND_OP(r,add) // vshr_add
69 | DEFINE_SHIFT_AND_OP(r,min) // vshr_min
70 | DEFINE_SHIFT_AND_OP(r,max) // vshr_max
71 | 
72 | 
73 | } // namespace ptx
74 | } // namespace kat
75 | 
76 | 
77 | #include "detail/undefine_macros.cuh"
78 | 
79 | #endif // CUDA_KAT_PTX_VIDEO_INSTRUCTIONS_CUH_
80 | 
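
What a wrapper like `vshl_add()` buys you, sketched (the `.clamp` variant clamps an oversized shift amount rather than wrapping it; function name illustrative only):

    __device__ uint32_t scale_and_offset(uint32_t x, uint32_t log_scale, uint32_t offset)
    {
        return kat::ptx::vshl_add(x, log_scale, offset); // (x << log_scale) + offset, in one instruction
    }
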
-------------------------------------------------------------------------------- /src/kat/on_device/sequence_ops/common.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/sequence_ops/common.cuh
3 |  *
4 |  * @brief Some common definitions for all on-device collaborative sequence operations
5 |  */
6 | 
7 | #ifndef CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_COMMON_CUH_
8 | #define CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_COMMON_CUH_
9 | 
10 | #include 
11 | #include 
12 | 
13 | namespace kat {
14 | namespace collaborative {
15 | 
16 | enum inclusivity_t : bool {
17 | 	Exclusive = false,
18 | 	Inclusive = true
19 | };
20 | 
21 | namespace detail {
22 | 
23 | /**
24 |  * In a "full warp write", we want each lane to write an integral number of
25 |  * native words (at the moment, and for the foreseeable future, 4-byte integers).
26 |  * At the same time, the lane writes complete elements of type T, not arbitrary
27 |  * sequences of `sizeof(native_word_t)` bytes - hence this definition.
28 |  *
29 |  * @todo: Can't we assume that T is a POD type, and just have lanes not write
30 |  * complete T's?
31 |  */
32 | template <typename T>
33 | struct elements_per_lane_in_full_warp_write {
34 | 	enum { value = sizeof(native_word_t) / constexpr_::gcd(sizeof(native_word_t),sizeof(T)) };
35 | };
36 | } // namespace detail
37 | 
38 | } // namespace collaborative
39 | } // namespace kat
40 | 
41 | #endif // CUDA_KAT_ON_DEVICE_SEQUENCE_OPS_COMMON_CUH_
42 | 
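
Worked through: with 4-byte native words, a 2-byte T gives gcd(4,2) == 2, so each lane handles 4/2 == 2 elements per full-warp write; a 4-byte T gives exactly one. A sketch of the arithmetic (assuming the header is included):

    static_assert(kat::collaborative::detail::elements_per_lane_in_full_warp_write<int16_t>::value == 2, "");
    static_assert(kat::collaborative::detail::elements_per_lane_in_full_warp_write<int32_t>::value == 1, "");
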
-------------------------------------------------------------------------------- /src/kat/on_device/sequence_ops/grid.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/sequence_ops/grid.cuh
3 |  *
4 |  * @brief CUDA device computation grid-level sequence operations, i.e. those involving
5 |  * interaction of threads from different blocks in the grid
6 |  *
7 |  */
8 | 
9 | #pragma once
10 | #ifndef CUDA_KAT_GRID_COLLABORATIVE_SEQUENCE_OPS_CUH_
11 | #define CUDA_KAT_GRID_COLLABORATIVE_SEQUENCE_OPS_CUH_
12 | 
13 | #include "common.cuh"
14 | #include 
15 | #include 
16 | 
17 | ///@cond
18 | #include 
19 | ///@endcond
20 | 
21 | namespace kat {
22 | namespace collaborative {
23 | namespace warp_to_grid {
24 | 
25 | /**
26 |  * Used by multiple warps, in multiple blocks, with each warp having
27 |  * a bunch of data it has obtained, and all warps' data to be
28 |  * chained into a global-memory vector - with no gaps and no
29 |  * overwriting (but not necessarily in the order of warps - just any
30 |  * order.)
31 |  *
32 |  * @note if the input is not 32-byte (sometimes 128-byte) aligned,
33 |  * and more importantly, the output is not 128-byte-aligned,
34 |  * performance will likely degrade due to the need to execute a pair
35 |  * of memory transactions for every single 32 x 4 byte write.
36 |  *
37 |  * @note this must be called by complete warps, with all lanes
38 |  * active and participating. But it does _not_ - for the time
39 |  * being - have to be called by complete blocks.
40 |  *
41 |  * @tparam T the type of data elements being copied
42 |  * @tparam Size must fit any index used into the input or output array;
43 |  * for the general case it would be 64-bit, but this is
44 |  * usable also for when you need 32-bit work (e.g. a 32-bit length
45 |  * output variable).
46 |  * @param global_output The global-memory vector to which fragments are appended
47 |  * @param global_output_length The current length of @p global_output, in elements - updated atomically
48 |  * @param fragment_to_append This warp's data, to be appended as one contiguous stretch
49 |  * @param fragment_length The number of elements in @p fragment_to_append
50 |  */
51 | template <typename T, typename Size>
52 | KAT_FD void append_to_global_memory(
53 | 	T*     __restrict__  global_output,
54 | 	Size*  __restrict__  global_output_length,
55 | 	T*     __restrict__  fragment_to_append,
56 | 	Size                 fragment_length)
57 | {
58 | 	using namespace grid_info;
59 | 	Size previous_output_size = thread::is_first_in_warp() ?
60 | 		atomic::add(global_output_length, fragment_length) : 0;
61 | 	Size offset_to_start_writing_at = collaborative::warp::get_from_first_lane(
62 | 		previous_output_size);
63 | 
64 | 	// Now the (0-based) positions
65 | 	//     previous_output_size ... previous_output_size + fragment_length - 1
66 | 	// are reserved by this warp; nobody else will write there and we don't need
67 | 	// any more atomics
68 | 
69 | 	enum : bool { may_have_slack = true };
70 | 
71 | 	if (detail::elements_per_lane_in_full_warp_write<T>::value > 1) {
72 | 		// We don't have a version of copy which handles unaligned destinations, so
73 | 		warp::detail::naive_copy(global_output + offset_to_start_writing_at,
74 | 			fragment_to_append, fragment_length);
75 | 	}
76 | 	else {
77 | 		warp::copy_n<T, Size, may_have_slack>(
78 | 			global_output + offset_to_start_writing_at,
79 | 			fragment_to_append, fragment_length);
80 | 	}
81 | }
82 | 
83 | } // namespace warp_to_grid
84 | } // namespace collaborative
85 | } // namespace kat
86 | 
87 | #endif // CUDA_KAT_GRID_COLLABORATIVE_SEQUENCE_OPS_CUH_
88 | 
-------------------------------------------------------------------------------- /src/kat/on_device/shared_memory.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/shared_memory.cuh
3 |  *
4 |  * @brief Utility code for working with (mostly dynamic) shared memory
5 |  * in device-side CUDA functions.
6 |  */
7 | 
8 | #ifndef CUDA_KAT_ON_DEVICE_SHARED_MEMORY_CUH_
9 | #define CUDA_KAT_ON_DEVICE_SHARED_MEMORY_CUH_
10 | 
11 | #include "shared_memory/basic.cuh"
12 | #include "shared_memory/operations.cuh"
13 | 
14 | #endif // CUDA_KAT_ON_DEVICE_SHARED_MEMORY_CUH_
15 | 
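
A call-site sketch for `append_to_global_memory()` (warp fragments held in shared memory; the kernel name, sizes and fill logic are all hypothetical):

    enum { warps_per_block = 8, fragment_capacity = 32 };

    __global__ void gather(int* all_results, std::size_t* num_results)
    {
        __shared__ int fragments[warps_per_block][fragment_capacity];
        int* warp_fragment = fragments[threadIdx.x / 32];
        std::size_t fragment_length = fragment_capacity;
        // ... all lanes of the warp collaborate on filling warp_fragment ...
        kat::collaborative::warp_to_grid::append_to_global_memory(
            all_results, num_results, warp_fragment, fragment_length);
    }
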
-------------------------------------------------------------------------------- /src/kat/on_device/shared_memory/basic.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/shared_memory/basic.cuh
3 |  *
4 |  * @brief Simpler / more basic utility code for working with shared memory,
5 |  * not involving any actual computation.
6 |  *
7 |  */
8 | 
9 | #ifndef CUDA_KAT_ON_DEVICE_SHARED_MEMORY_BASIC_CUH_
10 | #define CUDA_KAT_ON_DEVICE_SHARED_MEMORY_BASIC_CUH_
11 | 
12 | #include 
13 | #include 
14 | 
15 | 
16 | ///@cond
17 | #include 
18 | ///@endcond
19 | 
20 | namespace kat {
21 | namespace shared_memory {
22 | 
23 | using offset_t = int;      // Perhaps make it an int32_t ?
24 | using size_t   = unsigned; // Should we make it signed, like ssize_t ?
25 | 
26 | /**
27 |  * @brief Obtain the total size in bytes of the (per-block) shared memory
28 |  * for the running kernel - static + dynamic
29 |  *
30 |  * @note requires special register access, which is not so cheap.
31 |  *
32 |  */
33 | KAT_FD size_t size() {
34 | 	return ptx::special_registers::total_smem_size();
35 | }
36 | 
37 | namespace static_ {
38 | 
39 | /**
40 |  * @brief Obtain the size in bytes of the (per-block) static shared memory
41 |  * for the running kernel.
42 |  *
43 |  * @note requires special register access, which is not so cheap.
44 |  */
45 | KAT_FD size_t size() {
46 | 	return
47 | 		ptx::special_registers::total_smem_size() -
48 | 		ptx::special_registers::dynamic_smem_size();
49 | }
50 | 
51 | } // namespace static_
52 | 
53 | namespace dynamic {
54 | 
55 | /**
56 |  * @brief Obtain the size of the (per-block) dynamic shared memory for
57 |  * the running kernel
58 |  *
59 |  * @note without a template parameter, returns the size in bytes
60 |  * @note requires special register access, which is not so cheap.
61 |  */
62 | template <typename T = unsigned char>
63 | KAT_FD size_t size() {
64 | 	return ptx::special_registers::dynamic_smem_size() / sizeof(T);
65 | }
66 | 
67 | /**
68 |  * This gadget is necessary for using dynamically-sized shared memory in
69 |  * templated kernels (i.e. shared memory whose size is set by the launch
70 |  * parameters rather than being fixed at compile time). Use of such
71 |  * memory requires an `extern __shared__` unspecified-size array variable;
72 |  * however, the way nvcc works, you cannot declare two such variables of
73 |  * different types in your program - even if they're in different scopes.
74 |  * That means we either need to have a different variable name for each
75 |  * type (which would lead us into preprocessor macro hell), or - just
76 |  * use the same type, and reinterpret according to the type we want...
77 |  * which is what this gadget does.
78 |  *
79 |  * @note all threads would get the same address when calling this function,
80 |  * so you would need to add different offsets for different threads if
81 |  * you want a warp-specific or thread-specific pointer.
82 |  *
83 |  * @note see also https://stackoverflow.com/questions/27570552/
84 |  */
85 | template <typename T>
86 | KAT_DEV T* proxy()
87 | {
88 | 	// TODO: Do we need this alignment? Probably not
89 | 	extern __shared__ __align__(1024) unsigned char memory[];
90 | 	return reinterpret_cast<T*>(memory);
91 | }
92 | 
93 | // TODO: It would be nice to get the shared memory as a span; but we
94 | // don't currently have a span in this repository; and both std::span
95 | // and GSL/span do not support CUDA.
96 | 
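/*
 * For illustration - the classic use pattern for proxy(), sketched (the kernel
 * and its launch configuration are hypothetical):
 *
 *     template <typename T>
 *     __global__ void scale(T* data, T factor)
 *     {
 *         T* scratch = kat::shared_memory::dynamic::proxy<T>();
 *         scratch[threadIdx.x] = data[threadIdx.x] * factor;
 *         // ...
 *     }
 *
 *     // host side, with one T per thread of dynamic shared memory:
 *     // scale<<<grid_dims, block_dims, block_dims.x * sizeof(float)>>>(d_data, 2.0f);
 */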
97 | /**
98 |  * @note This namespace's contents are only relevant for linear grids
99 |  */
100 | namespace warp_specific {
101 | 
102 | /**
103 |  * @brief Accesses the calling thread's warp-specific dynamic shared memory -
104 |  * assuming the warps voluntarily divvy up the shared memory beyond some
105 |  * point amongst themselves into contiguous areas.
106 |  *
107 |  * The partitioning pattern is for each warp to get a contiguous sequence
108 |  * of elements in memory.
109 |  *
110 |  * @tparam T the element type assumed for all shared memory (or at least for
111 |  * alignment and for the warp-specific shared memory)
112 |  * @param base_offset How far into the block's overall shared memory to
113 |  * start partitioning the memory into warp-specific sequences
114 |  * @param num_elements_per_warp Size in elements of the area agreed to
115 |  * be specific to each warp
116 |  * @return Address of the first warp-specific element in shared memory
117 |  */
118 | template <typename T>
119 | KAT_FD T* contiguous(unsigned num_elements_per_warp, offset_t base_offset = 0)
120 | {
121 | 	return proxy<T>() + base_offset +
122 | 		num_elements_per_warp * linear_grid::grid_info::warp::index_in_block();
123 | }
124 | 
125 | /**
126 |  * @brief Accesses the calling thread's warp-specific dynamic shared memory -
127 |  * assuming the warps voluntarily divvy up the shared memory beyond some
128 |  * point amongst themselves, using striding.
129 |  *
130 |  * The partitioning pattern is for each warp to get elements at a fixed
131 |  * stride rather than a contiguous set of elements; this pattern ensures
132 |  * that different warps are never in a bank conflict when accessing their
133 |  * "private" shared memory - provided the number of warps divides 32, or is a
134 |  * multiple of 32. The downside of this pattern is that different lanes accessing
135 |  * different elements in a warp's shared memory will likely be in bank conflict
136 |  * (and certainly be in conflict if there are 32 warps).
137 |  *
138 |  * @tparam T the element type assumed for all shared memory (or at least for
139 |  * alignment and for the warp-specific shared memory)
140 |  * @param base_offset How far into the block's overall shared memory to
141 |  * start partitioning the memory into warp-specific sequences
142 |  * @return Address of the first warp-specific element in shared memory
143 |  */
144 | template <typename T>
145 | KAT_FD T* strided(offset_t base_offset = 0)
146 | {
147 | 	return proxy<T>() + base_offset + linear_grid::grid_info::warp::index_in_block();
148 | }
149 | 
150 | } // namespace warp_specific
151 | 
152 | } // namespace dynamic
153 | } // namespace shared_memory
154 | } // namespace kat
155 | 
156 | #endif // CUDA_KAT_ON_DEVICE_SHARED_MEMORY_BASIC_CUH_
157 | 
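
The difference between the two partitionings, sketched for a block with W warps and a per-warp area of n elements: `contiguous<T>(n)` gives warp w the elements [w*n, (w+1)*n), while `strided<T>()` gives it elements w, w + W, w + 2W, and so on. For example (kernel hypothetical, 1D block assumed):

    __global__ void per_warp_accumulators()
    {
        // one accumulator per warp, in the contiguous layout:
        unsigned* acc = kat::shared_memory::dynamic::warp_specific::contiguous<unsigned>(1);
        if (threadIdx.x % 32 == 0) { *acc = 0; }
        // ...
    }
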
-------------------------------------------------------------------------------- /src/kat/on_device/shared_memory/operations.cuh: --------------------------------------------------------------------------------
1 | /**
2 |  * @file on_device/shared_memory/operations.cuh
3 |  *
4 |  * @brief Some basic operations on shared memory (using the library's general
5 |  * computational primitives)
6 |  *
7 |  */
8 | 
9 | #pragma once
10 | #ifndef CUDA_KAT_SHARED_MEMORY_OPS_CUH
11 | #define CUDA_KAT_SHARED_MEMORY_OPS_CUH
12 | 
13 | #include 
14 | #include 
15 | 
16 | 
17 | ///@cond
18 | #include 
19 | ///@endcond
20 | 
21 | namespace kat {
22 | namespace linear_grid {
23 | namespace shared_memory {
24 | 
25 | using kat::shared_memory::size_t;
26 | 
27 | namespace dynamic {
28 | 
29 | using kat::shared_memory::dynamic::proxy;
30 | using kat::shared_memory::dynamic::size;
31 | 
32 | /**
33 |  * @brief Collaboratively fill the block's dynamic shared memory with a fixed
34 |  * value, up to a certain point
35 |  *
36 |  * @tparam T the element type which the block's shared memory is presumed to have
37 |  * @param value each element of the block's dynamic shared memory will be
38 |  * set to this value
39 |  * @param length the number of T elements to set to @p value
40 |  */
41 | template <typename T>
42 | KAT_FD void fill(
43 | 	const T&               value,
44 | 	shared_memory::size_t  length)
45 | {
46 | 	T tmp = value;
47 | 	kat::linear_grid::collaborative::block::fill_n(shared_memory::dynamic::proxy<T>(), tmp, length);
48 | }
49 | 
50 | /**
51 |  * @brief Collaboratively fill the block's dynamic shared memory with a fixed value.
52 |  *
53 |  * @tparam T the element type which the block's shared memory is presumed to have
54 |  * @param value each element of the block's dynamic shared memory will be
55 |  * set to this value
56 |  *
57 |  * @note This variant of `fill()` pays a small "penalty" for determining
58 |  * the size of the shared memory by itself, since it must access a
59 |  * typically-unused special register for this purpose. If you can, prefer
60 |  * passing a length yourself.
61 |  */
62 | template <typename T>
63 | KAT_FD void fill(const T& value)
64 | {
65 | 	auto length = shared_memory::dynamic::size<T>();
66 | 	return fill(value, length);
67 | }
68 | 
69 | /**
70 |  * @brief Collaboratively zero-out the block's dynamic shared memory, up to a
71 |  * certain point
72 |  *
73 |  * @tparam T the element type which the block's shared memory is presumed to have
74 |  * @param length the number of T elements to set to zero
75 |  */
76 | template <typename T>
77 | KAT_FD void zero(kat::shared_memory::size_t length)
78 | {
79 | 	return fill(T{0}, length);
80 | }
81 | 
82 | /**
83 |  * @brief Collaboratively zero-out the block's dynamic shared memory
84 |  *
85 |  * @tparam T the element type which the block's shared memory is presumed to have
86 |  */
87 | template <typename T>
88 | KAT_FD void zero()
89 | {
90 | 	auto length = shared_memory::dynamic::size<T>();
91 | 	return zero<T>(length);
92 | }
93 | 
94 | 
95 | /**
96 |  * Sets the (beginning of the dynamic) shared memory of the block
97 |  * to a copy of some area of device memory.
98 |  *
99 |  * @param[in] source Data in global memory (_not_ anywhere
100 |  * else in shared memory! That breaks the {@code __restrict__}
101 |  * restriction) which we wish to have in shared memory
102 |  * @param[in] length length of the area to copy; must be
103 |  * no larger than the available length (in T's) of shared
104 |  * memory
105 |  * @return the beginning of the block's shared memory -
106 |  * which now contains a copy of the data at @p source.
107 |  *
108 |  * @note length is not checked to be valid - it is up to
109 |  * the caller to refrain from trying to copy too much
110 |  * into the shared memory; use @ref size() if you need to
111 |  * determine the available capacity
112 |  */
113 | template <typename T>
114 | KAT_FD T* __restrict__ set_to_copy_of(const T* source, shared_memory::size_t length)
115 | {
116 | 	T* __restrict__ data_in_shared_mem = shared_memory::dynamic::proxy<T>();
117 | 	kat::linear_grid::collaborative::block::copy(data_in_shared_mem, source, length);
118 | 	return data_in_shared_mem;
119 | }
120 | 
121 | } // namespace dynamic
122 | } // namespace shared_memory
123 | } // namespace linear_grid
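
Usage sketch for the linear-grid variants above (the entire block takes part; kernel name hypothetical):

    __global__ void histogram_pass(/* ... */)
    {
        namespace dyn_shmem = kat::linear_grid::shared_memory::dynamic;
        dyn_shmem::zero<unsigned>(); // all of dynamic shared memory, viewed as unsigned bins
        __syncthreads();
        // ... accumulate into dyn_shmem::proxy<unsigned>() ...
    }
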
160 | */
161 | template <typename T>
162 | KAT_FD void fill(const T& value)
163 | {
164 | auto length = shared_memory::dynamic::size();
165 | return fill(value, length);
166 | }
167 | 
168 | /**
169 | * @brief Collaboratively zero-out the block's dynamic shared memory, up to a
170 | * certain point
171 | *
172 | * @tparam T the element type which the block's shared memory is presumed to have
173 | * @param length the number of T elements to set to zero
174 | */
175 | template <typename T>
176 | KAT_FD void zero(kat::shared_memory::size_t length)
177 | {
178 | return fill(T{0}, length);
179 | }
180 | 
181 | /**
182 | * @brief Collaboratively zero-out the block's dynamic shared memory
183 | *
184 | * @tparam T the element type which the block's shared memory is presumed to have
185 | */
186 | template <typename T>
187 | KAT_FD void zero()
188 | {
189 | auto length = shared_memory::dynamic::size();
190 | return zero<T>(length);
191 | }
192 | 
193 | /**
194 | * Sets the (beginning of the dynamic) shared memory of the block
195 | * to a copy of some area of device memory.
196 | *
197 | * @param[in] source Data in global memory (_not_ anywhere
198 | * else in shared memory! That breaks the {@code __restrict__}
199 | * restriction) which we wish to have in shared memory
200 | * @param[in] length length of the area to copy; must be
201 | * no larger than the available length (in T's) of shared
202 | * memory
203 | * @return the beginning of the block's shared memory -
204 | * which now contains a copy of the data at @p source.
205 | *
206 | * @note length is not checked to be valid - it is up to
207 | * the caller to refrain from trying to copy too much
208 | * into the shared memory.
209 | *
210 | * @note Not implemented yet - need non-linear-grid variants of
211 | * some of the block primitives.
212 | */
213 | template <typename T>
214 | KAT_FD T* __restrict__ set_to_copy_of(const T* source, shared_memory::size_t length);
215 | // TODO: Uncomment when the non-linear-grid block primitive is available
216 | //{
217 | // T* __restrict__ data_in_shared_mem = shared_memory::dynamic::proxy<T>();
218 | // kat::collaborative::block::copy(data_in_shared_mem, source, length);
219 | // return data_in_shared_mem;
220 | //}
221 | 
222 | } // namespace dynamic
223 | } // namespace shared_memory
224 | 
225 | } // namespace kat
226 | 
227 | #endif // CUDA_KAT_SHARED_MEMORY_OPS_CUH
228 | 
-------------------------------------------------------------------------------- /src/kat/on_device/shuffle.cuh: --------------------------------------------------------------------------------
1 | /**
2 | * @file shuffle.cuh Templated warp-shuffle operation variants
3 | */
4 | 
5 | /*
6 | * Originally based on Bryan Catanzaro's CUDA generics
7 | * https://github.com/bryancatanzaro/generics/
8 | * Downloaded on: 2016-04-16
9 | * ... but reimplemented by Eyal Rozenberg, CWI Amsterdam
10 | */
11 | 
12 | #pragma once
13 | #ifndef CUDA_KAT_ON_DEVICE_TEMPLATED_SHUFFLE_CUH_
14 | #define CUDA_KAT_ON_DEVICE_TEMPLATED_SHUFFLE_CUH_
15 | 
16 | #include
17 | 
18 | 
19 | ///@cond
20 | #include
21 | ///@endcond
22 | 
23 | namespace kat {
24 | 
25 | // The functions here can be used to shuffle types as large as you like. Of course,
26 | // if they're not plain-old-data, shuffle at your peril.
27 | 
28 | /**
29 | * @brief Have each lane in a warp get a value from an (arbitrary) other lane.
30 | *
31 | * @tparam T the type of datum to be shared with other lane(s); may be of
32 | * arbitrary size, but (at least for now) must be plain-old-data.
33 | *
34 | * @param t Each lane shares own value, which other lanes can choose
35 | * to receive.
36 | * @param source_lane The lane whose value the current lane wants to get
37 | * @return the @p t value of @p source_lane
38 | */
39 | template <typename T> KAT_FD T shuffle_arbitrary(const T& t, int source_lane);
40 | 
41 | /**
42 | * @param t Each lane shares own value, which a lane with a lower index
43 | * will get.
44 | * @param delta The difference in lane index to the source lane of the new
45 | * value, i.e. a lane with index i gets the new value from lane i + delta.
46 | * @return The @p t value of the lane with index @p delta greater than the calling
47 | * lane's; a lane with a high index, above warp_size - @p delta, has its own @p t
48 | * returned unchanged.
49 | */
50 | template <typename T> KAT_FD T shuffle_down(const T& t, unsigned int delta);
51 | 
52 | /**
53 | * @tparam T the type of datum to be shared with other lane(s); may be of
54 | * arbitrary size, but (at least for now) must be plain-old-data.
55 | *
56 | * @param t Each lane shares own value, which a lane with a higher index
57 | * will get.
58 | * @param delta The difference in lane index to the source lane of the new
59 | * value, i.e. a lane with index i gets the new value from the lane of index
60 | * i - delta.
61 | * @return The @p t value of the lane with index @p delta less than the calling
62 | * lane's; a lane with a low index, under @p delta, has its own @p t returned
63 | * unchanged.
64 | */
65 | template <typename T> KAT_FD T shuffle_up(const T& t, unsigned int delta);
66 | 
67 | /**
68 | * @brief Have pairs of lanes exchange a value, with the pairing performed
69 | * by XORing bits of the lane index.
70 | *
71 | * @tparam T the type of datum to be shared with other lane(s); may be of
72 | * arbitrary size, but (at least for now) must be plain-old-data.
73 | *
74 | * @param t The value to exchange with a counterpart lane
75 | * @param mask Determines how lanes will be paired: The lane with index i
76 | * is paired with the lane with index i ^ mask.
77 | * @return The @p t value of the paired lane
78 | */
79 | template <typename T> KAT_FD T shuffle_xor(const T& t, int mask);
80 | 
81 | } // namespace kat
82 | 
83 | #include "detail/shuffle.cuh"
84 | 
85 | #endif // CUDA_KAT_ON_DEVICE_TEMPLATED_SHUFFLE_CUH_
86 | 
-------------------------------------------------------------------------------- /src/kat/on_device/streams/prefix_generators.cuh: --------------------------------------------------------------------------------
1 | #pragma once
2 | #ifndef CUDA_KAT_OSTREAM_PREFIX_GENERATORS_CUH_
3 | #define CUDA_KAT_OSTREAM_PREFIX_GENERATORS_CUH_
4 | 
5 | #include
6 | #include
7 | #include
8 | 
9 | ///@cond
10 | #include
11 | ///@endcond
12 | 
13 | namespace kat {
14 | 
15 | namespace detail {
16 | KAT_DEV unsigned num_digits_required_for(unsigned long long extremal_value)
17 | {
18 | unsigned num_digits { 1 }; while (extremal_value >= 10) { extremal_value /= 10; num_digits++; } return num_digits; // note: not ceilf(log10f(...)), which under-counts at exact powers of 10 and breaks for 0
19 | }
20 | 
21 | } // namespace detail
22 | 
23 | namespace linear_grid {
24 | 
25 | namespace prefix_generators {
26 | 
27 | template <printfing_ostream::resolution Resolution>
28 | KAT_DEV void self_identify(kat::stringstream& ss);
29 | 
30 | // Prefix will look like (example for thread 34):
31 | //
32 | // "T 34 = (00,01,02) "
33 | //
34 | // ... since 34 is the third thread (index 2) in the second warp (index 1) in
35 | // the first block.
36 | //
37 | template <>
38 | KAT_DEV void self_identify<printfing_ostream::resolution::thread>(kat::stringstream& ss)
39 | {
40 | namespace gi = ::kat::linear_grid::grid_info;
41 | 
42 | const auto global_thread_id_width = detail::num_digits_required_for(gi::grid::num_threads() - 1);
43 | const auto block_id_width = detail::num_digits_required_for(gi::grid::num_blocks() - 1);
44 | const auto warp_id_width = detail::num_digits_required_for(gi::grid::num_warps_per_block() - 1);
45 | const auto lane_id_width = 2; // ceilf(log10(warp_size - 1))
46 | constexpr const auto fill_char = '0';
47 | 
48 | ss
49 | << "T " << strf::right(gi::thread::global_id(), global_thread_id_width, fill_char)
50 | << " = (" << strf::right(gi::block::id_in_grid(), block_id_width, fill_char )
51 | << ',' << strf::right(gi::warp::id_in_block(), warp_id_width, fill_char)
52 | << ',' << strf::right(gi::lane::id(), lane_id_width, fill_char)
53 | << ") ";
54 | }
55 | 
56 | 
57 | // Prefix will look like (example for thread 1025 and block size 512):
58 | //
59 | // "W 32 = (02,00) "
60 | //
61 | // ... since thread 1025 overall is the second thread in the third block (block index 2), and thus in the first warp (warp index 0)
62 | //
63 | template <>
64 | KAT_DEV void self_identify<printfing_ostream::resolution::warp>(kat::stringstream& ss)
65 | {
66 | namespace gi = ::kat::linear_grid::grid_info;
67 | 
68 | auto global_warp_id_width = detail::num_digits_required_for(gi::grid::num_warps() - 1);
69 | auto warp_id_width = detail::num_digits_required_for(gi::grid::num_warps_per_block() - 1);
70 | auto block_id_width = detail::num_digits_required_for(gi::grid::num_blocks() - 1);
71 | constexpr const auto fill_char = '0';
72 | ss
73 | << "W " << strf::right(gi::warp::id_in_grid(), global_warp_id_width, fill_char)
74 | << " = (" << strf::right(gi::block::id_in_grid(), block_id_width, fill_char)
75 | << ',' << strf::right(gi::warp::id_in_block(), warp_id_width, fill_char)
76 | << ") ";
77 | }
78 | 
79 | // Prefix will look like (example for thread 1025 and block size 512):
80 | //
81 | // "B 2 : "
82 | //
83 | // ...
since thread 1025 is in the 3rd block and block indices are 0-based
84 | //
85 | template <>
86 | KAT_DEV void self_identify<printfing_ostream::resolution::block>(kat::stringstream& ss)
87 | {
88 | namespace gi = ::kat::linear_grid::grid_info;
89 | 
90 | const unsigned block_id_width = detail::num_digits_required_for(gi::grid::num_blocks() - 1);
91 | constexpr const auto fill_char = '0';
92 | ss << "B " << strf::right(gi::block::id_in_grid(), block_id_width, fill_char) << " : ";
93 | }
94 | 
95 | template <>
96 | KAT_DEV void self_identify<printfing_ostream::resolution::grid>(kat::stringstream& ss)
97 | {
98 | ss << "G ";
99 | }
100 | 
101 | 
102 | } // namespace prefix_generators
103 | 
104 | namespace manipulators {
105 | 
106 | KAT_DEV printfing_ostream& identify( kat::printfing_ostream& os )
107 | {
108 | using namespace kat::manipulators;
109 | prefix_generator_type gen;
110 | switch(os.printing_resolution()) {
111 | case printfing_ostream::resolution::thread : gen = prefix_generators::self_identify< printfing_ostream::resolution::thread >; break;
112 | case printfing_ostream::resolution::warp : gen = prefix_generators::self_identify< printfing_ostream::resolution::warp >; break;
113 | case printfing_ostream::resolution::block : gen = prefix_generators::self_identify< printfing_ostream::resolution::block >; break;
114 | case printfing_ostream::resolution::grid : gen = prefix_generators::self_identify< printfing_ostream::resolution::grid >; break;
115 | }
116 | return os.set_prefix_generator(gen);
117 | }
118 | } // namespace manipulators
119 | 
120 | } // namespace linear_grid
121 | 
122 | } // namespace kat
123 | 
124 | #endif // CUDA_KAT_OSTREAM_PREFIX_GENERATORS_CUH_
125 | 
-------------------------------------------------------------------------------- /src/kat/on_device/streams/printfing_ostream.cuh: --------------------------------------------------------------------------------
1 | /**
2 | * @file on_device/printfing_ostream.cuh
3 | *
4 | * @brief CUDA device-side functions for a C++-standard-library-like stream
5 | * whose output (eventually) gets printed using CUDA's device-side printf().
6 | *
7 | */
8 | #pragma once
9 | #ifndef CUDA_KAT_ON_PRINTFING_STREAM_CUH_
10 | #define CUDA_KAT_ON_PRINTFING_STREAM_CUH_
11 | 
12 | #include
13 | #include
14 | 
15 | ///@cond
16 | #include
17 | ///@endcond
18 | 
19 | namespace kat {
20 | 
21 | namespace manipulators {
22 | 
23 | using prefix_generator_type = void (*)(kat::stringstream&);
24 | // TODO: Make it into a function with an std-string-like output, when we
25 | // have an std-string-like class.
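//
// A minimal usage sketch (hypothetical device-side code, for illustration
// only - not part of this header): with the `identify` manipulator defined
// in prefix_generators.cuh above, a kernel can tag each flushed line with
// the printing thread's coordinates:
//
//   __global__ void example_kernel()
//   {
//       kat::printfing_ostream cout;
//       cout << kat::linear_grid::manipulators::identify;
//       cout << "value = " << 42 << kat::manipulators::endl;
//   }
//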
26 | 
27 | KAT_DEV auto prefix(prefix_generator_type gen);
28 | 
29 | }
30 | 
31 | 
32 | class printfing_ostream
33 | {
34 | static constexpr const std::size_t cout_initial_buffer_size { 1 << 8 };
35 | 
36 | public:
37 | enum class resolution { thread, warp, block, grid };
38 | 
39 | KAT_DEV printfing_ostream(std::size_t initial_buffer_size = cout_initial_buffer_size) : main_buffer(initial_buffer_size) { }
40 | KAT_DEV printfing_ostream(printfing_ostream&& other) : main_buffer(other.main_buffer) { }
41 | KAT_DEV printfing_ostream(const printfing_ostream& other) : main_buffer(other.main_buffer) { }
42 | KAT_DEV ~printfing_ostream();
43 | 
44 | // Note: You can also use strf::flush if that exists
45 | KAT_DEV void flush()
46 | {
47 | if (not newline_on_flush and main_buffer.tellp() == 0) {
48 | // Note: Returning even though we could have a prefix
49 | return;
50 | }
51 | 
52 | if (not should_act_for_resolution(printing_resolution_)) {
53 | return;
54 | }
55 | if (use_prefix) {
56 | // The prefix is re-generated as necessary
57 | prefix_generator(prefix);
58 | printf(newline_on_flush ? "%*s%*s\n" : "%*s%*s",
59 | prefix.tellp(), prefix.c_str(),
60 | main_buffer.tellp(), main_buffer.c_str());
61 | }
62 | else {
63 | printf(newline_on_flush ? "%*s\n" : "%*s",
64 | main_buffer.tellp(), main_buffer.c_str()
65 | );
66 | }
67 | main_buffer.clear();
68 | prefix.clear(); // We're not caching the prefix
69 | }
70 | 
71 | protected:
72 | static bool KAT_DEV should_act_for_resolution(resolution r) {
73 | // TODO: It might be a better idea to check which threads in the warp/block are still active
74 | // rather than assuming the first one is.
75 | switch(r) {
76 | case resolution::thread: return true;
77 | case resolution::warp: return grid_info::thread::is_first_in_warp();
78 | case resolution::block: return grid_info::thread::is_first_in_block();
79 | case resolution::grid: return grid_info::thread::is_first_in_grid();
80 | default: return false; // but can't get here
81 | }
82 | }
83 | 
84 | public:
85 | template <typename T>
86 | KAT_DEV printfing_ostream& operator<<(const T& arg)
87 | {
88 | if (not should_act_for_resolution(printing_resolution_)) { return *this; }
89 | strf::print_preview no_preview;
90 | strf::make_printer(strf::rank<5>(), strf::pack(), no_preview, arg).print_to(main_buffer);
91 | return *this;
92 | }
93 | 
94 | // Manipulators are a clever, but confusing, idea from the C++ standard library's
95 | // IO streams: They're functions which manipulate streams, but can also be made
96 | // to manipulate them by being sent to them using the << operator - which instead
97 | // of actually adding any data to the stream, invokes the manipulator function.
98 | //
99 | using manipulator = kat::printfing_ostream& ( kat::printfing_ostream& );
100 | 
101 | KAT_DEV printfing_ostream& no_prefix()
102 | {
103 | use_prefix = false;
104 | prefix_generator = nullptr;
105 | prefix.clear(); // Maybe we should set it to a stringstream of size 0?
106 | return *this;
107 | }
108 | 
109 | KAT_DEV printfing_ostream& set_prefix_generator(manipulators::prefix_generator_type gen)
110 | {
111 | use_prefix = true;
112 | prefix_generator = gen;
113 | return *this;
114 | }
115 | 
116 | KAT_DEV printfing_ostream& no_newline_on_flush()
117 | {
118 | newline_on_flush = false;
119 | return *this;
120 | }
121 | 
122 | KAT_DEV printfing_ostream& append_newline_on_flush()
123 | {
124 | newline_on_flush = true;
125 | return *this;
126 | }
127 | 
128 | // Drops whatever's in the buffer.
Also clears the prefix -
129 | // as that's assumed to have been resolution-related
130 | KAT_DEV printfing_ostream& set_printing_resolution(resolution new_resolution)
131 | {
132 | main_buffer.clear();
133 | no_prefix();
134 | printing_resolution_ = new_resolution;
135 | return *this;
136 | }
137 | 
138 | // Getter only - does not change the resolution nor clear anything
139 | KAT_DEV resolution printing_resolution() const { return printing_resolution_; }
140 | 
141 | protected:
142 | kat::stringstream main_buffer;
143 | kat::stringstream prefix { 100 }; // { 0 };
144 | // no prefix by default, so why bother allocating a buffer?
145 | // TODO: Make this an optional along with the prefix_generator
146 | 
147 | bool flush_on_destruction { true };
148 | bool newline_on_flush { false };
149 | 
150 | // We may want to prefix our printing with a string which the code using cout has not explicitly specified
151 | // beforehand. For example: An identifier of the current thread or warp.
152 | 
153 | bool use_prefix { false };
154 | // TODO: Make this into a kat::optional when we have an optional class,
155 | // and perhaps simply fold use_prefix into it
156 | manipulators::prefix_generator_type prefix_generator { nullptr };
157 | 
158 | 
159 | // By default, all grid threads print; but we may want a printing only once per each warp, or block etc;
160 | // that resolution is controlled by this variable.
161 | 
162 | resolution printing_resolution_ { resolution::thread };
163 | 
164 | };
165 | 
166 | 
167 | namespace manipulators {
168 | KAT_FD kat::printfing_ostream& flush( kat::printfing_ostream& os ) { os.flush(); return os; }
169 | KAT_FD kat::printfing_ostream& endl( kat::printfing_ostream& os ) { os << '\n'; os.flush(); return os; }
170 | KAT_FD kat::printfing_ostream& no_prefix( kat::printfing_ostream& os ) { return os.no_prefix(); }
171 | KAT_FD kat::printfing_ostream& no_newline_on_flush( kat::printfing_ostream& os ) { return os.no_newline_on_flush(); }
172 | KAT_FD kat::printfing_ostream& newline_on_flush( kat::printfing_ostream& os ) { return os.append_newline_on_flush(); }
173 | 
174 | } // manipulators
175 | 
176 | 
177 | // This is defined only with __CUDA_ARCH__, since the implementation is actually device-only,
178 | // referring to this->flush(), which can only really run on the device. We could, instead,
179 | // make printfing_ostream::flush() be an STRF_HD (host-and-device) function, which simply
180 | // fails on the host side, but that would be too much of a lie.
181 | #ifdef __CUDA_ARCH__
182 | 
183 | KAT_DEV printfing_ostream::~printfing_ostream()
184 | {
185 | this->flush();
186 | }
187 | 
188 | #endif
189 | 
190 | template <>
191 | KAT_DEV printfing_ostream& printfing_ostream::operator<< (
192 | printfing_ostream::manipulator& manip)
193 | {
194 | return manip(*this);
195 | }
196 | 
197 | namespace manipulators {
198 | KAT_DEV auto prefix(prefix_generator_type gen) {
199 | return [gen](kat::printfing_ostream& os) { return os.set_prefix_generator(gen); };
200 | }
201 | } // namespace manipulators
202 | 
203 | // This conditional compilation segment is necessary because NVCC (10.x) will not accept a
204 | // reference-to/address-of a device function except in device function bodies, or when __CUDA_ARCH__
205 | // is defined. In other words: we have a "device-side 'using' statement" here, followed by the
206 | // operator<<() function which makes use of it.
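//
// For illustration, a hedged usage sketch of the argument-carrying `prefix`
// manipulator defined just above; the generator function and kernel here
// are hypothetical, not part of this header:
//
//   __device__ void example_prefix_generator(kat::stringstream& ss) { ss << "[demo] "; }
//
//   __global__ void example_kernel()
//   {
//       kat::printfing_ostream cout;
//       cout << kat::manipulators::prefix(example_prefix_generator);
//       cout << "hello" << kat::manipulators::endl;   // flushes "[demo] hello"
//   }
//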
207 | #ifdef __CUDA_ARCH__
208 | namespace manipulators {
209 | using prefix_setting_manipulator_type = std::result_of< decltype(&prefix)(prefix_generator_type) >::type;
210 | } // namespace manipulators
211 | 
212 | KAT_DEV printfing_ostream& operator<< (printfing_ostream& os, manipulators::prefix_setting_manipulator_type manip)
213 | {
214 | manip(os);
215 | return os;
216 | }
217 | #endif
218 | 
219 | 
220 | namespace manipulators {
221 | KAT_DEV auto resolution(printfing_ostream::resolution new_resolution) {
222 | return [new_resolution](kat::printfing_ostream& os) { return os.set_printing_resolution(new_resolution); };
223 | }
224 | } // namespace manipulators
225 | 
226 | #ifdef __CUDA_ARCH__
227 | namespace manipulators {
228 | using resolution_setting_manipulator_type = std::result_of< decltype(&resolution)(printfing_ostream::resolution) >::type;
229 | } // namespace manipulators
230 | 
231 | KAT_DEV printfing_ostream& operator<< (printfing_ostream& os, manipulators::resolution_setting_manipulator_type manip)
232 | {
233 | manip(os);
234 | return os;
235 | }
236 | #endif
237 | 
238 | 
239 | using manipulators::flush;
240 | using manipulators::endl;
241 | 
242 | } // namespace kat
243 | 
244 | #include
245 | 
246 | #endif // CUDA_KAT_ON_PRINTFING_STREAM_CUH_
247 | 
-------------------------------------------------------------------------------- /src/kat/on_device/streams/stringstream.cuh: --------------------------------------------------------------------------------
1 | /**
2 | * @file on_device/streams/stringstream.cuh
3 | *
4 | * @brief A string stream class for CUDA device-side code (usable by individual threads).
5 | *
6 | * @note This class will likely be rather slow in use: Its code is entirely serial, and it
7 | * uses occasional dynamic memory allocations. You are advised to use it mostly for debugging
8 | * purposes.
9 | */
10 | #pragma once
11 | #ifndef CUDA_KAT_ON_DEVICE_STRINGSTREAM_CUH_
12 | #define CUDA_KAT_ON_DEVICE_STRINGSTREAM_CUH_
13 | 
14 | #include
15 | 
16 | #include
17 | 
18 | // Necessary for printf()'ing in kernel code
19 | #include
20 | 
21 | ///@cond
22 | #include
23 | ///@endcond
24 | 
25 | namespace kat {
26 | 
27 | namespace detail {
28 | 
29 | template <typename T>
30 | KAT_DEV T* safe_malloc(std::size_t size)
31 | {
32 | auto p = malloc(size);
33 | if (p == nullptr) {
34 | asm("trap;");
35 | }
36 | return static_cast<T*>(p);
37 | }
38 | 
39 | }
40 | 
41 | /**
42 | * An std::stringstream-like class into which one can add formatted
43 | * data using `my_stringstream << my_datum`. It won't accept std::ios
44 | * flags - since we can't depend on host-side-only IOS code - but it
45 | * will accept a bunch of strf equivalents. See:
46 | *
47 | * @url https://robhz786.github.io/strf/doc/quick_reference.html#format_functions
48 | *
49 | * for a list of format functions.
50 | *
51 | * @note This class owns its buffer.
52 | * @note nothing is dynamically allocated if the length is 0
53 | */
54 | class stringstream: public ::strf::basic_outbuf<char>
55 | {
56 | public:
57 | using char_type = char;
58 | // no traits_type - this is not implemented by strf
59 | // using int_type = int; // really
60 | // using off_type = std::off_t; // really?
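// A brief usage sketch (hypothetical device-side code, for illustration only):
//
//   kat::stringstream ss { 64 };            // 64-byte initial buffer
//   ss << "x = " << 12.34 << ", y = " << 7;
//   printf("%s\n", ss.c_str());             // the buffer grows via recycle() if needed
//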
61 | using pos_type = std::size_t;
62 | 
63 | protected:
64 | // Note: initial_buffer_size + 1 bytes must be allocated
65 | STRF_HD stringstream(char_type* initial_buffer, std::size_t initial_buffer_size) :
66 | buffer_size(initial_buffer_size),
67 | buffer(initial_buffer),
68 | strf::basic_outbuf<char>(initial_buffer, initial_buffer_size)
69 | {
70 | }
71 | 
72 | public:
73 | STRF_HD stringstream(std::size_t initial_buffer_size);
74 | 
75 | STRF_HD stringstream(stringstream&& other) : strf::basic_outbuf<char>(other.buffer, other.buffer_size)
76 | {
77 | // Note: Nothing to free() here - our own buffer member has not been
78 | // initialized yet, so there is nothing of ours to release; we simply
79 | // take over the other stream's buffer.
80 | buffer = other.buffer;
81 | buffer_size = other.buffer_size;
82 | other.buffer = nullptr;
83 | other.buffer_size = 0;
84 | }
85 | 
86 | STRF_HD stringstream(const stringstream& other) : stringstream(other.buffer_size)
87 | {
88 | if (buffer != nullptr) { memcpy(buffer, other.buffer, sizeof(char_type) * (buffer_size + 1)); }
89 | }
90 | 
91 | STRF_HD ~stringstream()
92 | {
93 | if (buffer != nullptr) {
94 | free(buffer);
95 | }
96 | }
97 | 
98 | STRF_HD void recycle() override;
99 | 
100 | KAT_DEV void clear()
101 | {
102 | set_pos(buffer);
103 | flush();
104 | }
105 | 
106 | KAT_DEV void flush() {
107 | if (buffer != nullptr) {
108 | *pos() = '\0';
109 | }
110 | }
111 | 
112 | // TODO: We should be able to produce an std-string-like proxy supporting a c_str() method, rather
113 | // than providing a c_str() directly.
114 | 
115 | KAT_DEV const char* c_str()
116 | {
117 | flush();
118 | return buffer;
119 | }
120 | 
121 | KAT_DEV pos_type tellp() const { return pos() - buffer; }
122 | KAT_DEV bool empty() const { return tellp() == 0; }
123 | // std::stringstream's don't have this
124 | KAT_DEV stringstream& seekp(pos_type pos) { set_pos(buffer + pos); return *this; }
125 | 
126 | KAT_DEV std::size_t capacity() const { return buffer_size; } // perhaps there's something else we can use instead?
127 | 
128 | // To implement (maybe):
129 | //
130 | // seekp
131 | // tellp
132 | // put
133 | // write
134 | // swap <- NO.
135 | //
136 | // good
137 | // eof
138 | // fail
139 | // bad
140 | // operator!
141 | // operator bool
142 | // rdstate
143 | // setstate
144 | // copyfmt
145 | // fill
146 | // exceptions <- No exception support; but might still implement this
147 | // imbue <- No locale support
148 | // tie
149 | // narrow <- No locale support
150 | // widen <- No locale support
151 | //
152 | // flags
153 | // setf
154 | // unsetf
155 | // precision
156 | // width
157 | // imbue <- No locale support
158 | // getloc <- No locale support
159 | // xalloc, iword, pword <- Not relevant on the device side, I think.
160 | // register_callback <- ??
161 | // sync_with_stdio <- No
162 | //
163 | 
164 | 
165 | 
166 | protected:
167 | // TODO: Write and use a device-side unique_ptr class and use kat::unique_ptr
168 | // instead of these two variables
169 | std::size_t buffer_size; // not including space for a trailing '\0'.
170 | char_type* buffer;
171 | };
172 | 
173 | #ifdef __CUDA_ARCH__
174 | 
175 | STRF_HD stringstream::stringstream(std::size_t initial_buffer_size)
176 | : stringstream(
177 | initial_buffer_size == 0 ? nullptr : detail::safe_malloc<char_type>(initial_buffer_size + 1),
178 | initial_buffer_size)
179 | {
180 | }
181 | 
182 | KAT_DEV void stringstream::recycle()
183 | {
184 | std::size_t used_size = (buffer_size == 0) ? 0 : (this->pos() - buffer);
185 | // a postcondition of recycle() is that at least so much free space is available.
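// (Worked example of the growth computation below, under the assumption of a
// 64-byte buffer with 60 bytes used: the new size is the maximum of 128 - i.e.
// doubling - and 60 + strf::min_size_after_recycle(), so doubling wins unless
// strf's promised minimum free space forces a larger allocation.)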
186 | auto new_buffer_size = builtins::maximum(
187 | buffer_size * 2,
188 | used_size + strf::min_size_after_recycle());
189 | auto new_buff = detail::safe_malloc<char_type>(new_buffer_size + 1);
190 | if (buffer != nullptr) {
191 | memcpy(new_buff, buffer, sizeof(char_type) * used_size);
192 | free(buffer);
193 | }
194 | this->set_pos(new_buff + used_size);
195 | this->set_end(new_buff + new_buffer_size);
196 | buffer = new_buff; buffer_size = new_buffer_size; // remember the new capacity, or the next doubling would start from the stale size
197 | }
198 | #endif
199 | 
200 | 
201 | template <typename T>
202 | KAT_DEV stringstream& operator<<(stringstream& out, const T& arg)
203 | {
204 | if (out.capacity() == 0) {
205 | // We should not need to do the following. However, for some reason, make_printer(...).print_to(out)
206 | // will fail on empty (nullptr) buffers; so we might end up "recycle()ing" more than once for the same
207 | // streaming operation.
208 | out.recycle();
209 | }
210 | 
211 | // TODO:
212 | // 1. Can `no_preview` be made constant?
213 | // 2. Can't we target a specific overload rather than play with ranks?
214 | auto no_preview = ::strf::print_preview{};
215 | ::strf::make_printer(
216 | ::strf::rank<5>(),
217 | // strf::rank is a method for controlling matching within the overload set:
218 | // rank objects have no members, it's only about their type. Higher rank objects can
219 | // match lower-rank objects (i.e. match functions in the overload sets expecting lower-rank
220 | // objects), which means they have access to more of the overload sets. If we create
221 | // a lower-rank object here we will only be able to match a few overload set members.
222 | ::strf::pack(),
223 | // not modifying any facets such as digit grouping or digit separator
224 | no_preview,
225 | // Don't know what this means actually
226 | arg
227 | ).print_to(out);
228 | 
229 | // Note: This function doesn't actually rely on out being a stringstream; any
230 | // ostream-like class would do. But for now, we don't have any ostreams other
231 | // than the stringstream, so we'll leave it this way. Later, we could either
232 | // have an intermediate class, or wrap basic_outbuf with an ostream class
233 | // without a buffer, or just call basic_outbuf an ostream
234 | 
235 | return out;
236 | }
237 | 
238 | } // namespace kat
239 | #endif // CUDA_KAT_ON_DEVICE_STRINGSTREAM_CUH_
240 | 
241 | 
-------------------------------------------------------------------------------- /src/kat/on_device/time.cuh: --------------------------------------------------------------------------------
1 | /**
2 | * @file on_device/time.cuh
3 | *
4 | * @brief CUDA device-side functions having to do with timing and the hardware clock.
5 | */
6 | 
7 | #ifndef CUDA_KAT_ON_DEVICE_TIME_CUH_
8 | #define CUDA_KAT_ON_DEVICE_TIME_CUH_
9 | 
10 | #include
11 | 
12 | ///@cond
13 | #include
14 | ///@endcond
15 | 
16 | namespace kat {
17 | 
18 | enum class sleep_resolution { clock_cycles, nanoseconds };
19 | 
20 | using clock_value_t = long long int;
21 | 
22 | static_assert(std::is_same< decltype(clock64()), clock_value_t>::value, "Unexpected clock function result type");
23 | // CUDA uses a signed type for clock values - for some unknown reason; See the declaration of clock64()
24 | 
25 | ///@cond
26 | namespace detail {
27 | 
28 | template <sleep_resolution Resolution>
29 | struct sleep_unit;
30 | 
31 | template<> struct sleep_unit<sleep_resolution::clock_cycles> { using type = clock_value_t; };
32 | template<> struct sleep_unit<sleep_resolution::nanoseconds> { using type = unsigned int; };
33 | // Why unsigned int? See the declaration of nanosleep()...
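// A usage sketch for the sleep<>() functions declared below (hypothetical
// device-side code, for illustration only):
//
//   kat::sleep<kat::sleep_resolution::clock_cycles>(1000);
//       // busy-wait for at least ~1000 SM clock cycles
//   kat::sleep<kat::sleep_resolution::nanoseconds>(1000);
//       // only available on compute capability 7.0 (Volta) and later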
34 | 
35 | } // namespace detail
36 | ///@endcond
37 | 
38 | template <sleep_resolution Resolution>
39 | using sleep_unit_t = typename detail::sleep_unit<Resolution>::type;
40 | 
41 | 
42 | /**
43 | * @brief Have the calling thread busy-sleep for (at least) a certain
44 | * number of clock cycles.
45 | *
46 | * @note In 2017, a typical GPU clock cycle is around 1 ns (i.e. 1 GHz frequency).
47 | *
48 | */
49 | template <sleep_resolution Resolution>
50 | KAT_DEV void sleep(sleep_unit_t<Resolution> duration) = delete;
51 | 
52 | template<>
53 | KAT_DEV void sleep<sleep_resolution::clock_cycles>(
54 | sleep_unit_t<sleep_resolution::clock_cycles> num_cycles)
55 | {
56 | // The clock64() function returns an SM-specific clock ticks value,
57 | // which occasionally gets reset. Even if it were not reset, it would
58 | // only wrap around in 300 years or so since it began ticking, which is
59 | // why there's no need to check for wrap-around.
60 | // Also, it seems this code is not optimized-away despite not having
61 | // any obvious side effects.
62 | clock_value_t start = clock64();
63 | clock_value_t cycles_elapsed;
64 | do { cycles_elapsed = clock64() - start; }
65 | while (cycles_elapsed < num_cycles);
66 | }
67 | 
68 | #if __CUDA_ARCH__ >= 700
69 | 
70 | template<>
71 | KAT_DEV void sleep<sleep_resolution::nanoseconds>(
72 | sleep_unit_t<sleep_resolution::nanoseconds> num_nanoseconds)
73 | {
74 | __nanosleep(num_nanoseconds);
75 | }
76 | 
77 | #endif // __CUDA_ARCH__ >= 700
78 | 
79 | } // namespace kat
80 | 
81 | #endif // CUDA_KAT_ON_DEVICE_TIME_CUH_
82 | 
-------------------------------------------------------------------------------- /src/kat/reference_wrapper.hpp: --------------------------------------------------------------------------------
1 | /**
2 | * @file kat/reference_wrapper.hpp
3 | *
4 | * @brief This file implements `kat::reference_wrapper`, an equivalent of
5 | * C++11's `std::reference_wrapper` which may be used both in host-side and
6 | * CUDA-device-side code.
7 | */
8 | 
9 | //
10 | // Original code Copyright (c) Electronic Arts Inc. All rights reserved
11 | // Modifications Copyright (c) 2020 Eyal Rozenberg.
12 | //
13 | // Redistribution and use in source and binary forms, with or without
14 | // modification, are permitted provided that the following conditions are met:
15 | //
16 | // 1. Redistributions of source code must retain the above copyright notice, this
17 | // list of conditions and the following disclaimer.
18 | //
19 | // 2. Redistributions in binary form must reproduce the above copyright notice,
20 | // this list of conditions and the following disclaimer in the documentation
21 | // and/or other materials provided with the distribution.
22 | //
23 | // 3. Neither the name of the copyright holder nor the names of its
24 | // contributors may be used to endorse or promote products derived from
25 | // this software without specific prior written permission.
26 | //
27 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 | // DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
31 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
33 | // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
34 | // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
35 | // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 | //
38 | // Note: Retrieved from https://github.com/electronicarts/EASTL/ , master branch,
39 | // on 2020-03-11.
40 | 
41 | 
42 | #ifndef CUDA_KAT_REFERENCE_WRAPPER_HPP_
43 | #define CUDA_KAT_REFERENCE_WRAPPER_HPP_
44 | 
45 | #include
46 | #include
47 | #include
48 | 
49 | namespace kat {
50 | 
51 | /// reference_wrapper
52 | template <typename T>
53 | class reference_wrapper
54 | {
55 | public:
56 | typedef T type;
57 | 
58 | KAT_HD reference_wrapper(T&) noexcept;
59 | KAT_HD reference_wrapper(T&&) = delete;
60 | KAT_HD reference_wrapper(const reference_wrapper& x) noexcept;
61 | 
62 | KAT_HD reference_wrapper& operator=(const reference_wrapper& x) noexcept;
63 | 
64 | KAT_HD operator T& () const noexcept;
65 | KAT_HD T& get() const noexcept;
66 | 
67 | template <typename... ArgTypes>
68 | KAT_HD typename std::result_of<T&(ArgTypes&&...)>::type operator() (ArgTypes&&...) const;
69 | 
70 | private:
71 | T* val;
72 | };
73 | 
74 | template <typename T>
75 | KAT_HD reference_wrapper<T>::reference_wrapper(T &v) noexcept
76 | // Originally, EASTL has:
77 | //
78 | // : val(addressof(v))
79 | //
80 | // here. But we can't use std::addressof, since it is not accessible in device-side code;
81 | // and we don't have the utility functions implemented in device-and-host versions.
82 | // So - we'll just inline an implementation of std::addressof() here instead
83 | : val(
84 | reinterpret_cast<T*>(
85 | &const_cast<char&>(
86 | reinterpret_cast<const volatile char&>(v)
87 | )
88 | )
89 | )
90 | {}
91 | 
92 | template <typename T>
93 | KAT_HD reference_wrapper<T>::reference_wrapper(const reference_wrapper& other) noexcept
94 | : val(other.val)
95 | {}
96 | 
97 | template <typename T>
98 | KAT_HD reference_wrapper<T>& reference_wrapper<T>::operator=(const reference_wrapper& other) noexcept
99 | {
100 | val = other.val;
101 | return *this;
102 | }
103 | 
104 | template <typename T>
105 | KAT_HD reference_wrapper<T>::operator T&() const noexcept
106 | {
107 | return *val;
108 | }
109 | 
110 | template <typename T>
111 | KAT_HD T& reference_wrapper<T>::get() const noexcept
112 | {
113 | return *val;
114 | }
115 | 
116 | template <typename T>
117 | template <typename... ArgTypes>
118 | KAT_HD typename std::result_of<T&(ArgTypes&&...)>::type reference_wrapper<T>::operator() (ArgTypes&&...
args) const
119 | {
120 | // return std::invoke(*val, std::forward<ArgTypes>(args)...);
121 | return (*val)(std::forward<ArgTypes>(args)...);
122 | }
123 | 
124 | // reference_wrapper-specific utilities
125 | template <typename T>
126 | KAT_HD reference_wrapper<T> ref(T& t) noexcept
127 | {
128 | return kat::reference_wrapper<T>(t);
129 | }
130 | 
131 | template <typename T>
132 | KAT_HD void ref(const T&&) = delete;
133 | 
134 | template <typename T>
135 | KAT_HD reference_wrapper<T> ref(reference_wrapper<T> t) noexcept
136 | {
137 | return kat::ref(t.get());
138 | }
139 | 
140 | template <typename T>
141 | KAT_HD reference_wrapper<const T> cref(const T& t) noexcept
142 | {
143 | return kat::reference_wrapper<const T>(t);
144 | }
145 | 
146 | template <typename T>
147 | KAT_HD void cref(const T&&) = delete;
148 | 
149 | template <typename T>
150 | KAT_HD reference_wrapper<const T> cref(reference_wrapper<T> t) noexcept
151 | {
152 | return kat::cref(t.get());
153 | }
154 | 
155 | 
156 | // reference_wrapper-specific type traits
157 | template <typename T>
158 | struct is_reference_wrapper_helper
159 | : public std::false_type {};
160 | 
161 | template <typename T>
162 | struct is_reference_wrapper_helper< kat::reference_wrapper<T> >
163 | : public std::true_type {};
164 | 
165 | template <typename T>
166 | struct is_reference_wrapper
167 | : public kat::is_reference_wrapper_helper<typename std::remove_cv<T>::type> {};
168 | 
169 | 
170 | // Helper which adds a reference to a type when given a reference_wrapper of that type.
171 | template <typename T>
172 | struct remove_reference_wrapper
173 | { typedef T type; };
174 | 
175 | template <typename T>
176 | struct remove_reference_wrapper< kat::reference_wrapper<T> >
177 | { typedef T& type; };
178 | 
179 | template <typename T>
180 | struct remove_reference_wrapper< const kat::reference_wrapper<T> >
181 | { typedef T& type; };
182 | 
183 | /*
184 | // reference_wrapper specializations of invoke
185 | // These have to come after reference_wrapper is defined, but reference_wrapper needs to have a
186 | // definition of invoke, so these specializations need to come after everything else has been defined.
187 | template <typename R, typename C, typename T, typename... Args>
188 | auto invoke_impl(R (C::*func)(Args...), T&& obj, Args&&... args) ->
189 | typename std::enable_if<is_reference_wrapper<typename std::decay<T>::type>::value,
190 | decltype((obj.get().*func)(std::forward<Args>(args)...))>::type
191 | {
192 | return (obj.get().*func)(std::forward<Args>(args)...);
193 | }
194 | 
195 | template <typename M, typename C, typename T>
196 | auto invoke_impl(M(C::*member), T&& obj) ->
197 | typename std::enable_if<is_reference_wrapper<typename std::decay<T>::type>::value,
198 | decltype(obj.get().*member)>::type
199 | {
200 | return obj.get().*member;
201 | }
202 | */
203 | 
204 | } // namespace kat
205 | 
206 | #endif // CUDA_KAT_REFERENCE_WRAPPER_HPP_
207 | 
-------------------------------------------------------------------------------- /src/kat/utility.hpp: --------------------------------------------------------------------------------
1 | /**
2 | * @file kat/utility.hpp
3 | *
4 | * @brief An adaptation for host-and-device use of some
5 | * of the standard C++ library's `<utility>` code.
6 | */
7 | #pragma once
8 | #ifndef CUDA_KAT_UTILITY_HPP_
9 | #define CUDA_KAT_UTILITY_HPP_
10 | 
11 | #include
12 | 
13 | #include
14 | #include <utility> // Mainly so that KAT code can use our header as a drop-in for itself
15 | 
16 | ///@cond
17 | #include
18 | ///@endcond
19 | #include
20 | 
21 | 
22 | namespace kat {
23 | 
24 | #ifdef KAT_DEFINE_MOVE_AND_FORWARD
25 | template <typename T>
26 | constexpr KAT_FHD typename std::remove_reference<T>::type&& move(T&& v) noexcept
27 | {
28 | return static_cast<typename std::remove_reference<T>::type&&>(v);
29 | }
30 | 
31 | template <typename T>
32 | constexpr KAT_FHD T&& forward(typename std::remove_reference<T>::type& v) noexcept
33 | {
34 | return static_cast<T&&>(v);
35 | }
36 | 
37 | template <typename T>
38 | constexpr KAT_FHD T&& forward(typename std::remove_reference<T>::type&& v) noexcept
39 | {
40 | return static_cast<T&&>(v);
41 | }
42 | #endif
43 | 
44 | #if __cplusplus >= 201402L
45 | template <typename T, typename U = T>
46 | constexpr KAT_FHD auto exchange (T& x, U&& new_value) // TODO: A noexcept clause?
47 | {
48 | #ifndef KAT_DEFINE_MOVE_AND_FORWARD
49 | using std::move;
50 | using std::forward;
51 | #endif
52 | auto old_value = move(x);
53 | x = forward<U>(new_value);
54 | return old_value;
55 | }
56 | #endif // __cplusplus >= 201402L
57 | 
58 | /**
59 | * @brief Swap two values on the device-side, in-place.
60 | *
61 | * @note A (CUDA, or any other) compiler will often not actually
62 | * emit any code when this function is used. Instead, it will use
63 | * one argument instead of the other in later code, i.e. "swap"
64 | * them in its own internal figuring.
65 | *
66 | * @note Is this enough, without the multiple specializations for std::swap?
67 | * @todo How does EASTL swap work? Should I incorporate its specializations?
68 | *
69 | * @note Some kat types overload this default implementation.
70 | *
71 | */
72 | template <typename T>
73 | KAT_FHD CONSTEXPR_SINCE_CPP_14 void swap( T& a, T& b )
74 | noexcept(
75 | std::is_nothrow_move_constructible<T>::value &&
76 | std::is_nothrow_move_assignable<T>::value
77 | )
78 | {
79 | #ifndef KAT_DEFINE_MOVE_AND_FORWARD
80 | using std::move;
81 | #endif
82 | T tmp ( move(a) );
83 | a = move(b);
84 | b = move(tmp);
85 | }
86 | 
87 | namespace detail {
88 | 
89 | template <typename T>
90 | struct addr_impl_ref
91 | {
92 | T& v_;
93 | 
94 | KAT_FHD addr_impl_ref( T& v ): v_( v ) {}
95 | KAT_FHD operator T& () const { return v_; }
96 | 
97 | private:
98 | KAT_FHD addr_impl_ref & operator=(const addr_impl_ref &);
99 | };
100 | 
101 | template <typename T>
102 | struct addressof_impl
103 | {
104 | static KAT_FHD T* f( T& v, long ) {
105 | return reinterpret_cast<T*>(
106 | &const_cast<char&>(reinterpret_cast<const volatile char&>(v)));
107 | }
108 | 
109 | static KAT_FHD T* f( T* v, int ) { return v; }
110 | };
111 | 
112 | } // namespace detail
113 | 
114 | /**
115 | * @brief Obtains the actual address of the object or function arg, even in presence of overloaded `operator&()`
116 | *
117 | * @note In the standard library, this function is somehow in @ref `<memory>`.
118 | *
119 | * @{
120 | */
121 | template <typename T>
122 | KAT_FHD T* addressof( T& v ) {
123 | // Note the complex implementation details are due to some objects
124 | // overloading their & operator
125 | return detail::addressof_impl<T>::f( detail::addr_impl_ref<T>( v ), 0 );
126 | }
127 | 
128 | /** @} */
129 | template <typename T>
130 | KAT_FHD const T* addressof(const T&&) = delete;
131 | 
132 | } // namespace kat
133 | 
134 | #endif // CUDA_KAT_UTILITY_HPP_
135 | 
-------------------------------------------------------------------------------- /tests/CMakeLists.txt: --------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.8.2)
2 | 
3 | 
4 | ###############
5 | ## Modules ##
6 | ###############
7 | 
8 | # Standard CMake modules
9 | 
10 | # Custom modules
11 | 
12 | include(DocTest)
13 | 
14 | ################
15 | ## Packages ##
16 | ################
17 | 
18 | find_package(CUDA 8.0 REQUIRED)
19 | find_package(cuda-api-wrappers 0.3.0 REQUIRED)
20 | find_package(cuda-nvtx REQUIRED) # Actually, it's sort-of required by cuda-api-wrappers
21 | 
22 | include(CMakeDependentOption)
23 | CMAKE_DEPENDENT_OPTION(BUILD_PRINTING_RELATED_TESTS "Build (strf-based) printing-related tests" ON "BUILD_TESTS" ON)
24 | set(KEEP_PTX FALSE CACHE BOOL "Keep kernel PTX files for build targets")
25 | 
26 | # This overcomes some linking issues I've encountered... I'm sure there's a better solution
27 | set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_NVTX_LIBRARY} ${CUDA_cudadevrt_LIBRARY})
28 | 
29 | 
30 | #############
31 | ## Tests ##
32 | #############
33 | 
34 | cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS_TMP Auto)
35 | set(CUDA_ARCH_FLAGS ${CUDA_ARCH_FLAGS_TMP} CACHE STRING "CUDA gencode parameters")
36 | string(REPLACE ";" " " CUDA_ARCH_FLAGS_STR "${CUDA_ARCH_FLAGS}")
37 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS_STR}")
38 | 
39 | #add_test(
40 | # # Use some per-module/project prefix so that it is easier to run only tests for this module
41 | # NAME ${PROJECT_NAME}.failtest
42 | # COMMAND failtest ${TEST_RUNNER_PARAMS}
43 | #)
44 | ##target_set_warnings(${TEST_MAIN} ENABLE ALL AS_ERROR ALL DISABLE Annoying) # Set warnings (if needed).
45 | #set_tests_properties(
46 | # ${PROJECT_NAME}.failtest
47 | # PROPERTIES
48 | # WILL_FAIL TRUE # We expect this test to fail
49 | #)
50 | 
51 | add_library(test_utils util/random.cu)
52 | set_target_properties(
53 | test_utils
54 | PROPERTIES
55 | CXX_STANDARD 14
56 | CXX_STANDARD_REQUIRED YES
57 | CXX_EXTENSIONS NO
58 | )
59 | 
60 | set(tests
61 | array
62 | shared_memory
63 | math
64 | shuffle
65 | atomics
66 | constexpr_math
67 | time
68 | c_string
69 | span
70 | miscellany
71 | builtins
72 | grid_collaboration
73 | block_collaboration
74 | warp_collaboration
75 | tuple
76 | sequence_ops
77 | )
78 | 
79 | if (BUILD_PRINTING_RELATED_TESTS)
80 | list(APPEND tests printing)
81 | endif()
82 | 
83 | foreach(TEST_TARGET ${tests})
84 | add_executable(${TEST_TARGET} "${TEST_TARGET}.cu")
85 | target_compile_options(${TEST_TARGET} PRIVATE "--expt-relaxed-constexpr")
86 | target_compile_options(${TEST_TARGET} PRIVATE "--expt-extended-lambda")
87 | target_link_libraries(${TEST_TARGET} PRIVATE cuda-kat cuda-api-wrappers::cuda-api-wrappers doctest ${CUDA_LIBRARIES} test_utils)
88 | # I don't see why the following line should even be necessary. Depending on the libraries should be enough to get us their include dirs!
89 | target_include_directories(${TEST_TARGET} PRIVATE ${PROJECT_SOURCE_DIR}/src)
90 | 
91 | set_target_properties(
92 | ${TEST_TARGET}
93 | PROPERTIES
94 | CXX_STANDARD 14
95 | CXX_STANDARD_REQUIRED YES
96 | CXX_EXTENSIONS NO
97 | )
98 | add_test(
99 | NAME ${PROJECT_NAME}.${TEST_TARGET}
100 | COMMAND ${TEST_TARGET} ${TEST_RUNNER_PARAMS}
101 | )
102 | if (KEEP_PTX)
103 | target_compile_options(${TEST_TARGET} PRIVATE "--keep")
104 | endif()
105 | endforeach(TEST_TARGET)
106 | 
107 | if (BUILD_PRINTING_RELATED_TESTS)
108 | target_link_libraries(printing PRIVATE strf::strf-header-only)
109 | target_compile_options(printing PRIVATE --ptxas-options --suppress-stack-size-warning)
110 | endif()
111 | 
112 | # TODO: Something about code coverage perhaps?
-------------------------------------------------------------------------------- /tests/common.cuh: --------------------------------------------------------------------------------
1 | #ifndef CUDA_KAT_TESTS_COMMON_CUH
2 | #define CUDA_KAT_TESTS_COMMON_CUH
3 | 
4 | #include "util/prettyprint.hpp"
5 | #include "util/type_name.hpp"
6 | #include "util/random.hpp"
7 | #include "util/miscellany.cuh"
8 | #include "util/macro.h"
9 | #include "util/printing.hpp"
10 | 
11 | 
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include
20 | #include
21 | 
22 | #endif // CUDA_KAT_TESTS_COMMON_CUH
-------------------------------------------------------------------------------- /tests/shared_memory.cu: --------------------------------------------------------------------------------
1 | #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
2 | 
3 | #include "common.cuh"
4 | 
5 | #include
6 | #include
7 | 
8 | using namespace kat;
9 | 
10 | using shmem_size_t = shared_memory::size_t;
11 | 
12 | 
13 | struct sizes_t {
14 | shmem_size_t dynamic;
15 | shmem_size_t static_;
16 | shmem_size_t total;
17 | };
18 | 
19 | namespace kernels {
20 | 
21 | 
22 | template <shmem_size_t StaticSize>
23 | __global__ void determine_sizes(sizes_t* results)
24 | {
25 | static __shared__ char arr[StaticSize];
26 | arr[0] = 0;
27 | arr[1] = arr[0];
28 | results->dynamic = shared_memory::dynamic::size();
29 | results->static_ = shared_memory::static_::size();
30 | results->total = shared_memory::size();
31 | }
32 | 
33 | template <>
34 | __global__ void determine_sizes<0>(sizes_t* results)
35 | {
36 | results->dynamic = shared_memory::dynamic::size();
37 | results->static_ = shared_memory::static_::size();
38 | results->total = shared_memory::size();
39 | }
40 | 
41 | template <typename I>
42 | __global__ void check_overlap(shmem_size_t num_elements_per_warp, shmem_size_t* num_overlaps_encountered_by_warp)
43 | {
44 | auto warp_shared_mem = shared_memory::dynamic::warp_specific::contiguous<I>(num_elements_per_warp);
45 | 
46 | // Note: The rest of this kernel will use as little as possible kat functionality, so as not
47 | // to mix up testing different parts of the library. The price is some idiosyncrasy. Also,
48 | // we'll let just a single thread of each warp act, so as not to worry about intra-warp collaboration.
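// (Layout sketch, under the contiguous partitioning used above: with n =
// num_elements_per_warp, warp 0 owns elements [0, n), warp 1 owns [n, 2n),
// and so on. If any two warps' stretches overlapped, some element would end
// up with a count other than 1 after the increments below.)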
49 | 
50 | auto am_first_in_warp = (threadIdx.x % warp_size == 0);
51 | if (not am_first_in_warp) { return; }
52 | 
53 | // clear the warp's shared memory
54 | for(shmem_size_t i = 0; i < num_elements_per_warp; i++) { warp_shared_mem[i] = I{0}; }
55 | __syncthreads();
56 | 
57 | // touch every I-element in this warp's shared memory, in a way in which overlaps between warps'
58 | // shared memory stretches would be detected
59 | 
60 | for(shmem_size_t i = 0; i < num_elements_per_warp; i++) {
61 | atomic::increment(&(warp_shared_mem[i]));
62 | }
63 | __syncthreads();
64 | 
65 | // This could have been an std::count_if
66 | 
67 | shmem_size_t num_overlaps_encountered { 0 };
68 | for(shmem_size_t i = 0; i < num_elements_per_warp; i++) {
69 | if (warp_shared_mem[i] != (I{1})) { num_overlaps_encountered++; }
70 | }
71 | auto warp_index = threadIdx.x / warp_size;
72 | num_overlaps_encountered_by_warp[warp_index] = num_overlaps_encountered;
73 | }
74 | 
75 | 
76 | } // namespace kernels
77 | 
78 | 
79 | TEST_SUITE("shared_memory") {
80 | 
81 | TEST_CASE("correctly determining static and dynamic sizes")
82 | {
83 | constexpr const shmem_size_t allocation_quantum { 256 };
84 | // It seems that static shared memory is allocated in quanta; and that the dynamic shared memory
85 | // can fill in the gap in the last quantum if necessary
86 | 
87 | constexpr const shmem_size_t dynamic_shmem_sizes[] = { 0, 1, allocation_quantum, allocation_quantum+1 };
88 | constexpr const shmem_size_t used_static_shmem_sizes[] = { 1, allocation_quantum - 1, allocation_quantum, allocation_quantum + 1 };
89 | 
90 | // Target architecture Shared memory allocation unit size
91 | // sm_2x 128 bytes
92 | // sm_3x, sm_5x, sm_6x, sm_7x 256 bytes
93 | 
94 | 
95 | auto device { cuda::device::current::get() };
96 | auto device_side_results { cuda::memory::device::make_unique<sizes_t>(device) };
97 | for (auto dynamic_shared_mem_size : dynamic_shmem_sizes) {
98 | // TODO: The following should really be a "for constexpr" - but that doesn't exist yet
99 | for (auto i = 0; i < array_length(used_static_shmem_sizes); i++) {
100 | auto launch_config { cuda::make_launch_config(1, 1, dynamic_shared_mem_size) };
101 | sizes_t host_side_results;
102 | switch(i) {
103 | case 0: cuda::launch(kernels::determine_sizes<used_static_shmem_sizes[0]>, launch_config, device_side_results.get()); break;
104 | case 1: cuda::launch(kernels::determine_sizes<used_static_shmem_sizes[1]>, launch_config, device_side_results.get()); break;
105 | case 2: cuda::launch(kernels::determine_sizes<used_static_shmem_sizes[2]>, launch_config, device_side_results.get()); break;
106 | case 3: cuda::launch(kernels::determine_sizes<used_static_shmem_sizes[3]>, launch_config, device_side_results.get()); break;
107 | }
108 | auto static_shared_mem_size { used_static_shmem_sizes[i] };
109 | auto aligned_total_size { round_up(static_shared_mem_size + dynamic_shared_mem_size, allocation_quantum) };
110 | cuda::memory::copy(&host_side_results, device_side_results.get(), sizeof(sizes_t));
111 | CHECK(host_side_results.dynamic == dynamic_shared_mem_size);
112 | // TODO: Figure out the exact rule for how much static shared memory is actually allocated. Apparently
113 | // it depends on the existence of other kernels (???)
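// (Worked example of the round_up() computation above, assuming the 256-byte
// quantum: static = 255 with dynamic = 1 rounds up to a 256-byte total,
// while static = 257 with dynamic = 1 rounds up to 512.)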
114 | // CHECK(host_side_results.static_ == aligned_total_size - dynamic_shared_mem_size);
115 | // CHECK(host_side_results.total == aligned_total_size);
116 | }
117 | }
118 | }
119 | 
120 | TEST_CASE_TEMPLATE("allocations of per-warp shared memory do not intersect", I, int32_t, int64_t)
121 | {
122 | cuda::device_t device { cuda::device::current::get() };
123 | auto max_shared_mem = device.properties().sharedMemPerBlock;
124 | auto num_warps = device.properties().max_warps_per_block();
125 | shmem_size_t shared_mem_per_warp = max_shared_mem / num_warps;
126 | shmem_size_t num_shmem_elements_per_warp = shared_mem_per_warp / sizeof(I);
127 | auto block_size = num_warps * warp_size;
128 | auto launch_config { cuda::make_launch_config(1, block_size, num_shmem_elements_per_warp * sizeof(I) * num_warps) };
129 | auto device_side_results { cuda::memory::device::make_unique<shmem_size_t[]>(device, num_warps) };
130 | auto host_side_results { std::unique_ptr<shmem_size_t[]>(new shmem_size_t[num_warps]) };
131 | cuda::launch(kernels::check_overlap<I>, launch_config, num_shmem_elements_per_warp, device_side_results.get());
132 | cuda::memory::copy(host_side_results.get(), device_side_results.get(), sizeof(shmem_size_t) * num_warps);
133 | auto num_overlaps_found = std::accumulate(host_side_results.get(), host_side_results.get() + num_warps, 0);
134 | CHECK(num_overlaps_found == 0);
135 | }
136 | 
137 | } // TEST_SUITE("shared_memory")
-------------------------------------------------------------------------------- /tests/time.cu: --------------------------------------------------------------------------------
1 | #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
2 | // #include "common.cuh"
3 | //#include "util/prettyprint.hpp"
4 | #include "util/type_name.hpp"
5 | //#include "util/random.hpp"
6 | //#include "util/miscellany.cuh"
7 | //#include "util/macro.h"
8 | #include
9 | #include
10 | 
11 | #include
12 | #include
13 | 
14 | #include
15 | 
16 | constexpr const auto num_grid_blocks { 2 };
17 | constexpr const auto block_size { kat::warp_size + 1 };
18 | 
19 | constexpr const auto sleep_elongation_multiplicative_factor { 10000 };
20 | // we want each sleep command to take a not-insignificant amount of time
21 | 
22 | constexpr const auto sleep_elongation_additive_factor { 10 };
23 | // we want each sleep command to take a not-insignificant amount of time
24 | 
25 | 
26 | namespace kernels {
27 | 
28 | template <kat::sleep_resolution Resolution>
29 | __global__ void measure_time_and_sleep(
30 | kat::clock_value_t* __restrict__ times_before_sleep,
31 | kat::clock_value_t* __restrict__ times_after_sleep,
32 | std::size_t total_num_threads
33 | )
34 | {
35 | auto global_thread_id = threadIdx.x + blockIdx.x * blockDim.x;
36 | if (global_thread_id >= total_num_threads) { return; }
37 | auto time_before_sleep = clock64();
38 | auto sleep_duration =
39 | (global_thread_id + sleep_elongation_additive_factor ) * sleep_elongation_multiplicative_factor;
40 | if (Resolution == kat::sleep_resolution::nanoseconds) {
41 | #if __CUDA_ARCH__ >= 700
42 | kat::sleep<kat::sleep_resolution::nanoseconds>(sleep_duration);
43 | #else
44 | // we won't break the compilation; it's up to the host-side test code to not run this.
45 | asm("trap;");
46 | #endif
47 | }
48 | else {
49 | kat::sleep<kat::sleep_resolution::clock_cycles>(sleep_duration);
50 | }
51 | kat::collaborative::block::barrier();
52 | auto time_after_sleep = clock64();
53 | times_before_sleep[global_thread_id] = time_before_sleep;
54 | times_after_sleep[global_thread_id] = time_after_sleep;
55 | // thread_printf("Have slept for %u units.
Time before sleep = %20lld, after = %20lld",
56 | // (unsigned) sleep_duration, time_before_sleep, time_after_sleep);
57 | }
58 | 
59 | } // namespace kernels
60 | 
61 | 
62 | template <typename T, T Value>
63 | struct value_as_type {
64 | static constexpr const T value { Value };
65 | };
66 | 
67 | TEST_SUITE("time") {
68 | 
69 | TEST_CASE_TEMPLATE("measure_time_and_sleep", ResolutionValueAsType,
70 | value_as_type<kat::sleep_resolution, kat::sleep_resolution::clock_cycles>,
71 | value_as_type<kat::sleep_resolution, kat::sleep_resolution::nanoseconds>)
72 | {
73 | constexpr const kat::sleep_resolution resolution { ResolutionValueAsType::value };
74 | 
75 | auto device { cuda::device::current::get() };
76 | // TODO: Test shuffles with non-full warps.
77 | if ((device.properties().compute_architecture().major < 7) and
78 | (resolution == kat::sleep_resolution::nanoseconds))
79 | {
80 | // nanosecond-resolution sleep is only supported starting from Volta/Turing
81 | return;
82 | }
83 | device.reset();
84 | auto launch_config { cuda::make_launch_config(num_grid_blocks, block_size) };
85 | std::size_t total_num_threads = launch_config.grid_dimensions.volume() * launch_config.block_dimensions.volume();
86 | auto times_before_sleep = cuda::memory::device::make_unique<kat::clock_value_t[]>(device, total_num_threads);
87 | auto times_after_sleep = cuda::memory::device::make_unique<kat::clock_value_t[]>(device, total_num_threads);
88 | auto kernel = ::kernels::measure_time_and_sleep<resolution>;
89 | cuda::launch(kernel, launch_config,
90 | times_before_sleep.get(), times_after_sleep.get(), total_num_threads);
91 | cuda::outstanding_error::ensure_none();
92 | auto host_times_before_sleep = std::make_unique<kat::clock_value_t[]>(total_num_threads);
93 | auto host_times_after_sleep = std::make_unique<kat::clock_value_t[]>(total_num_threads);
94 | cuda::memory::copy(host_times_before_sleep.get(), times_before_sleep.get(), total_num_threads * sizeof(kat::clock_value_t));
95 | cuda::memory::copy(host_times_after_sleep.get(), times_after_sleep.get(), total_num_threads * sizeof(kat::clock_value_t));
96 | 
97 | device.synchronize();
98 | 
99 | for(cuda::grid::block_dimension_t block_id = 0; block_id < num_grid_blocks; block_id++) {
100 | 
101 | std::vector<kat::clock_value_t> block_times_before_sleep {
102 | host_times_before_sleep.get() + block_id * block_size,
103 | host_times_before_sleep.get() + (block_id+1) * block_size
104 | };
105 | std::vector<kat::clock_value_t> block_times_after_sleep {
106 | host_times_after_sleep.get() + block_id * block_size,
107 | host_times_after_sleep.get() + (block_id+1) * block_size
108 | };
109 | 
110 | // std::cout << "Resolution: "
111 | // << (resolution == kat::sleep_resolution::clock_cycles ? "clock_cycles" : "")
112 | // << (resolution == kat::sleep_resolution::nanoseconds ?
"nanoseconds" : "") 113 | // << std::endl; 114 | 115 | for(cuda::grid::dimension_t thread_index = 0; thread_index < block_size; ++thread_index) { 116 | CHECK(block_times_before_sleep[thread_index] < block_times_after_sleep[thread_index]); 117 | // std::cout 118 | // << "Block " << std::setw(4) << block_id << ", Thread " << std::setw(4) << thread_index << ": " 119 | // << "Before sleep: " << std::setw(20) << tbs[thread_index] << ' ' 120 | // << "After sleep: " << std::setw(20) << tas[thread_index] << std::endl; 121 | } 122 | 123 | auto max_time_before_sleep = *std::max_element(block_times_before_sleep.begin(), block_times_before_sleep.end()); 124 | auto min_time_after_sleep = *std::min_element(block_times_after_sleep.begin(), block_times_after_sleep.end()); 125 | CHECK_LT(max_time_before_sleep, min_time_after_sleep); 126 | 127 | // std::cout 128 | // << " Max time before sleep: " << std::setw(20) << max_time_before_sleep 129 | // << " Min time after sleep: " << std::setw(20) << min_time_after_sleep << std::endl; 130 | } 131 | } 132 | 133 | } // TEST_SUITE("time") 134 | -------------------------------------------------------------------------------- /tests/util/cpu_builtin_equivalents.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_KAT_TEST_UTIL_CPU_BUILTIN_EQUIVALENTS_HPP_ 2 | #define CUDA_KAT_TEST_UTIL_CPU_BUILTIN_EQUIVALENTS_HPP_ 3 | 4 | #include 5 | 6 | 7 | template 8 | constexpr inline I absolute_value(I x) 9 | { 10 | static_assert(std::is_integral::value, "Only to be used for integral types"); 11 | return x > 0 ? x : I(-x); 12 | } 13 | 14 | template <> constexpr inline float absolute_value(float x) { return std::abs(x); } 15 | template <> constexpr inline double absolute_value(double x) { return std::abs(x); } 16 | 17 | namespace detail { 18 | template 19 | constexpr inline std::make_unsigned_t absolute_difference(std::false_type, I x, I y) 20 | { 21 | // unsigned case 22 | return x < y ? y-x : x-y; 23 | } 24 | 25 | template 26 | constexpr inline std::make_unsigned_t absolute_difference(std::true_type, I x, I y) 27 | { 28 | // signed case 29 | 30 | auto have_same_sign = (x > 0) == (y > 0); 31 | if (have_same_sign) { 32 | return x < y ? y-x : x-y; 33 | } 34 | using uint_t = std::make_unsigned_t; 35 | return x < y ? 
		uint_t(-x) + uint_t(y) :
		uint_t(x) + uint_t(-y);
}

} // namespace detail

// This may be a poor implementation, don't use it elsewhere
template <typename I>
constexpr inline std::make_unsigned_t<I> absolute_difference(I x, I y)
{
	static_assert(std::is_integral<I>::value, "Only to be used for integral types");
	using is_signed = std::integral_constant<bool, std::is_signed<I>::value>;
	return detail::absolute_difference(is_signed{}, x, y);
}



template <typename I> int population_count(I x)
{
	static_assert(std::is_integral<I>::value, "Only integral types are supported");
	static_assert(sizeof(I) <= sizeof(unsigned long long), "Unexpectedly large type");

	using native_popc_type =
		typename std::conditional<
			sizeof(I) <= sizeof(unsigned),
			unsigned,
			unsigned long long
		>::type;
	return population_count(static_cast<native_popc_type>(x));
}

template <typename I> int population_count(I x);

template<> inline int population_count(unsigned x)           { return __builtin_popcount(x); }
template<> inline int population_count(unsigned long x)      { return __builtin_popcountl(x); }
template<> inline int population_count(unsigned long long x) { return __builtin_popcountll(x); }

template <typename I> inline I bit_reverse(I x)
{
	static_assert(std::is_integral<I>::value and sizeof(I) <= 8, "bit_reverse is only available for integers with 64 bits or less");
	switch(sizeof(I)) {
	case 1:  return bit_reverse(reinterpret_cast<uint8_t&>(x));
	case 2:  return bit_reverse(reinterpret_cast<uint16_t&>(x));
	case 4:  return bit_reverse(reinterpret_cast<uint32_t&>(x));
	default: return bit_reverse(reinterpret_cast<uint64_t&>(x));
	}
}

template <>
inline uint8_t bit_reverse(uint8_t x)
{
	static unsigned char lookup[16] = {
		0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
		0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf,
	};

	// Reverse top half, reverse lower half, and swap them.
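	// For example: x = 0b0110'0010 has lookup[0b0010] = 0b0100 and
	// lookup[0b0110] = 0b0110, so the result is (0b0100 << 4) | 0b0110
	// = 0b0100'0110 - the bits of x in reverse order.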
93 | return (lookup[x & 0b1111] << 4) | lookup[x >> 4]; 94 | } 95 | 96 | template <> 97 | inline uint16_t bit_reverse(uint16_t x) 98 | { 99 | return (bit_reverse(x & 0xFF) << 8) | bit_reverse(x >> 8); 100 | } 101 | 102 | 103 | template <> 104 | inline uint32_t bit_reverse(uint32_t x) 105 | { 106 | return (bit_reverse(x & 0xFFFF) << 16) | bit_reverse(x >> 16); 107 | } 108 | 109 | template <> 110 | inline uint64_t bit_reverse(uint64_t x) 111 | { 112 | return (uint64_t{bit_reverse(x & 0xFFFFFFFF)} << 32) | bit_reverse(x >> 32); 113 | } 114 | 115 | 116 | 117 | #endif // CUDA_KAT_TEST_UTIL_CPU_BUILTIN_EQUIVALENTS_HPP_ 118 | -------------------------------------------------------------------------------- /tests/util/macro.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTS_UTIL_MACRO_H_ 2 | #define TESTS_UTIL_MACRO_H_ 3 | 4 | 5 | #if defined(__GNUC__) && __GNUC__ >= 4 6 | #ifndef UNLIKELY 7 | #define LIKELY(x) (__builtin_expect((x), 1)) 8 | #define UNLIKELY(x) (__builtin_expect((x), 0)) 9 | #endif /* UNLIKELY */ 10 | #else /* defined(__GNUC__) && __GNUC__ >= 4 */ 11 | #ifndef UNLIKELY 12 | #define LIKELY(x) (x) 13 | #define UNLIKELY(x) (x) 14 | #endif /* UNLIKELY */ 15 | #endif /* defined(__GNUC__) && __GNUC__ >= 4 */ 16 | 17 | #ifndef UNUSED 18 | #define UNUSED(x) (void) x 19 | #endif 20 | 21 | #define EXPAND(_x) _x 22 | #define QUOTE(_q) #_q 23 | #define STRINGIZE(_q) #_q 24 | 25 | #ifndef CONCATENATE 26 | #define CONCATENATE( s1, s2 ) s1 ## s2 27 | #define EXPAND_THEN_CONCATENATE( s1, s2 ) CONCATENATE( s1, s2 ) 28 | #endif /* CONCATENATE */ 29 | 30 | #define AS_SINGLE_ARGUMENT(...) __VA_ARGS__ 31 | 32 | /** 33 | * This macro expands into a different identifier in every expansion. 34 | * Note that you _can_ clash with an invocation of UNIQUE_IDENTIFIER 35 | * by manually using the same identifier elsewhere; or by carefully 36 | * choosing another prefix etc. 37 | */ 38 | #ifdef __COUNTER__ 39 | #define UNIQUE_IDENTIFIER(prefix) EXPAND_THEN_CONCATENATE(prefix, __COUNTER__) 40 | #else 41 | #define UNIQUE_IDENTIFIER(prefix) EXPAND_THEN_CONCATENATE(prefix, __LINE__) 42 | #endif /* COUNTER */ 43 | 44 | #define COUNT_THIS_LINE static_assert(__COUNTER__ + 1, ""); 45 | #define START_COUNTING_LINES(count_name) enum { EXPAND_THEN_CONCATENATE(count_name,_start) = __COUNTER__ }; 46 | #define FINISH_COUNTING_LINES(count_name) enum { count_name = __COUNTER__ - EXPAND_THEN_CONCATENATE(count_name,_start) - 1 }; 47 | 48 | 49 | ///** 50 | // * This macro expands into a different identifier in every expansion. 51 | // * Note that you _can_ clash with an invocation of UNIQUE_IDENTIFIER 52 | // * by manually using the same identifier elsewhere; or by carefully 53 | // * choosing another prefix etc. 


/**
 * Map macro - applying an arbitrary macro to multiple arguments;
 * based on the discussion and William Swanson's suggestion here:
 * http://stackoverflow.com/q/6707148/1593077
 *
 * Usage example:
 *
 *   #define DO_SOMETHING(x) char const *x##_string = #x;
 *   MAP(DO_SOMETHING, foo, bar, baz)
 *
 * will expand to
 *
 *   char const *foo_string = "foo";
 *   char const *bar_string = "bar";
 *   char const *baz_string = "baz";
 *
 */

#define EVAL0(...) __VA_ARGS__
#define EVAL1(...) EVAL0 (EVAL0 (EVAL0 (__VA_ARGS__)))
#define EVAL2(...) EVAL1 (EVAL1 (EVAL1 (__VA_ARGS__)))
#define EVAL3(...) EVAL2 (EVAL2 (EVAL2 (__VA_ARGS__)))
#define EVAL4(...) EVAL3 (EVAL3 (EVAL3 (__VA_ARGS__)))
#define EVAL(...)  EVAL4 (EVAL4 (EVAL4 (__VA_ARGS__)))

#define MAP_END(...)
#define MAP_OUT

#define MAP_GET_END() 0, MAP_END
#define MAP_NEXT0(test, next, ...) next MAP_OUT
#define MAP_NEXT1(test, next) MAP_NEXT0 (test, next, 0)
#define MAP_NEXT(test, next)  MAP_NEXT1 (MAP_GET_END test, next)

/**
 * Use the third of these macros to apply a unary macro to all other arguments
 * passed, e.g.
 *
 *   #define MY_UNARY(x) call_foo(x, 123)
 *   MAP(MY_UNARY, 456, 789);
 *
 * will expand to
 *
 *   call_foo(456, 123);
 *   call_foo(789, 123);
 *
 */
#define MAP0(f, x, peek, ...) f(x) MAP_NEXT (peek, MAP1) (f, peek, __VA_ARGS__)
#define MAP1(f, x, peek, ...) f(x) MAP_NEXT (peek, MAP0) (f, peek, __VA_ARGS__)
#define MAP(f, ...) EVAL (MAP1 (f, __VA_ARGS__, (), 0))

/**
 * Same as MAP/MAP1/MAP0, but used for macros with pairs of arguments, and
 * specifying the first one
 */
#define MAP_BINARY0(f, fixed_arg, x, peek, ...) f(fixed_arg, x) MAP_NEXT (peek, MAP_BINARY1) (f, fixed_arg, peek, __VA_ARGS__)
#define MAP_BINARY1(f, fixed_arg, x, peek, ...) f(fixed_arg, x) MAP_NEXT (peek, MAP_BINARY0) (f, fixed_arg, peek, __VA_ARGS__)
#define MAP_BINARY(f, fixed_arg, ...) EVAL (MAP_BINARY1 (f, fixed_arg, __VA_ARGS__, (), 0))

/**
 * Same as MAP/MAP1/MAP0, but used for macros with triplets of arguments, and
 * specifying the first and second ones
 */
#define MAP_TRINARY0(f, first_fixed_arg, second_fixed_arg, x, peek, ...) f(first_fixed_arg, second_fixed_arg, x) MAP_NEXT (peek, MAP_TRINARY1) (f, first_fixed_arg, second_fixed_arg, peek, __VA_ARGS__)
#define MAP_TRINARY1(f, first_fixed_arg, second_fixed_arg, x, peek, ...) f(first_fixed_arg, second_fixed_arg, x) MAP_NEXT (peek, MAP_TRINARY0) (f, first_fixed_arg, second_fixed_arg, peek, __VA_ARGS__)
#define MAP_TRINARY(f, first_fixed_arg, second_fixed_arg, ...) EVAL (MAP_TRINARY1 (f, first_fixed_arg, second_fixed_arg, __VA_ARGS__, (), 0))

/**
 * Compile a different piece of code based on compile-time evaluation of a condition;
 * the condition must evaluate to 1 or to 0, exactly, or this will fail.
 *
 * Usage:
 *
 *   IF_ELSE( GCC_VERSION > 4 )(code in case condition holds)(code in case condition fails)
 */

#define IF_ELSE(condition) _IF_ ## condition
#define _IF_1(...) __VA_ARGS__ _IF_1_ELSE
#define _IF_0(...) _IF_0_ELSE

#define _IF_1_ELSE(...)
#define _IF_0_ELSE(...) __VA_ARGS__

/**
 * Use this macro to instantiate tests for all integer types.
 */
#define INTEGER_TYPES \
	char, short, int, long, long long, \
	unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long

// These:
//
//   signed char, signed short, signed int, signed long, signed long long
//
// are the same as:
//
//   char, short, int, long, long long
//
// (strictly speaking, char is a distinct type from signed char, but it behaves
// like either signed char or unsigned char, both of which are listed), and
// these should be covered by the native types:
//
//   int8_t, int16_t, int32_t, int64_t,
//   uint8_t, uint16_t, uint32_t, uint64_t
//
// so the above should be sufficient



/**
 * Use this macro to instantiate tests for all floating-point types.
 */
#define FLOAT_TYPES float, double

#define ARRAY_TYPES_BY_SIZE \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array, \
	kat::array

#define debug_print(x) do { std::cout << STRINGIZE(x) << " = " << x << std::endl; } while(0)



#endif // TESTS_UTIL_MACRO_H_
--------------------------------------------------------------------------------
/tests/util/miscellany.cuh:
--------------------------------------------------------------------------------
#ifndef CUDA_KAT_TEST_MISC_UTILITIES_CUH_
#define CUDA_KAT_TEST_MISC_UTILITIES_CUH_

#include <doctest.h>
#include <cuda/api_wrappers.hpp>

#include <algorithm>
#include <iterator>
#include <type_traits>
#include <climits>
#include <cstddef>
#include <cstdint>

using fake_bool = int8_t; // so as not to have trouble with std::vector<bool>
static_assert(sizeof(bool) == sizeof(fake_bool), "unexpected size mismatch");


template <typename I>
constexpr inline I round_up(I x, I quantum) { return (x % quantum) ? (x + (quantum - (x % quantum))) : x; }

template <typename I>
constexpr inline I round_down(I x, I quantum) { return x - x % quantum; }

template <typename T, std::size_t Length>
constexpr inline std::size_t array_length(const T(&ref)[Length]) { return Length; }

// Should be constexpr - but only beginning in C++20
template <class InputIt>
bool inline all_of(InputIt first, InputIt last)
{
	static_assert(std::is_same<typename std::iterator_traits<InputIt>::value_type, bool>::value, "This function is intended for boolean-valued sequences only");
	return std::all_of(first, last, [](bool b) { return b; });
}

// Should be constexpr - but only beginning in C++20
template <typename Container>
bool all_of(const Container& c)
{
	static_assert(std::is_same<typename Container::value_type, bool>::value, "This function is intended for boolean-valued sequences only");
	return std::all_of(std::cbegin(c), std::cend(c), [](bool b) { return b; });
}

// Code for is_iterator lifted from:
// https://stackoverflow.com/a/12032923/1593077
template <typename T, typename = void>
struct is_iterator
{
	static constexpr bool value = false;
};

template <typename T>
struct is_iterator<T, typename std::enable_if<!std::is_same<typename std::iterator_traits<T>::value_type, void>::value>::type>
{
	static constexpr bool value = true;
};
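
// For instance (a hypothetical illustration, not part of the original):
//
//   static_assert(    is_iterator<int*>::value, "");
//   static_assert(not is_iterator<void*>::value, "");  // its value_type would be void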

/**
 * Use these next few types to make assertions regarding each member
 * of a template parameter pack, e.g.
 *
 *   static_assert(all_true<(Numbers == 0 || Numbers == 1)...>::value, "");
 *
 */
template <bool...> struct bool_pack;
template <bool... v>
using all_true = std::is_same<bool_pack<true, v...>, bool_pack<v..., true>>;

template <typename T>
constexpr inline std::size_t size_in_bits() { return sizeof(T) * CHAR_BIT; }
template <typename T>
constexpr inline std::size_t size_in_bits(const T&) { return sizeof(T) * CHAR_BIT; }


/**
 * Divides the left-hand-side by the right-hand-side, rounding up
 * to an integral result, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3.
 *
 * @param dividend the number to divide
 * @param divisor the number by which to divide
 * @return the least integer which is greater-or-equal to the exact
 * (non-integral) quotient dividend/divisor.
 *
 * @note sensitive to overflow, i.e. if dividend > std::numeric_limits<S>::max() - divisor,
 * the result will be incorrect
 */
template <typename S, typename T>
constexpr inline S div_rounding_up(const S& dividend, const T& divisor) {
	return (dividend + divisor - 1) / divisor;
	/*
	std::div_t div_result = std::div(dividend, divisor);
	return div_result.quot + !(!div_result.rem);
	*/
}

// C++14 version of [[maybe_unused]] ...
template <typename T>
inline void ignore(T &&) { }

namespace doctest {

const char* current_test_name() { return doctest::detail::g_cs->currentTest->m_name; }

} // namespace doctest

// #ifdef __GNUC__
template <typename T>
[[gnu::warning("Artificial warning to print a type name - please ignore")]]
inline void print_type() noexcept { return; }

template <typename T>
[[gnu::warning("Artificial warning to print a type name - please ignore")]]
inline void print_type_of(T&& x) noexcept { return; }
// #endif

namespace kernels {

template <typename T, typename Size>
__global__ void fill(T* buffer, T value, Size length)
{
	// essentially, grid-level fill
	Size num_grid_threads = blockDim.x * gridDim.x;
	for(Size pos = threadIdx.x + blockIdx.x * blockDim.x;
		pos < length;
		pos += num_grid_threads)
	{
		buffer[pos] = value;
	}
}

} // namespace kernels

cuda::launch_configuration_t
make_busy_config(cuda::device_t& device) {
	auto prop = device.properties();
	auto sm_busy_factor = 2;
	auto num_blocks = prop.multiProcessorCount * sm_busy_factor;
	auto block_busy_factor = 4; // probably not the right number
	auto num_threads_per_block = cuda::warp_size * block_busy_factor;
	return cuda::make_launch_config(num_blocks, num_threads_per_block);
}

inline constexpr cuda::launch_configuration_t single_thread_launch_config() noexcept
{
	return { cuda::grid::dimensions_t::point(), cuda::grid::dimensions_t::point() };
}

// Poor man's addressof
template <typename T>
T* addressof(T& arg)
{
	return reinterpret_cast<T*>(
		&const_cast<char&>(reinterpret_cast<const volatile char&>(arg)));
}
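
// Note: the cast chain above mirrors the classic std::addressof trick - going
// through a char reference obtains the real address even when T has an
// overloaded operator&, and the const_cast lets it work for cv-qualified T.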


#endif /* CUDA_KAT_TEST_MISC_UTILITIES_CUH_ */
--------------------------------------------------------------------------------
/tests/util/poor_mans_constexpr_string.hpp:
--------------------------------------------------------------------------------
#ifndef CUDA_KAT_TEST_POOR_MANS_CONSTEXPR_STRING_HPP_
#define CUDA_KAT_TEST_POOR_MANS_CONSTEXPR_STRING_HPP_

#include <cstddef>
#include <stdexcept>
#include <ostream>

///@cond
#include <kat/detail/execution_space_specifiers.hpp>
///@endcond

# if __cplusplus < 201103
#  error "C++11 or later required"
# elif __cplusplus < 201402
#  define CONSTEXPR14_TN
# else
#  define CONSTEXPR14_TN constexpr
# endif

namespace util {

class constexpr_string
{
	const char* const  p_;
	const std::size_t  sz_;

public:
	typedef const char* const_iterator;

	template <std::size_t N>
	constexpr KAT_FHD constexpr_string(const char(&a)[N]) noexcept
	: p_(a)
	, sz_(N-1)
	{}

	constexpr KAT_FHD constexpr_string(const char* p, std::size_t N) noexcept
	: p_(p)
	, sz_(N)
	{}

	constexpr KAT_FHD const char* data() const noexcept {return p_;}
	constexpr KAT_FHD std::size_t size() const noexcept {return sz_;}

	constexpr KAT_FHD const_iterator begin() const noexcept {return p_;}
	constexpr KAT_FHD const_iterator end()   const noexcept {return p_ + sz_;}

	constexpr KAT_FHD char operator[](std::size_t n) const
	{
		return n < sz_ ? p_[n] :
#ifdef __CUDA_ARCH__
			0;
#else
			throw std::out_of_range("constexpr_string");
#endif
	}
};

KAT_FHD
std::ostream&
operator<<(std::ostream& os, constexpr_string const& s)
{
	return os.write(s.data(), s.size());
}

} // namespace util
#include 

#endif // CUDA_KAT_TEST_POOR_MANS_CONSTEXPR_STRING_HPP_
--------------------------------------------------------------------------------
/tests/util/printing.hpp:
--------------------------------------------------------------------------------
#ifndef CUDA_KAT_TEST_UTILS_PRINTING_HPP_
#define CUDA_KAT_TEST_UTILS_PRINTING_HPP_

#include <cuda/api_wrappers.hpp>
#include <ostream>
#include <iomanip>
#include <string>
#include <cstdint>

namespace detail {
template <typename ToBeStreamed>
struct promoted_for_streaming { using type = ToBeStreamed; };
template<> struct promoted_for_streaming<char>          { using type = short;          };
template<> struct promoted_for_streaming<signed char>   { using type = signed short;   };
template<> struct promoted_for_streaming<unsigned char> { using type = unsigned short; };

} // namespace detail
/*
 * iostream streams have a tendency to try to outsmart you w.r.t. char or
 * unsigned char data - they assume you're really passing ISO-8859-1 code
 * points rather than integral values, and will print accordingly. Using this
 * generic promoter, you can avoid that.
 */
template <typename ToBeStreamed>
typename detail::promoted_for_streaming<ToBeStreamed>::type promote_for_streaming(const ToBeStreamed& tbs)
{
	return static_cast<typename detail::promoted_for_streaming<ToBeStreamed>::type>(tbs);
}
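
// For instance (a hypothetical example, not part of the original):
//
//   uint8_t x = 65;
//   std::cout << x;                        // prints the character 'A'
//   std::cout << promote_for_streaming(x); // prints the number 65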

inline const char* ordinal_suffix(int n)
{
	static const char suffixes [4][5] = {"th", "st", "nd", "rd"};
	auto ord = n % 100;
	if (ord / 10 == 1) { ord = 0; }
	ord = ord % 10;
	return suffixes[ord > 3 ? 0 : ord];
}

// cuda-api-wrappers-related utilities

template <typename N>
inline std::string xth(N n) { return std::to_string(n) + ordinal_suffix(n); }

std::ostream& operator<<(std::ostream& os, cuda::grid::dimensions_t dims)
{
	return os << '(' << dims.x << "," << dims.y << "," << dims.z << ')';
}

std::ostream& operator<<(std::ostream& os, cuda::launch_configuration_t lc)
{
	return os
		<< "grid x block dimensions = " << lc.grid_dimensions << " x " << lc.block_dimensions << ", "
		<< lc.dynamic_shared_memory_size << " bytes dynamic shared memory" << '\n';
}

#ifdef __SIZEOF_INT128__

// always printed in hex!

std::ostream& operator<<(std::ostream& os, __uint128_t x)
{
	return os << "uint128_t{0x" << std::hex << uint64_t(x >> 64)
		<< std::setw(16) << std::setfill('0') << uint64_t(x & ~uint64_t{0})
		<< std::setfill(' ') << std::dec << '}';
}

std::ostream& operator<<(std::ostream& os, __int128_t x)
{
	auto sign = x < 0 ? '-' : ' ';
	auto magnitude = x < 0 ? -x : x;
	return os << "int128_t{" << sign << "0x" << std::hex << uint64_t(magnitude >> 64)
		<< std::setw(16) << std::setfill('0') << uint64_t(magnitude & ~uint64_t{0})
		<< std::setfill(' ') << std::dec << '}';
}
#endif


#endif // CUDA_KAT_TEST_UTILS_PRINTING_HPP_
--------------------------------------------------------------------------------
/tests/util/random.cu:
--------------------------------------------------------------------------------

#include "random.hpp"

namespace util {
namespace random {
std::random_device device; // Note this is a callable object.
std::default_random_engine engine(device());
} // namespace random
} // namespace util


--------------------------------------------------------------------------------
/tests/util/random.hpp:
--------------------------------------------------------------------------------
#pragma once
#ifndef CUDA_KAT_TEST_UTILITIES_RANDOM_H_
#define CUDA_KAT_TEST_UTILITIES_RANDOM_H_

/************************************************************
 *
 * Simplistic and non-thread-safe random number generation
 * convenience utility - based on the C++ standard library.
 *
 * If you need to do something serious with random numbers,
 * don't use this; if you just want a bunch of random-looking
 * numbers quick & dirty, do use it.
 *
 ************************************************************/

#include <random>
#include <algorithm>
#include <iterator>
#include <type_traits>
#include <unordered_set>
#include <stdexcept>
#include <cstddef>

namespace util {

namespace random {


extern std::random_device device; // Note this is a callable object.
extern std::default_random_engine engine;

using result_t = decltype(engine)::result_type;
using seed_t = result_t;

template <typename T>
using uniform_distribution = std::conditional_t<
	std::is_floating_point<T>::value,
	std::uniform_real_distribution<T>,
	std::uniform_int_distribution<T>
>;



/*
// TODO: Does the distribution object actually remain constant? I wonder.
// Should I return an rvalue reference?
template <typename Distribution>
inline typename Distribution::result_type sample_from(Distribution& distribution) {
	return distribution(engine);
}
*/

template <typename Distribution, typename Engine = std::default_random_engine>
inline typename Distribution::result_type sample_from(
	Distribution&  distribution,
	Engine&        engine = util::random::engine)
{
	return distribution(engine);
}

inline void seed(const seed_t& seed_value)
{
	engine.seed(seed_value);
}

/* In your code, do something like:

	const int rangeMin = 1;
	const int rangeMax = 10;
	std::uniform_int_distribution<int> distribution(rangeMin, rangeMax);
	// util::random::seed(std::time(0)); // seed with the current time
	auto a = util::random::sample_from(distribution);
	cout << "A random integer between " << rangeMin << " and " << rangeMax << " for you: "
		<< util::random::sample_from(distribution) << '\n';

*/

// Some more examples of distributions:
//std::uniform_int_distribution<unsigned> uint_dist;          // by default range [0, MAX]
//std::uniform_int_distribution<unsigned> uint_dist10(0,10);  // range [0,10]
//std::normal_distribution<double> normal_dist(mean, stddeviation);  // N(mean, stddeviation)

template <typename ForwardIt, typename Distribution, typename Engine = std::default_random_engine>
constexpr inline void generate(
	ForwardIt      first,
	ForwardIt      last,
	Distribution&  distribution,
	Engine&        engine = util::random::engine)
{
	// If we could rely on having C++17, we could generate in parallel...
	std::generate(first, last, [&distribution, &engine]() {
		return static_cast<typename std::iterator_traits<ForwardIt>::value_type>(sample_from(distribution, engine));
	});
}
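
/* A hypothetical usage sketch (not part of the original):

	std::vector<float> values(1024);
	util::random::uniform_distribution<float> distribution { 0.0f, 1.0f };
	util::random::generate(values.begin(), values.end(), distribution);

*/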

template <typename ForwardIt, typename Size, typename Distribution, typename Engine = std::default_random_engine>
constexpr inline void generate_n(
	ForwardIt      first,
	Size           count,
	Distribution&  distribution,
	Engine&        engine = util::random::engine)
{
	// static_assert(is_iterator<ForwardIt>::value == true, "The 'first' parameter is not of an iterator type");
	// If we could rely on having C++17, we could generate in parallel...
	return generate(first, first + count, distribution, engine);
}

template <typename Inserter, typename Size, typename Distribution, typename Engine = std::default_random_engine>
constexpr inline void insertion_generate_n(
	Inserter       inserter,
	Size           count,
	Distribution&  distribution,
	Engine&        engine = util::random::engine)
{
	for(size_t i = 0; i < count; i++) {
		*(inserter++) = sample_from(distribution, engine);
	}
}

template <typename Inserter, typename Size, typename Distribution, typename Engine = std::default_random_engine>
constexpr inline void insertion_generate_n(
	Inserter        inserter,
	Size            count,
	Distribution&&  distribution,
	Engine&         engine = util::random::engine)
{
	insertion_generate_n(inserter, count, distribution, engine);
}

template <typename RandomAccessIterator, typename Size, typename Engine = std::default_random_engine>
constexpr inline std::unordered_set<typename std::iterator_traits<RandomAccessIterator>::value_type>
sample_subset(
	RandomAccessIterator  begin,
	RandomAccessIterator  end,
	Size                  subset_size,
	Engine&               engine = util::random::engine)
{
	std::unordered_set<typename std::iterator_traits<RandomAccessIterator>::value_type> sampled_subset{};
	std::uniform_int_distribution<std::ptrdiff_t> distribution {0, (end - begin) - 1};
	while(sampled_subset.size() < subset_size) {
		auto sampled_element_index = util::random::sample_from(distribution, engine);
		sampled_subset.insert(*(begin + sampled_element_index));
	}
	return sampled_subset;
}
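
// Hypothetical usage (not part of the original): pick 3 distinct values
// out of a 5-element range:
//
//   int data[] { 10, 20, 30, 40, 50 };
//   auto chosen = util::random::sample_subset(std::begin(data), std::end(data), 3u);
//
// Note that unlike the overloads below, this overload does not guard against
// subset_size exceeding the number of available distinct elements - in that
// case it would loop forever.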

template <typename RandomAccessIterator, typename Size, typename Engine = std::default_random_engine>
constexpr inline std::unordered_set<typename std::iterator_traits<RandomAccessIterator>::value_type>
sample_subset(
	RandomAccessIterator  begin,
	Size                  domain_length,
	Size                  subset_size,
	Engine&               engine = util::random::engine)
{
	if (domain_length < subset_size) { throw std::invalid_argument("Can't sample a subset larger than the domain"); }
	std::unordered_set<typename std::iterator_traits<RandomAccessIterator>::value_type> sampled_subset{};
	if (domain_length == 0) {
		if (subset_size == 0) { return sampled_subset; }
		throw std::invalid_argument("Can't sample a subset larger than the domain");
	}
	std::uniform_int_distribution<Size> distribution {0, domain_length - 1};
	// TODO: If we need to sample more than half the domain, sample the elements _outside_ the set instead.
	while(sampled_subset.size() < subset_size) {
		auto sampled_element_index = util::random::sample_from(distribution, engine);
		sampled_subset.insert(*(begin + sampled_element_index));
	}
	return sampled_subset;
}

template <typename Size, typename Engine = std::default_random_engine>
constexpr inline std::unordered_set<Size>
sample_index_subset(
	Size     domain_length,
	Size     subset_size,
	Engine&  engine = util::random::engine)
{
	if (domain_length < subset_size) { throw std::invalid_argument("Can't sample a subset larger than the domain"); }
	std::unordered_set<Size> sampled_subset{};
	if (domain_length == 0) {
		if (subset_size == 0) { return sampled_subset; }
		throw std::invalid_argument("Can't sample a subset larger than the domain");
	}
	std::uniform_int_distribution<Size> distribution {0, domain_length - 1};
	// TODO: If we need to sample more than half the domain, sample the elements _outside_ the set instead.
	while(sampled_subset.size() < subset_size) {
		sampled_subset.insert(util::random::sample_from(distribution, engine));
	}
	return sampled_subset;
}

} // namespace random
} // namespace util

#endif /* CUDA_KAT_TEST_UTILITIES_RANDOM_H_ */

--------------------------------------------------------------------------------
/tests/util/type_name.hpp:
--------------------------------------------------------------------------------
#pragma once
#ifndef UTIL_TYPE_NAME_HPP_
#define UTIL_TYPE_NAME_HPP_

#include "poor_mans_constexpr_string.hpp"
#include <string>
#include <memory>
#include <tuple>
#ifndef _MSC_VER
#include <cxxabi.h>
#endif
#include <cstdlib>
#include <type_traits>
#include <typeinfo>


///@cond
#include 
///@endcond

namespace util {

template <typename T>
CONSTEXPR14_TN KAT_HD constexpr_string type_name()
{
#ifdef __clang__
	constexpr_string p = __PRETTY_FUNCTION__;
	return constexpr_string(p.data() + 31, p.size() - 31 - 1);
//#elif defined(__CUDA_ARCH__)
//	constexpr_string p = __PRETTY_FUNCTION__;
//	return constexpr_string(p.data(), p.size());
#elif defined(__CUDACC__)
	constexpr_string p = __PRETTY_FUNCTION__;
#	if __cplusplus < 201402 || defined(__CUDA_ARCH__)
	return constexpr_string(p.data() + 51, p.size() - 51 - 1); // 51 is the length of "util::constexpr_string util::type_name() [with T = "
#	else
	return constexpr_string(p.data() + 61, p.size() - 61 - 1); // 61 is the length of "constexpr util::constexpr_string util::type_name() [with T = "
#	endif
#elif defined(__GNUC__)
	constexpr_string p = __PRETTY_FUNCTION__;
#	if __cplusplus < 201402
	return constexpr_string(p.data() + 36, p.size() - 36 - 1);
#	else
	return constexpr_string(p.data() + 46, p.size() - 46 - 1);
#	endif
#elif defined(_MSC_VER)
	constexpr_string p = __FUNCSIG__;
	return constexpr_string(p.data() + 38, p.size() - 38 - 7);
#endif
}

/*template <typename T>
CONSTEXPR14_TN KAT_FHD constexpr_string type_name(T&&)
{
	return type_name<T>();
}

template <typename T>
CONSTEXPR14_TN KAT_FHD constexpr_string type_name(const T&)
{
	return type_name<T>();
}*/


/**
 * A function for obtaining the string name
 * of a type, using that actual type at compile-time.
 * (The function might have been constexpr, but I doubt
 * so much is acceptable at compile time.) This is an
 * alternative to using type_info.name() which also
 * preserves CV qualifiers (const, volatile, reference,
 * rvalue-reference)
 *
 * The code was copied from this StackOverflow answer:
 * http://stackoverflow.com/a/20170989/1593077
 * due to Howard Hinnant
 * ... with some slight modifications by Eyal Rozenberg
 */


template <typename T, bool WithCVCorrections = false>
std::string type_name_()
{
	typedef typename std::remove_reference<T>::type TR;

	std::unique_ptr<char, void(*)(void*)> own(
#ifndef _MSC_VER
		abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr),
#else
		nullptr,
#endif
		std::free
	);
	std::string r = (own != nullptr) ? own.get() : typeid(TR).name();
	if (WithCVCorrections) {
		if (std::is_const<TR>::value)
			r += " const";
		if (std::is_volatile<TR>::value)
			r += " volatile";
		if (std::is_lvalue_reference<T>::value)
			r += "&";
		else if (std::is_rvalue_reference<T>::value)
			r += "&&";
	}
	return r;
}

/**
 * This is a convenience function, so that instead of
 *
 *   util::type_name_<decltype(my_value)>()
 *
 * you could use:
 *
 *   util::type_name_of(my_value)
 *
 * @param v a value which is only passed to indicate a type
 * @return the string type name of typeof(v)
 */
template <typename T>
std::string type_name_of(const T& v) { return util::type_name_<T>(); }


template <typename... Ts>
auto type_names_() -> decltype(std::make_tuple(type_name_<Ts>()...))
{ return std::make_tuple(type_name_<Ts>()...); }


/**
 * Removes the trailing template parameter listing from a type name, e.g.
 *
 *   foo::bar<short, 4>
 *
 * becomes
 *
 *   foo::bar
 *
 * This is not such a useful function, as "int bar(double x)" will
 * become "int bar". So - fix it.
 *
 * @param type_name the name of a type, preferably obtained with
 * util::type_info
 * @return the template-less type name, or the original type name if
 * we could not find anything to remove (doesn't throw)
 */
inline std::string discard_template_parameters(const std::string& type_name)
{
	auto template_rbracket_pos = type_name.rfind('>');
	if (template_rbracket_pos == std::string::npos) {
		return type_name;
	}
	unsigned bracket_depth = 1;
	for (unsigned pos = template_rbracket_pos - 1; pos > 0; pos--) {
		switch(type_name[pos]) {
			case '>': bracket_depth++; break;
			case '<': bracket_depth--; break;
		}
		if (bracket_depth == 0) return type_name.substr(0, pos);
	}
	return type_name;
}
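
// For example (a hypothetical call, not in the original):
//
//   discard_template_parameters("std::vector<int>")  // yields "std::vector"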

} /* namespace util */

#include 

#endif /* UTIL_TYPE_NAME_HPP_ */
--------------------------------------------------------------------------------
/tests/util/woodruff_int128_t.hpp:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>

struct int128_t final
{
	int128_t() = default;
	constexpr int128_t(const int64_t high_, const uint64_t low_) : high(high_), low(low_) {}
	constexpr int128_t(const int64_t v) : high(v < 0 ? 0xfffffffffffffffflu : 0), low(v) {}

	explicit constexpr operator int64_t() const { return static_cast<int64_t>(low); }

	constexpr operator bool() const { return low || high; }

	int64_t  high;
	uint64_t low;
};

inline constexpr bool operator<(const int128_t l, const int128_t r)
{
	return l.high < r.high || (l.high == r.high && l.low < r.low);
}

inline constexpr bool operator<=(const int128_t l, const int128_t r)
{
	return l.high < r.high || (l.high == r.high && l.low <= r.low);
}

inline constexpr bool operator>(const int128_t l, const int128_t r)
{
	return l.high > r.high || (l.high == r.high && l.low > r.low);
}

inline constexpr bool operator>=(const int128_t l, const int128_t r)
{
	return l.high > r.high || (l.high == r.high && l.low >= r.low);
}

inline constexpr bool operator==(const int128_t l, const int128_t r)
{
	return l.low == r.low && l.high == r.high;
}

inline constexpr bool operator!=(const int128_t l, const int128_t r)
{
	return l.low != r.low || l.high != r.high;
}

inline constexpr int128_t operator+(const int128_t l, const int128_t r)
{
	int128_t result{l.high + r.high, l.low + r.low};
	if (result.low < l.low)
	{
		++result.high;
	}
	return result;
}

inline constexpr int128_t operator-(const int128_t l, const int128_t r)
{
	int128_t result{l.high - r.high, l.low - r.low};
	if (result.low > l.low)
	{
		--result.high;
	}
	return result;
}

inline constexpr int128_t operator*(const int128_t l, const int128_t r)
{
	int128_t result{static_cast<int64_t>((l.low >> 32) * (r.low >> 32)), (l.low & 0xffffffff) * (r.low & 0xffffffff)};
	{
		const uint64_t m12 = (l.low & 0xffffffff) * (r.low >> 32);
		{
			const uint64_t m12_l = (m12 & 0xffffffff) << 32;
			const uint64_t old_low = result.low;
			result.low += m12_l;
			if (result.low < old_low)
			{
				++result.high;
			}
			result.high += (m12 >> 32);

		}
	}
	{
		const uint64_t m21 = (l.low >> 32) * (r.low & 0xffffffff);
		{
			const uint64_t m21_l = (m21 & 0xffffffff) << 32;
			const uint64_t old_low = result.low;
			result.low += m21_l;
			if (result.low < old_low)
			{
				++result.high;
			}
			result.high += static_cast<int64_t>(m21 >> 32);
		}
	}
	result.high +=
		static_cast<int64_t>(
			(l.low & 0xffffffff) * (static_cast<uint64_t>(r.high) & 0xffffffff) +
			(static_cast<uint64_t>(l.high) & 0xffffffff) * (r.low & 0xffffffff) +
			(((l.low & 0xffffffff) * (static_cast<uint64_t>(r.high) >> 32)) << 32) +
			(((static_cast<uint64_t>(l.high) >> 32) * (r.low & 0xffffffff)) << 32) +
			(((l.low >> 32) * (static_cast<uint64_t>(r.high) & 0xffffffff)) << 32) +
			(((static_cast<uint64_t>(l.high) & 0xffffffff) * (r.low >> 32)) << 32));

	return result;
}
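
// (Explanatory note: the above is schoolbook long multiplication on 32-bit
// limbs - the partial products of the two low words feed `low`, with carries
// detected via unsigned wrap-around, and every partial product whose value
// lands at or above bit 64 is accumulated directly into `high`.)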

/*
inline constexpr int128_t operator/(const int128_t l, const int128_t r)
{
	//! \todo implement
	return l;
}

inline constexpr int128_t operator%(const int128_t l, const int128_t r)
{
	//! \todo implement
	return l;
}
*/

inline constexpr int128_t operator~(const int128_t v)
{
	return int128_t{~v.high, ~v.low};
}

inline constexpr int128_t operator&(const int128_t l, const int128_t r)
{
	return int128_t{l.high & r.high, l.low & r.low};
}

inline constexpr int128_t operator|(const int128_t l, const int128_t r)
{
	return int128_t{l.high | r.high, l.low | r.low};
}

inline constexpr int128_t operator^(const int128_t l, const int128_t r)
{
	return int128_t{l.high ^ r.high, l.low ^ r.low};
}

inline constexpr int128_t operator<<(const int128_t v, const unsigned s)
{
	if (s >= 64)
	{
		return {static_cast<int64_t>(v.low) << (s - 64), 0};
	}
	return {v.high << s | static_cast<int64_t>(v.low >> (64 - s)), v.low << s};
}

inline constexpr int128_t operator>>(const int128_t v, const unsigned s)
{
	if (s >= 64)
	{
		// arithmetic shift: the high word gets filled with the sign bit
		return {v.high >> 63, static_cast<uint64_t>(v.high >> (s - 64))};
	}
	return {v.high >> s, static_cast<uint64_t>(v.high) << (64 - s) | v.low >> s};
}

inline constexpr int128_t & operator++(int128_t & v)
{
	++v.low;
	if (!v.low)
	{
		++v.high;
	}
	return v;
}

inline constexpr int128_t & operator--(int128_t & v)
{
	if (!v.low)
	{
		--v.high;
	}
	--v.low;
	return v;
}

inline constexpr int128_t operator++(int128_t & v, int)
{
	int128_t r = v;
	++v;
	return r;
}

inline constexpr int128_t operator--(int128_t & v, int)
{
	int128_t r = v;
	--v;
	return r;
}

inline constexpr int128_t operator-(const int128_t v)
{
	int128_t result{~v.high, ~v.low};
	++result;
	return result;
}

inline constexpr int128_t & operator+=(int128_t & l, const int128_t r)
{
	const uint64_t low = l.low;
	l.low += r.low;
	l.high += r.high;
	if (l.low < low)
	{
		++l.high;
	}
	return l;
}

inline constexpr int128_t & operator-=(int128_t & l, const int128_t r)
{
	const uint64_t low = l.low;
	l.low -= r.low;
	l.high -= r.high;
	if (l.low > low)
	{
		--l.high;
	}
	return l;
}

inline constexpr int128_t & operator*=(int128_t & l, const int128_t r)
{
	l = l * r;
	return l;
}

/*
inline constexpr int128_t & operator/=(int128_t & l, const int128_t r)
{
	//! \todo implement
	return l;
}

inline constexpr int128_t & operator%=(int128_t & l, const int128_t r)
{
	//! \todo implement
	return l;
}
*/

inline constexpr int128_t & operator&=(int128_t & l, const int128_t r)
{
	l.high &= r.high;
	l.low  &= r.low;
	return l;
}

inline constexpr int128_t & operator|=(int128_t & l, const int128_t r)
{
	l.high |= r.high;
	l.low  |= r.low;
	return l;
}

inline constexpr int128_t & operator^=(int128_t & l, const int128_t r)
{
	l.high ^= r.high;
	l.low  ^= r.low;
	return l;
}

inline constexpr int128_t & operator<<=(int128_t & v, const unsigned s)
{
	if (s >= 64)
	{
		v.high = v.low << (s - 64);
		v.low = 0;
	}
	else
	{
		v.high = v.high << s | v.low >> (64 - s);
		v.low <<= s;
	}
	return v;
}

inline constexpr int128_t & operator>>=(int128_t & v, const unsigned s)
{
	if (s >= 64)
	{
		v.low = v.high >> (s - 64);
		v.high >>= 63; // fill with the sign bit
	}
	else
	{
		v.low = static_cast<uint64_t>(v.high) << (64 - s) | v.low >> s;
		v.high >>= s;
	}
	return v;
}
--------------------------------------------------------------------------------
/tests/util/woodruff_uint128_t.hpp:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>

struct uint128_t final
{
	uint128_t() = default;
	constexpr uint128_t(const uint64_t high_, const uint64_t low_) : high(high_), low(low_) {}
	constexpr uint128_t(const uint64_t v) : high(0), low(v) {}

	explicit constexpr operator uint64_t() const { return low; }

	constexpr operator bool() const { return low || high; }

	uint64_t high;
	uint64_t low;
};

inline constexpr bool operator<(const uint128_t l, const uint128_t r)
{
	return l.high < r.high || (l.high == r.high && l.low < r.low);
}

inline constexpr bool operator<=(const uint128_t l, const uint128_t r)
{
	return l.high < r.high || (l.high == r.high && l.low <= r.low);
}

inline constexpr bool operator>(const uint128_t l, const uint128_t r)
{
	return l.high > r.high || (l.high == r.high && l.low > r.low);
}

inline constexpr bool operator>=(const uint128_t l, const uint128_t r)
{
	return l.high > r.high || (l.high == r.high && l.low >= r.low);
}

inline constexpr bool operator==(const uint128_t l, const uint128_t r)
{
	return l.low == r.low && l.high == r.high;
}

inline constexpr bool operator!=(const uint128_t l, const uint128_t r)
{
	return l.low != r.low || l.high != r.high;
}

inline constexpr uint128_t operator+(const uint128_t l, const uint128_t r)
{
	uint128_t result{l.high + r.high, l.low + r.low};
	if (result.low < l.low)
	{
		++result.high;
	}
	return result;
}

inline constexpr uint128_t operator-(const uint128_t l, const uint128_t r)
{
	uint128_t result{l.high - r.high, l.low - r.low};
	if (result.low > l.low)
	{
		--result.high;
	}
	return result;
}

inline constexpr uint128_t operator*(const uint128_t l, const uint128_t r)
{
	uint128_t result{(l.low >> 32) * (r.low >> 32), (l.low & 0xffffffff) * (r.low & 0xffffffff)};
	{
		const uint64_t m12 = (l.low & 0xffffffff) * (r.low >> 32);
		{
			const uint64_t m12_l = (m12 & 0xffffffff) << 32;
			const uint64_t old_low = result.low;
			result.low += m12_l;
			if (result.low < old_low)
			{
				++result.high;
			}
			result.high += (m12 >> 32);

		}
	}
	{
		const uint64_t m21 = (l.low >> 32) * (r.low & 0xffffffff);
		{
			const uint64_t m21_l = (m21 & 0xffffffff) << 32;
			const uint64_t old_low = result.low;
			result.low += m21_l;
			if (result.low < old_low)
			{
				++result.high;
			}
			result.high += (m21 >> 32);
		}
	}
	result.high +=
		(l.low & 0xffffffff) * (r.high & 0xffffffff) +
		(l.high & 0xffffffff) * (r.low & 0xffffffff) +
		(((l.low & 0xffffffff) * (r.high >> 32)) << 32) +
		(((l.high >> 32) * (r.low & 0xffffffff)) << 32) +
		(((l.low >> 32) * (r.high & 0xffffffff)) << 32) +
		(((l.high & 0xffffffff) * (r.low >> 32)) << 32);

	return result;
}

/*
inline constexpr uint128_t operator/(const uint128_t l, const uint128_t r)
{
	//! \todo implement
	return {};
}

inline constexpr uint128_t operator%(const uint128_t l, const uint128_t r)
{
	//! \todo implement
	return {};
}
*/

inline constexpr uint128_t operator~(const uint128_t v)
{
	return uint128_t{~v.high, ~v.low};
}

inline constexpr uint128_t operator&(const uint128_t l, const uint128_t r)
{
	return uint128_t{l.high & r.high, l.low & r.low};
}

inline constexpr uint128_t operator|(const uint128_t l, const uint128_t r)
{
	return uint128_t{l.high | r.high, l.low | r.low};
}

inline constexpr uint128_t operator^(const uint128_t l, const uint128_t r)
{
	return uint128_t{l.high ^ r.high, l.low ^ r.low};
}

inline constexpr uint128_t operator<<(const uint128_t v, const unsigned s)
{
	if (s >= 64)
	{
		return {v.low << (s - 64), 0};
	}
	return {v.high << s | v.low >> (64 - s), v.low << s};
}

inline constexpr uint128_t operator>>(const uint128_t v, const unsigned s)
{
	if (s >= 64)
	{
		return {0, v.high >> (s - 64)};
	}
	return {v.high >> s, v.high << (64 - s) | v.low >> s};
}

inline constexpr uint128_t & operator++(uint128_t & v)
{
	++v.low;
	if (!v.low)
	{
		++v.high;
	}
	return v;
}

inline constexpr uint128_t & operator--(uint128_t & v)
{
	if (!v.low)
	{
		--v.high;
	}
	--v.low;
	return v;
}

inline constexpr uint128_t operator++(uint128_t & v, int)
{
	uint128_t r = v;
	++v;
	return r;
}

inline constexpr uint128_t operator--(uint128_t & v, int)
{
	uint128_t r = v;
	--v;
	return r;
}

inline constexpr uint128_t & operator+=(uint128_t & l, const uint128_t r)
{
	const uint64_t low = l.low;
	l.low += r.low;
	l.high += r.high;
	if (l.low < low)
	{
		++l.high;
	}
	return l;
}

inline constexpr uint128_t & operator-=(uint128_t & l, const uint128_t r)
{
	const uint64_t low = l.low;
	l.low -= r.low;
	l.high -= r.high;
	if (l.low > low)
	{
		--l.high;
	}
	return l;
}

inline constexpr uint128_t & operator*=(uint128_t & l, const uint128_t r)
{
	l = l * r;
	return l;
}
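
// (A note on the unimplemented division operators below: one workable approach
// would be binary long division - left-shift the divisor up towards the
// dividend's magnitude, then walk back down, subtracting and setting quotient
// bits. Sketched only; not implemented or tested here.)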

/*
inline constexpr uint128_t & operator/=(uint128_t & l, const uint128_t r)
{
	//! \todo implement
	return l;
}

inline constexpr uint128_t & operator%=(uint128_t & l, const uint128_t r)
{
	//! \todo implement
	return l;
}
*/

inline constexpr uint128_t & operator&=(uint128_t & l, const uint128_t r)
{
	l.high &= r.high;
	l.low &= r.low;
	return l;
}

inline constexpr uint128_t & operator|=(uint128_t & l, const uint128_t r)
{
	l.high |= r.high;
	l.low |= r.low;
	return l;
}

inline constexpr uint128_t & operator^=(uint128_t & l, const uint128_t r)
{
	l.high ^= r.high;
	l.low ^= r.low;
	return l;
}

inline constexpr uint128_t & operator<<=(uint128_t & v, const unsigned s)
{
	if (s >= 64)
	{
		v.high = v.low << (s - 64);
		v.low = 0;
	}
	else
	{
		v.high = v.high << s | v.low >> (64 - s);
		v.low <<= s;
	}
	return v;
}

inline constexpr uint128_t & operator>>=(uint128_t & v, const unsigned s)
{
	if (s >= 64)
	{
		v.low = v.high >> (s - 64);
		v.high = 0;
	}
	else
	{
		v.low = v.high << (64 - s) | v.low >> s;
		v.high >>= s;
	}
	return v;
}
--------------------------------------------------------------------------------