├── .gitignore
├── CMakeLists.txt
├── CONTRIBUTORS.txt
├── LICENSE.TXT
├── README.md
├── cmake
│   ├── anydsl_runtime-config.cmake.in
│   ├── build_xilinx_hls.cmake.in
│   ├── check_nvvmir.cmake
│   └── modules
│       ├── FindLevelZero.cmake
│       └── FindXHLS.cmake
├── platforms
│   ├── artic
│   │   ├── intrinsics.impala
│   │   ├── intrinsics_amdgpu.impala
│   │   ├── intrinsics_cpu.impala
│   │   ├── intrinsics_cuda.impala
│   │   ├── intrinsics_hls.impala
│   │   ├── intrinsics_levelzero.impala
│   │   ├── intrinsics_math.impala
│   │   ├── intrinsics_nvvm.impala
│   │   ├── intrinsics_opencl.impala
│   │   ├── intrinsics_rv.impala
│   │   ├── intrinsics_spirv.impala
│   │   ├── intrinsics_thorin.impala
│   │   ├── intrinsics_wmma.impala
│   │   └── runtime.impala
│   └── impala
│       ├── intrinsics.impala
│       ├── intrinsics_amdgpu.impala
│       ├── intrinsics_cpu.impala
│       ├── intrinsics_cuda.impala
│       ├── intrinsics_hls.impala
│       ├── intrinsics_nvvm.impala
│       ├── intrinsics_opencl.impala
│       ├── intrinsics_rv.impala
│       ├── intrinsics_thorin.impala
│       └── runtime.impala
├── post-patcher.py
└── src
    ├── CMakeLists.txt
    ├── anydsl_jit.h
    ├── anydsl_runtime.cpp
    ├── anydsl_runtime.h
    ├── anydsl_runtime.hpp
    ├── anydsl_runtime_config.h.in
    ├── cpu_platform.cpp
    ├── cpu_platform.h
    ├── cuda_platform.cpp
    ├── cuda_platform.h
    ├── dummy_platform.h
    ├── extract_runtime_srcs.py
    ├── hsa_platform.cpp
    ├── hsa_platform.h
    ├── jit.cpp
    ├── levelzero_platform.cpp
    ├── levelzero_platform.h
    ├── log.h
    ├── opencl_platform.cpp
    ├── opencl_platform.h
    ├── pal
    │   ├── pal_device.cpp
    │   ├── pal_device.h
    │   ├── pal_fix_calling_convention_pass.cpp
    │   ├── pal_fix_calling_convention_pass.h
    │   ├── pal_insert_halt_pass.cpp
    │   ├── pal_insert_halt_pass.h
    │   ├── pal_lower_builtins_pass.cpp
    │   ├── pal_lower_builtins_pass.h
    │   ├── pal_lower_kernel_arguments_pass.cpp
    │   ├── pal_lower_kernel_arguments_pass.h
    │   ├── pal_utils.cpp
    │   └── pal_utils.h
    ├── pal_platform.cpp
    ├── pal_platform.h
    ├── platform.h
    ├── runtime.cpp
    └── runtime.h
/.gitignore:
--------------------------------------------------------------------------------
1 | build*
2 | 
3 | .vscode
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR)
2 | 
3 | project(AnyDSL-runtime)
4 | 
5 | set(PACKAGE_VERSION "0.3.9")
6 | #set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "limited config" FORCE)
7 | set(CMAKE_CXX_STANDARD 17)
8 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
9 | 
10 | option(BUILD_SHARED_LIBS "Build shared libraries" ON)
11 | option(RUNTIME_JIT "enable jit support in the runtime" OFF)
12 | option(DEBUG_OUTPUT "enable debug output" OFF)
13 | 
14 | if(CMAKE_BUILD_TYPE STREQUAL "")
15 | set(CMAKE_BUILD_TYPE Debug CACHE STRING "Debug or Release" FORCE)
16 | endif()
17 | 
18 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules)
19 | 
20 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
21 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
22 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
23 | 
24 | find_path(Artic_DIR NAMES artic-config.cmake PATHS ${Artic_DIR} $ENV{Artic_DIR} ${CMAKE_BINARY_DIR}/share/anydsl/cmake)
25 | find_path(Impala_DIR NAMES impala-config.cmake PATHS ${Impala_DIR} $ENV{Impala_DIR} ${CMAKE_BINARY_DIR}/share/anydsl/cmake)
26 | 
27 | set(AnyDSL_runtime_ENABLE_DEBUG_OUTPUT ${DEBUG_OUTPUT})
28 | set(AnyDSL_runtime_TARGET_NAME runtime CACHE STRING "Name of the cmake target for the AnyDSL runtime")
29 | mark_as_advanced(AnyDSL_runtime_TARGET_NAME)
30 | 
31 | add_subdirectory(src)
32 | 
33 | 
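# Example configure invocation (a sketch only, not part of the build logic;
# the path below is a placeholder -- RUNTIME_JIT, Artic_DIR, and
# CMAKE_BUILD_TYPE are the options and cache variables handled above):
#   cmake -DRUNTIME_JIT=ON \
#         -DArtic_DIR=/path/to/artic/build/share/anydsl/cmake \
#         -DCMAKE_BUILD_TYPE=Release ..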
message(STATUS "Using Debug flags: ${CMAKE_CXX_FLAGS_DEBUG}") 34 | message(STATUS "Using Release flags: ${CMAKE_CXX_FLAGS_RELEASE}") 35 | if(DEFINED CMAKE_BUILD_TYPE) 36 | message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") 37 | endif() 38 | 39 | export(TARGETS ${RUNTIME_LIBRARIES} FILE ${CMAKE_BINARY_DIR}/share/anydsl/cmake/anydsl_runtime-exports.cmake) 40 | configure_file(cmake/anydsl_runtime-config.cmake.in ${CMAKE_BINARY_DIR}/share/anydsl/cmake/anydsl_runtime-config.cmake @ONLY) 41 | configure_file(cmake/build_xilinx_hls.cmake.in ${CMAKE_BINARY_DIR}/share/anydsl/cmake/build_xilinx_hls.cmake @ONLY) 42 | -------------------------------------------------------------------------------- /CONTRIBUTORS.txt: -------------------------------------------------------------------------------- 1 | # This is the official list of contributing authors in the AnyDSL runtime project for copyright purposes. 2 | 3 | # Name (GitHub Handle), Affiliation(s) 4 | Puya Amiri (pooyaww), DFKI 5 | Hugo Devillers (Hugobros3), Saarland University 6 | Pascal Grittmann (pgrit), Saarland University 7 | Ralf Jung (RalfJung), Saarland University 8 | Michael Kenzel (michael-kenzel), DFKI 9 | Marcel Köster (m4rs-mt), Saarland University 10 | Matthis Kruse (DasNaCl), Saarland University 11 | Matthias Kurtenacker (m-kurtenacker), DFKI 12 | Roland Leißa (leissa), Saarland University 13 | Stefan Lemme (stlemme), Saarland University / DFKI 14 | Richard Membarth (richardmembarth), Saarland University / DFKI / Technische Hochschule Ingolstadt 15 | Simon Moll (simoll), Saarland University 16 | Arsène Pérard-Gayot (madmann91), Saarland University 17 | Akif Özkan (akifoezkan), Friedrich-Alexander-University Erlangen-Nuremberg 18 | Alexander Rath (iRath96), DFKI 19 | Till Speicher (tillspeicher), Saarland University 20 | Fabian Wildgrube (FabianWildgrube), Advanced Micro Devices Inc. 21 | Ömercan Yazici (PearCoding), Saarland University 22 | -------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 
31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AnyDSL Runtime Library
2 | The runtime for the AnyDSL framework and its two frontends [artic](https://github.com/AnyDSL/artic) and [impala](https://github.com/AnyDSL/impala).
3 | 
4 | The runtime provides the following components:
5 | - CMake logic to build programs using artic or impala
6 | - include files for basic runtime abstractions and math functions
7 | - runtime library implementation to schedule and execute AnyDSL programs on different platforms
8 |   + Host (CPU): standard platform for code
9 |   + TBB / C++11 threads: code emitted by `parallel`
10 |   + LLVM w/ RV support: code emitted by `vectorize`
11 |   + CUDA: code emitted by `cuda` or `nvvm`
12 |   + OpenCL: code emitted by `opencl`
13 |   + HSA: code emitted by `amdgpu`
14 | 
15 | CMake automatically searches for available components on the current system.
16 | To prevent CMake from building a particular runtime component, disable it using CMake's `CMAKE_DISABLE_FIND_PACKAGE_` variable.
17 | For example, pass `-DCMAKE_DISABLE_FIND_PACKAGE_OpenCL=TRUE` to cmake to disable the OpenCL runtime component.
18 | 
19 | Although not required, feel free to specify `Artic_DIR` or `Impala_DIR` so that the correct paths are found automatically later when building AnyDSL programs with the `anydsl_runtime_wrap()` function.
20 | 
21 | To enable JIT support, please pass `-DRUNTIME_JIT=ON` to cmake.
22 | This requires at least one of artic or impala as a dependency and will thereby also locate LLVM as well as [thorin](https://github.com/AnyDSL/thorin).
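For illustration, a downstream project's `CMakeLists.txt` might look roughly like the following sketch. The package name matches the generated `anydsl_runtime-config.cmake`; the exact `anydsl_runtime_wrap()` arguments and the linked target name are assumptions that depend on your setup:

```cmake
# Hypothetical consumer project; paths, wrap arguments, and target name are assumptions.
find_package(AnyDSL_runtime REQUIRED
    PATHS /path/to/runtime/build/share/anydsl/cmake)

# Assumed anydsl_runtime_wrap() usage: compile artic sources into linkable objects.
anydsl_runtime_wrap(HELLO_OBJS FILES hello.art)

add_executable(hello ${HELLO_OBJS})
target_link_libraries(hello runtime) # "runtime" is the default AnyDSL_runtime_TARGET_NAME
```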
23 | 
--------------------------------------------------------------------------------
/cmake/check_nvvmir.cmake:
--------------------------------------------------------------------------------
1 | if(EXISTS ${_basename}.nvvm)
2 | execute_process(COMMAND ${LLVM_AS_BIN} ${_basename}.nvvm)
3 | endif()
4 | 
--------------------------------------------------------------------------------
/cmake/modules/FindLevelZero.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2019 Intel Corporation
2 | # SPDX-License-Identifier: MIT
3 | include(FindPackageHandleStandardArgs)
4 | 
5 | find_path(LevelZero_INCLUDE_DIR
6 | NAMES level_zero/ze_api.h
7 | )
8 | 
9 | find_library(LevelZero_LIBRARY
10 | NAMES ze_loader ze_loader32 ze_loader64
11 | )
12 | 
13 | find_package_handle_standard_args(LevelZero
14 | REQUIRED_VARS
15 | LevelZero_INCLUDE_DIR
16 | LevelZero_LIBRARY
17 | HANDLE_COMPONENTS
18 | )
19 | mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIR)
20 | 
21 | if(LevelZero_FOUND)
22 | list(APPEND LevelZero_LIBRARIES ${LevelZero_LIBRARY} ${CMAKE_DL_LIBS})
23 | list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
24 | if(OpenCL_FOUND)
25 | list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
26 | endif()
27 | MESSAGE(STATUS "Found Level Zero in " ${LevelZero_INCLUDE_DIR})
28 | endif()
29 | 
30 | if(LevelZero_FOUND AND NOT TARGET LevelZero::LevelZero)
31 | add_library(LevelZero::LevelZero INTERFACE IMPORTED)
32 | set_target_properties(LevelZero::LevelZero
33 | PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
34 | )
35 | set_target_properties(LevelZero::LevelZero
36 | PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
37 | )
38 | endif()
39 | 
40 | # MESSAGE(STATUS "LevelZero_LIBRARIES: " ${LevelZero_LIBRARIES})
41 | # MESSAGE(STATUS "LevelZero_INCLUDE_DIRS: " ${LevelZero_INCLUDE_DIRS})
42 | 
--------------------------------------------------------------------------------
/cmake/modules/FindXHLS.cmake:
--------------------------------------------------------------------------------
1 | # Xilinx Runtime library (XRT) and HLS tools for scripting mode
2 | 
3 | find_path(XILINX_SEARCH_PATH v++ PATHS ENV XILINX_OPENCL ENV XILINX_VITIS PATH_SUFFIXES bin)
4 | get_filename_component(VITIS_ROOT_DIR ${XILINX_SEARCH_PATH} DIRECTORY)
5 | 
6 | find_program(Xilinx_VPP v++ PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
7 | find_program(Xilinx_PLATFORM_INFO platforminfo PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
8 | find_program(Xilinx_KERNEL_INFO kernelinfo PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
9 | find_program(Xilinx_EMU_CONFIG emconfigutil PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
10 | 
11 | 
12 | get_filename_component(VITIS_VERSION "${VITIS_ROOT_DIR}" NAME)
13 | string(REGEX REPLACE "([0-9]+)\\.[0-9]+" "\\1" VITIS_MAJOR_VERSION "${VITIS_VERSION}")
14 | string(REGEX REPLACE "[0-9]+\\.([0-9]+)" "\\1" VITIS_MINOR_VERSION "${VITIS_VERSION}")
15 | set(Vitis_VERSION ${VITIS_VERSION})
16 | set(Vitis_MAJOR_VERSION ${VITIS_MAJOR_VERSION})
17 | set(Vitis_MINOR_VERSION ${VITIS_MINOR_VERSION})
18 | 
19 | find_program(Xilinx_HLS NAMES vitis_hls PATHS ${VITIS_ROOT_DIR}/bin ${VITIS_ROOT_DIR}/../../Vitis_HLS/${Vitis_VERSION}/bin NO_DEFAULT_PATH)
20 | 
21 | find_path(Xilinx_HLS_INCLUDE_DIR hls_stream.h PATHS ${VITIS_ROOT_DIR}/include NO_DEFAULT_PATH)
22 | 
23 | find_path(XRT_SEARCH_PATH libxilinxopencl.so PATHS /opt/xilinx/xrt ENV XILINX_XRT PATH_SUFFIXES lib)
24 | get_filename_component(XILINX_RUNTIME_DIR ${XRT_SEARCH_PATH} DIRECTORY)
25 
| file(GLOB Xilinx_LIBRARIES ${XILINX_RUNTIME_DIR}/lib/libxilinxopencl.so) 26 | 27 | find_path(Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR cl_ext.h PATHS ${XILINX_RUNTIME_DIR}/include PATH_SUFFIXES CL NO_DEFAULT_PATH) 28 | get_filename_component(Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR ${Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR} DIRECTORY) 29 | set(Xilinx_INCLUDE_DIRS ${Xilinx_HLS_INCLUDE_DIR} ${Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR}) 30 | 31 | mark_as_advanced( 32 | XILINX_RUNTIME_DIR 33 | XRT_SEARCH_PATH 34 | XILINX_SEARCH_PATH 35 | Xilinx_HLS 36 | Xilinx_VPP 37 | Xilinx_HLS_INCLUDE_DIR 38 | Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR 39 | Xilinx_PLATFORM_INFO 40 | Xilinx_KERNEL_INFO 41 | Xilinx_EMU_CONFIG 42 | Xilinx_LIBRARIES 43 | Xilinx_INCLUDE_DIRS) 44 | 45 | include(FindPackageHandleStandardArgs) 46 | find_package_handle_standard_args(XHLS DEFAULT_MSG 47 | Xilinx_HLS 48 | Xilinx_VPP 49 | Xilinx_LIBRARIES 50 | Xilinx_INCLUDE_DIRS 51 | Xilinx_PLATFORM_INFO 52 | Xilinx_KERNEL_INFO 53 | Xilinx_EMU_CONFIG 54 | ) 55 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics.impala: -------------------------------------------------------------------------------- 1 | struct WorkItem { 2 | tidx : fn() -> i32, 3 | tidy : fn() -> i32, 4 | tidz : fn() -> i32, 5 | bidx : fn() -> i32, 6 | bidy : fn() -> i32, 7 | bidz : fn() -> i32, 8 | gidx : fn() -> i32, 9 | gidy : fn() -> i32, 10 | gidz : fn() -> i32, 11 | bdimx : fn() -> i32, 12 | bdimy : fn() -> i32, 13 | bdimz : fn() -> i32, 14 | gdimx : fn() -> i32, 15 | gdimy : fn() -> i32, 16 | gdimz : fn() -> i32, 17 | nblkx : fn() -> i32, 18 | nblky : fn() -> i32, 19 | nblkz : fn() -> i32 20 | } 21 | 22 | struct Accelerator { 23 | exec : fn(fn(WorkItem) -> ()) -> fn((i32, i32, i32), (i32, i32, i32)) -> (), // fn(grid, block)->() 24 | sync : fn() -> (), 25 | alloc : fn(i64) -> Buffer, 26 | alloc_unified : fn(i64) -> Buffer, 27 | barrier : fn() -> () 28 | } 29 | 30 | struct Intrinsics { 31 | expf : fn(f32) -> f32, 32 | exp2f : fn(f32) -> f32, 33 | logf : fn(f32) -> f32, 34 | log2f : fn(f32) -> f32, 35 | powf : fn(f32, f32) -> f32, 36 | rsqrtf : fn(f32) -> f32, 37 | sqrtf : fn(f32) -> f32, 38 | fabsf : fn(f32) -> f32, 39 | sinf : fn(f32) -> f32, 40 | cosf : fn(f32) -> f32, 41 | tanf : fn(f32) -> f32, 42 | asinf : fn(f32) -> f32, 43 | acosf : fn(f32) -> f32, 44 | atanf : fn(f32) -> f32, 45 | erff : fn(f32) -> f32, 46 | atan2f : fn(f32, f32) -> f32, 47 | copysignf : fn(f32, f32) -> f32, 48 | fmaf : fn(f32, f32, f32) -> f32, 49 | fmaxf : fn(f32, f32) -> f32, 50 | fminf : fn(f32, f32) -> f32, 51 | fmodf : fn(f32, f32) -> f32, 52 | floorf : fn(f32) -> f32, 53 | isinff : fn(f32) -> i32, 54 | isnanf : fn(f32) -> i32, 55 | isfinitef : fn(f32) -> i32, 56 | exp : fn(f64) -> f64, 57 | exp2 : fn(f64) -> f64, 58 | log : fn(f64) -> f64, 59 | log2 : fn(f64) -> f64, 60 | pow : fn(f64, f64) -> f64, 61 | rsqrt : fn(f64) -> f64, 62 | sqrt : fn(f64) -> f64, 63 | fabs : fn(f64) -> f64, 64 | sin : fn(f64) -> f64, 65 | cos : fn(f64) -> f64, 66 | tan : fn(f64) -> f64, 67 | asin : fn(f64) -> f64, 68 | acos : fn(f64) -> f64, 69 | atan : fn(f64) -> f64, 70 | erf : fn(f64) -> f64, 71 | atan2 : fn(f64, f64) -> f64, 72 | copysign : fn(f64, f64) -> f64, 73 | fma : fn(f64, f64, f64) -> f64, 74 | fmax : fn(f64, f64) -> f64, 75 | fmin : fn(f64, f64) -> f64, 76 | fmod : fn(f64, f64) -> f64, 77 | floor : fn(f64) -> f64, 78 | isinf : fn(f64) -> i32, 79 | isnan : fn(f64) -> i32, 80 | isfinite : fn(f64) -> i32, 81 | min : fn(i32, i32) -> i32, 82 | max : fn(i32, i32) 
-> i32, 83 | } 84 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_cpu.impala: -------------------------------------------------------------------------------- 1 | //#[import(cc = "C", name = "sinf")] fn cpu_sinf(f32) -> f32; 2 | //#[import(cc = "C", name = "cosf")] fn cpu_cosf(f32) -> f32; 3 | #[import(cc = "C", name = "tanf")] fn cpu_tanf(_: f32) -> f32; 4 | #[import(cc = "C", name = "asinf")] fn cpu_asinf(_: f32) -> f32; 5 | #[import(cc = "C", name = "acosf")] fn cpu_acosf(_: f32) -> f32; 6 | #[import(cc = "C", name = "atanf")] fn cpu_atanf(_: f32) -> f32; 7 | #[import(cc = "C", name = "erff")] fn cpu_erff(_: f32) -> f32; 8 | #[import(cc = "C", name = "fmodf")] fn cpu_fmodf(_: f32, _: f32) -> f32; 9 | #[import(cc = "C", name = "atan2f")] fn cpu_atan2f(_: f32, _: f32) -> f32; 10 | #[import(cc = "C", name = "anydsl_isinff")] fn cpu_isinff(_: f32) -> i32; 11 | #[import(cc = "C", name = "anydsl_isnanf")] fn cpu_isnanf(_: f32) -> i32; 12 | #[import(cc = "C", name = "anydsl_isfinitef")] fn cpu_isfinitef(_: f32) -> i32; 13 | //#[import(cc = "C", name = "sin")] fn cpu_sin(f64) -> f64; 14 | //#[import(cc = "C", name = "cos")] fn cpu_cos(f64) -> f64; 15 | #[import(cc = "C", name = "tan")] fn cpu_tan(_: f64) -> f64; 16 | #[import(cc = "C", name = "asin")] fn cpu_asin(_: f64) -> f64; 17 | #[import(cc = "C", name = "acos")] fn cpu_acos(_: f64) -> f64; 18 | #[import(cc = "C", name = "atan")] fn cpu_atan(_: f64) -> f64; 19 | #[import(cc = "C", name = "erf")] fn cpu_erf(_: f64) -> f64; 20 | #[import(cc = "C", name = "fmod")] fn cpu_fmod(_: f64, _: f64) -> f64; 21 | #[import(cc = "C", name = "atan2")] fn cpu_atan2(_: f64, _: f64) -> f64; 22 | #[import(cc = "C", name = "anydsl_isinf")] fn cpu_isinf(_: f64) -> i32; 23 | #[import(cc = "C", name = "anydsl_isnan")] fn cpu_isnan(_: f64) -> i32; 24 | #[import(cc = "C", name = "anydsl_isfinite")] fn cpu_isfinite(_: f64) -> i32; 25 | 26 | #[import(cc = "device", name = "llvm.exp.f32")] fn cpu_expf(_: f32) -> f32; 27 | #[import(cc = "device", name = "llvm.exp2.f32")] fn cpu_exp2f(_: f32) -> f32; 28 | #[import(cc = "device", name = "llvm.log.f32")] fn cpu_logf(_: f32) -> f32; 29 | #[import(cc = "device", name = "llvm.log2.f32")] fn cpu_log2f(_: f32) -> f32; 30 | #[import(cc = "device", name = "llvm.pow.f32")] fn cpu_powf(_: f32, _: f32) -> f32; 31 | #[import(cc = "device", name = "llvm.sqrt.f32")] fn cpu_sqrtf(_: f32) -> f32; 32 | #[import(cc = "device", name = "llvm.fabs.f32")] fn cpu_fabsf(_: f32) -> f32; 33 | #[import(cc = "device", name = "llvm.sin.f32")] fn cpu_sinf(_: f32) -> f32; 34 | #[import(cc = "device", name = "llvm.cos.f32")] fn cpu_cosf(_: f32) -> f32; 35 | #[import(cc = "device", name = "llvm.floor.f32")] fn cpu_floorf(_: f32) -> f32; 36 | #[import(cc = "device", name = "llvm.fma.f32")] fn cpu_fmaf(_: f32, _: f32, _: f32) -> f32; 37 | #[import(cc = "device", name = "llvm.fmuladd.f32")] fn cpu_madf(_: f32, _: f32, _: f32) -> f32; 38 | #[import(cc = "device", name = "llvm.copysign.f32")] fn cpu_copysignf(_: f32, _: f32) -> f32; 39 | #[import(cc = "device", name = "llvm.minnum.f32")] fn cpu_fminf(_: f32, _: f32) -> f32; 40 | #[import(cc = "device", name = "llvm.maxnum.f32")] fn cpu_fmaxf(_: f32, _: f32) -> f32; 41 | #[import(cc = "device", name = "llvm.exp.f64")] fn cpu_exp(_: f64) -> f64; 42 | #[import(cc = "device", name = "llvm.exp2.f64")] fn cpu_exp2(_: f64) -> f64; 43 | #[import(cc = "device", name = "llvm.log.f64")] fn cpu_log(_: f64) -> f64; 44 | #[import(cc = "device", name = 
"llvm.log2.f64")] fn cpu_log2(_: f64) -> f64; 45 | #[import(cc = "device", name = "llvm.pow.f64")] fn cpu_pow(_: f64, _: f64) -> f64; 46 | #[import(cc = "device", name = "llvm.sqrt.f64")] fn cpu_sqrt(_: f64) -> f64; 47 | #[import(cc = "device", name = "llvm.fabs.f64")] fn cpu_fabs(_: f64) -> f64; 48 | #[import(cc = "device", name = "llvm.sin.f64")] fn cpu_sin(_: f64) -> f64; 49 | #[import(cc = "device", name = "llvm.cos.f64")] fn cpu_cos(_: f64) -> f64; 50 | #[import(cc = "device", name = "llvm.floor.f64")] fn cpu_floor(_: f64) -> f64; 51 | #[import(cc = "device", name = "llvm.fma.f64")] fn cpu_fma(_: f64, _: f64, _: f64) -> f64; 52 | #[import(cc = "device", name = "llvm.fmuladd.f64")] fn cpu_mad(_: f64, _: f64, _: f64) -> f64; 53 | #[import(cc = "device", name = "llvm.copysign.f64")] fn cpu_copysign(_: f64, _: f64) -> f64; 54 | #[import(cc = "device", name = "llvm.minnum.f64")] fn cpu_fmin(_: f64, _: f64) -> f64; 55 | #[import(cc = "device", name = "llvm.maxnum.f64")] fn cpu_fmax(_: f64, _: f64) -> f64; 56 | #[import(cc = "device", name = "llvm.ctpop.i32")] fn cpu_popcount32(_: i32) -> i32; 57 | #[import(cc = "device", name = "llvm.ctpop.i64")] fn cpu_popcount64(_: i64) -> i64; 58 | #[import(cc = "device", name = "llvm.ctlz.i32")] fn cpu_clz32(_: i32, _: bool) -> i32; 59 | #[import(cc = "device", name = "llvm.ctlz.i64")] fn cpu_clz64(_: i64, _: bool) -> i64; 60 | #[import(cc = "device", name = "llvm.cttz.i32")] fn cpu_ctz32(_: i32, _: bool) -> i32; 61 | #[import(cc = "device", name = "llvm.cttz.i64")] fn cpu_ctz64(_: i64, _: bool) -> i64; 62 | #[import(cc = "device", name = "llvm.x86.bmi.pext.32")] fn cpu_pext32(_: i32, _: i32) -> i32; 63 | #[import(cc = "device", name = "llvm.x86.bmi.pext.64")] fn cpu_pext64(_: i64, _: i64) -> i64; 64 | #[import(cc = "device", name = "llvm.x86.bmi.pdep.32")] fn cpu_pdep32(_: i32, _: i32) -> i32; 65 | #[import(cc = "device", name = "llvm.x86.bmi.pdep.64")] fn cpu_pdep64(_: i64, _: i64) -> i64; 66 | #[import(cc = "device", name = "llvm.prefetch.p0")] fn cpu_prefetch(&u8, i32, i32, i32) -> (); 67 | 68 | // 69 | // atomics 70 | // 0 1 2 3 4 5 6 7 8 9 10 11 12 71 | // operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub 72 | // 0 1 2 4 5 6 7 73 | // ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent 74 | // syncscope: singlethread "" (system) 75 | // 76 | 77 | fn @cpu_atomic_xchg(a: &mut i32, b: i32) -> i32 = atomic[i32](0, a, b, 7, ""); 78 | fn @cpu_atomic_add(a: &mut i32, b: i32) -> i32 = atomic[i32](1, a, b, 7, ""); 79 | fn @cpu_atomic_sub(a: &mut i32, b: i32) -> i32 = atomic[i32](2, a, b, 7, ""); 80 | fn @cpu_atomic_max(a: &mut i32, b: i32) -> i32 = atomic[i32](7, a, b, 7, ""); 81 | fn @cpu_atomic_min(a: &mut i32, b: i32) -> i32 = atomic[i32](8, a, b, 7, ""); 82 | 83 | static cpu_intrinsics = Intrinsics { 84 | expf = cpu_expf, 85 | exp2f = cpu_exp2f, 86 | logf = cpu_logf, 87 | log2f = cpu_log2f, 88 | powf = cpu_powf, 89 | rsqrtf = @|a| { 1:f32 / cpu_sqrtf(a) }, 90 | sqrtf = cpu_sqrtf, 91 | fabsf = cpu_fabsf, 92 | sinf = cpu_sinf, 93 | cosf = cpu_cosf, 94 | tanf = cpu_tanf, 95 | asinf = cpu_asinf, 96 | acosf = cpu_acosf, 97 | atanf = cpu_atanf, 98 | erff = cpu_erff, 99 | atan2f = cpu_atan2f, 100 | copysignf = cpu_copysignf, 101 | fmaf = cpu_fmaf, 102 | fmaxf = cpu_fmaxf, 103 | fminf = cpu_fminf, 104 | fmodf = cpu_fmodf, 105 | floorf = cpu_floorf, 106 | isinff = cpu_isinff, 107 | isnanf = cpu_isnanf, 108 | isfinitef = cpu_isfinitef, 109 | exp = cpu_exp, 110 | exp2 = cpu_exp2, 111 | log = cpu_log, 112 
| log2 = cpu_log2, 113 | pow = cpu_pow, 114 | rsqrt = @|a| { 1.0 / cpu_sqrt(a) }, 115 | sqrt = cpu_sqrt, 116 | fabs = cpu_fabs, 117 | sin = cpu_sin, 118 | cos = cpu_cos, 119 | tan = cpu_tan, 120 | asin = cpu_asin, 121 | acos = cpu_acos, 122 | atan = cpu_atan, 123 | erf = cpu_erf, 124 | atan2 = cpu_atan2, 125 | copysign = cpu_copysign, 126 | fma = cpu_fma, 127 | fmax = cpu_fmax, 128 | fmin = cpu_fmin, 129 | fmod = cpu_fmod, 130 | floor = cpu_floor, 131 | isinf = cpu_isinf, 132 | isnan = cpu_isnan, 133 | isfinite = cpu_isfinite, 134 | min = @|a, b| { if a < b { a } else { b } }, 135 | max = @|a, b| { if a > b { a } else { b } }, 136 | }; 137 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_hls.impala: -------------------------------------------------------------------------------- 1 | // no declarations are emitted for "device" functions 2 | #[import(cc = "C", name = "exp")] fn hls_expf(f32) -> f32; 3 | #[import(cc = "C", name = "exp2")] fn hls_exp2f(f32) -> f32; 4 | #[import(cc = "C", name = "log")] fn hls_logf(f32) -> f32; 5 | #[import(cc = "C", name = "log2")] fn hls_log2f(f32) -> f32; 6 | #[import(cc = "C", name = "pow")] fn hls_powf(f32, f32) -> f32; 7 | #[import(cc = "C", name = "rsqrt")] fn hls_rsqrtf(f32) -> f32; 8 | #[import(cc = "C", name = "sqrt")] fn hls_sqrtf(f32) -> f32; 9 | #[import(cc = "C", name = "fabs")] fn hls_fabsf(f32) -> f32; 10 | #[import(cc = "C", name = "sin")] fn hls_sinf(f32) -> f32; 11 | #[import(cc = "C", name = "cos")] fn hls_cosf(f32) -> f32; 12 | #[import(cc = "C", name = "tan")] fn hls_tanf(f32) -> f32; 13 | #[import(cc = "C", name = "asin")] fn hls_asinf(f32) -> f32; 14 | #[import(cc = "C", name = "acos")] fn hls_acosf(f32) -> f32; 15 | #[import(cc = "C", name = "atan")] fn hls_atanf(f32) -> f32; 16 | #[import(cc = "C", name = "erf")] fn hls_erff(f32) -> f32; 17 | #[import(cc = "C", name = "atan2")] fn hls_atan2f(f32, f32) -> f32; 18 | #[import(cc = "C", name = "fmod")] fn hls_fmodf(f32, f32) -> f32; 19 | #[import(cc = "C", name = "floor")] fn hls_floorf(f32) -> f32; 20 | #[import(cc = "C", name = "isinf")] fn hls_isinff(f32) -> i32; 21 | #[import(cc = "C", name = "isnan")] fn hls_isnanf(f32) -> i32; 22 | #[import(cc = "C", name = "isfinite")] fn hls_isfinitef(f32) -> i32; 23 | #[import(cc = "C", name = "fma")] fn hls_fmaf(f32, f32, f32) -> f32; 24 | #[import(cc = "C", name = "mad")] fn hls_madf(f32, f32, f32) -> f32; 25 | #[import(cc = "C", name = "copysign")] fn hls_copysignf(f32, f32) -> f32; 26 | #[import(cc = "C", name = "exp")] fn hls_exp(f64) -> f64; 27 | #[import(cc = "C", name = "exp2")] fn hls_exp2(f64) -> f64; 28 | #[import(cc = "C", name = "log")] fn hls_log(f64) -> f64; 29 | #[import(cc = "C", name = "log2")] fn hls_log2(f64) -> f64; 30 | #[import(cc = "C", name = "pow")] fn hls_pow(f64, f64) -> f64; 31 | #[import(cc = "C", name = "rsqrt")] fn hls_rsqrt(f64) -> f64; 32 | #[import(cc = "C", name = "sqrt")] fn hls_sqrt(f64) -> f64; 33 | #[import(cc = "C", name = "fabs")] fn hls_fabs(f64) -> f64; 34 | #[import(cc = "C", name = "sin")] fn hls_sin(f64) -> f64; 35 | #[import(cc = "C", name = "cos")] fn hls_cos(f64) -> f64; 36 | #[import(cc = "C", name = "tan")] fn hls_tan(f64) -> f64; 37 | #[import(cc = "C", name = "asin")] fn hls_asin(f64) -> f64; 38 | #[import(cc = "C", name = "acos")] fn hls_acos(f64) -> f64; 39 | #[import(cc = "C", name = "atan")] fn hls_atan(f64) -> f64; 40 | #[import(cc = "C", name = "erf")] fn hls_erf(f64) -> f64; 41 | #[import(cc = "C", name = "atan2")] fn 
hls_atan2(f64, f64) -> f64; 42 | #[import(cc = "C", name = "fmod")] fn hls_fmod(f64, f64) -> f64; 43 | #[import(cc = "C", name = "floor")] fn hls_floor(f64) -> f64; 44 | #[import(cc = "C", name = "isinf")] fn hls_isinf(f64) -> i32; 45 | #[import(cc = "C", name = "isnan")] fn hls_isnan(f64) -> i32; 46 | #[import(cc = "C", name = "isfinite")] fn hls_isfinite(f64) -> i32; 47 | #[import(cc = "C", name = "fma")] fn hls_fma(f64, f64, f64) -> f64; 48 | #[import(cc = "C", name = "mad")] fn hls_mad(f64, f64, f64) -> f64; 49 | #[import(cc = "C", name = "copysign")] fn hls_copysign(f64, f64) -> f64; 50 | #[import(cc = "C", name = "fmin")] fn hls_fminf(f32, f32) -> f32; 51 | #[import(cc = "C", name = "fmax")] fn hls_fmaxf(f32, f32) -> f32; 52 | #[import(cc = "C", name = "fmin")] fn hls_fmin(f64, f64) -> f64; 53 | #[import(cc = "C", name = "fmax")] fn hls_fmax(f64, f64) -> f64; 54 | #[import(cc = "C", name = "min")] fn hls_min(i32, i32) -> i32; 55 | #[import(cc = "C", name = "max")] fn hls_max(i32, i32) -> i32; 56 | 57 | #[import(cc = "device")] fn print_pragma(&[u8]) -> (); 58 | 59 | // channel scalar types 60 | struct channel[T] { data : T } 61 | // channel array types 62 | struct channel1[T] { data : [T * 1 ] } 63 | struct channel2[T] { data : [T * 2 ] } 64 | struct channel4[T] { data : [T * 4 ] } 65 | struct channel8[T] { data : [T * 8 ] } 66 | struct channel16[T] { data : [T * 16 ] } 67 | struct channel32[T] { data : [T * 32 ] } 68 | struct channel64[T] { data : [T * 64 ] } 69 | struct channel128[T] { data : [T * 128] } 70 | 71 | // read and write on scalar channels 72 | #[import(cc = "device", name = "read_channel")] fn read_channel[T] (&mut channel[T]) -> T; 73 | #[import(cc = "device", name = "write_channel")] fn write_channel[T] (&mut channel[T], T ) -> (); 74 | 75 | // read and write on array channels 76 | #[import(cc = "device", name = "read_channel")] fn read_channel1[T] ( &mut channel1[T] ) -> [T * 1 ]; 77 | #[import(cc = "device", name = "read_channel")] fn read_channel2[T] ( &mut channel2[T] ) -> [T * 2 ]; 78 | #[import(cc = "device", name = "read_channel")] fn read_channel4[T] ( &mut channel4[T] ) -> [T * 4 ]; 79 | #[import(cc = "device", name = "read_channel")] fn read_channel8[T] ( &mut channel8[T] ) -> [T * 8 ]; 80 | #[import(cc = "device", name = "read_channel")] fn read_channel16[T]( &mut channel16[T]) -> [T * 16]; 81 | #[import(cc = "device", name = "read_channel")] fn read_channel32[T]( &mut channel32[T]) -> [T * 32]; 82 | 83 | #[import(cc = "device", name = "write_channel")] fn write_channel1[T] ( &mut channel1[T], [T * 1 ]) -> (); 84 | #[import(cc = "device", name = "write_channel")] fn write_channel2[T] ( &mut channel2[T], [T * 2 ]) -> (); 85 | #[import(cc = "device", name = "write_channel")] fn write_channel4[T] ( &mut channel4[T], [T * 4 ]) -> (); 86 | #[import(cc = "device", name = "write_channel")] fn write_channel8[T] ( &mut channel8[T], [T * 8 ]) -> (); 87 | #[import(cc = "device", name = "write_channel")] fn write_channel16[T]( &mut channel16[T], [T * 16]) -> (); 88 | #[import(cc = "device", name = "write_channel")] fn write_channel32[T]( &mut channel32[T], [T * 32]) -> (); 89 | #[import(cc = "device", name = " ")] fn bitcast_channel[T]( &mut channel1[T]) -> [T * 2]; 90 | 91 | fn @hls_accelerator(dev: i32) = Accelerator { 92 | exec = @|body| |_grid, _block| { 93 | let work_item = WorkItem { 94 | tidx = @|| 0, tidy = @|| 0, tidz = @|| 0, 95 | bidx = @|| 0, bidy = @|| 0, bidz = @|| 0, 96 | gidx = @|| 0, gidy = @|| 0, gidz = @|| 0, 97 | bdimx = @|| 1, bdimy = @|| 1, bdimz 
= @|| 1, 98 | gdimx = @|| 1, gdimy = @|| 1, gdimz = @|| 1, 99 | nblkx = @|| 1, nblky = @|| 1, nblkz = @|| 1 100 | }; 101 | hls(dev, || @body(work_item)); 102 | }, 103 | sync = @|| synchronize_hls(dev), 104 | alloc = @|size| alloc_hls(dev, size), 105 | alloc_unified = @|size| alloc_hls_unified(dev, size), 106 | barrier = @|| () 107 | }; 108 | 109 | static hls_intrinsics = Intrinsics { 110 | expf = hls_expf, 111 | exp2f = hls_exp2f, 112 | logf = hls_logf, 113 | log2f = hls_log2f, 114 | powf = hls_powf, 115 | rsqrtf = hls_rsqrtf, 116 | sqrtf = hls_sqrtf, 117 | fabsf = hls_fabsf, 118 | sinf = hls_sinf, 119 | cosf = hls_cosf, 120 | tanf = hls_tanf, 121 | asinf = hls_asinf, 122 | acosf = hls_acosf, 123 | atanf = hls_atanf, 124 | erff = hls_erff, 125 | atan2f = hls_atan2f, 126 | copysignf = hls_copysignf, 127 | fmaf = hls_fmaf, 128 | fmaxf = hls_fmaxf, 129 | fminf = hls_fminf, 130 | fmodf = hls_fmodf, 131 | floorf = hls_floorf, 132 | isinff = hls_isinff, 133 | isnanf = hls_isnanf, 134 | isfinitef = hls_isfinitef, 135 | exp = hls_exp, 136 | exp2 = hls_exp2, 137 | log = hls_log, 138 | log2 = hls_log2, 139 | pow = hls_pow, 140 | rsqrt = hls_rsqrt, 141 | sqrt = hls_sqrt, 142 | fabs = hls_fabs, 143 | sin = hls_sin, 144 | cos = hls_cos, 145 | tan = hls_tan, 146 | asin = hls_asin, 147 | acos = hls_acos, 148 | atan = hls_atan, 149 | erf = hls_erf, 150 | atan2 = hls_atan2, 151 | copysign = hls_copysign, 152 | fma = hls_fma, 153 | fmax = hls_fmax, 154 | fmin = hls_fmin, 155 | fmod = hls_fmod, 156 | floor = hls_floor, 157 | isinf = hls_isinf, 158 | isnan = hls_isnan, 159 | isfinite = hls_isfinite, 160 | min = hls_min, 161 | max = hls_max, 162 | }; 163 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_levelzero.impala: -------------------------------------------------------------------------------- 1 | // most device intrinsics are the same as OpenCL and don't need to be duplicated 2 | fn spv_levelzero_get_num_groups() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](24 /* BuiltInNumWorkgroups */); 3 | fn spv_levelzero_get_local_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](25 /* BuiltInWorkgroupSize */); 4 | fn spv_levelzero_get_group_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](26 /* BuiltInWorkgroupId */); 5 | fn spv_levelzero_get_local_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](27 /* BuiltInLocalInvocationId */); 6 | fn spv_levelzero_get_global_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](28 /* BuiltInGlobalInvocationId */); 7 | fn spv_levelzero_get_global_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](31 /* BuiltInGlobalSize */); 8 | 9 | fn @levelzero_accelerator(dev: i32) = Accelerator { 10 | exec = @|body| |grid, block| { 11 | let work_item = WorkItem { 12 | tidx = @|| spv_levelzero_get_local_id()(0) as i32, 13 | tidy = @|| spv_levelzero_get_local_id()(1) as i32, 14 | tidz = @|| spv_levelzero_get_local_id()(2) as i32, 15 | bidx = @|| spv_levelzero_get_group_id()(0) as i32, 16 | bidy = @|| spv_levelzero_get_group_id()(1) as i32, 17 | bidz = @|| spv_levelzero_get_group_id()(2) as i32, 18 | gidx = @|| spv_levelzero_get_global_id()(0) as i32, 19 | gidy = @|| spv_levelzero_get_global_id()(1) as i32, 20 | gidz = @|| spv_levelzero_get_global_id()(2) as i32, 21 | bdimx = @|| spv_levelzero_get_local_size()(0) as i32, 22 | bdimy = @|| spv_levelzero_get_local_size()(1) as i32, 23 | bdimz = @|| spv_levelzero_get_local_size()(2) as i32, 24 | gdimx = @|| 
spv_levelzero_get_global_size()(0) as i32, 25 | gdimy = @|| spv_levelzero_get_global_size()(1) as i32, 26 | gdimz = @|| spv_levelzero_get_global_size()(2) as i32, 27 | nblkx = @|| spv_levelzero_get_num_groups()(0) as i32, 28 | nblky = @|| spv_levelzero_get_num_groups()(1) as i32, 29 | nblkz = @|| spv_levelzero_get_num_groups()(2) as i32 30 | }; 31 | levelzero(dev, grid, block, || @body(work_item)) 32 | }, 33 | sync = @|| synchronize_levelzero(dev), 34 | alloc = @|size| alloc_levelzero(dev, size), 35 | alloc_unified = @|size| alloc_levelzero_unified(dev, size), 36 | barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE), 37 | }; 38 | 39 | static levelzero_intrinsics = Intrinsics { 40 | expf = opencl_expf, 41 | exp2f = opencl_exp2f, 42 | logf = opencl_logf, 43 | log2f = opencl_log2f, 44 | powf = opencl_powf, 45 | rsqrtf = opencl_rsqrtf, 46 | sqrtf = opencl_sqrtf, 47 | fabsf = opencl_fabsf, 48 | sinf = opencl_sinf, 49 | cosf = opencl_cosf, 50 | tanf = opencl_tanf, 51 | asinf = opencl_asinf, 52 | acosf = opencl_acosf, 53 | atanf = opencl_atanf, 54 | erff = opencl_erff, 55 | atan2f = opencl_atan2f, 56 | copysignf = opencl_copysignf, 57 | fmaf = opencl_fmaf, 58 | fmaxf = opencl_fmaxf, 59 | fminf = opencl_fminf, 60 | fmodf = opencl_fmodf, 61 | floorf = opencl_floorf, 62 | isinff = opencl_isinff, 63 | isnanf = opencl_isnanf, 64 | isfinitef = opencl_isfinitef, 65 | exp = opencl_exp, 66 | exp2 = opencl_exp2, 67 | log = opencl_log, 68 | log2 = opencl_log2, 69 | pow = opencl_pow, 70 | rsqrt = opencl_rsqrt, 71 | sqrt = opencl_sqrt, 72 | fabs = opencl_fabs, 73 | sin = opencl_sin, 74 | cos = opencl_cos, 75 | tan = opencl_tan, 76 | asin = opencl_asin, 77 | acos = opencl_acos, 78 | atan = opencl_atan, 79 | erf = opencl_erf, 80 | atan2 = opencl_atan2, 81 | copysign = opencl_copysign, 82 | fma = opencl_fma, 83 | fmax = opencl_fmax, 84 | fmin = opencl_fmin, 85 | fmod = opencl_fmod, 86 | floor = opencl_floor, 87 | isinf = opencl_isinf, 88 | isnan = opencl_isnan, 89 | isfinite = opencl_isfinite, 90 | min = opencl_min, 91 | max = opencl_max, 92 | }; 93 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_math.impala: -------------------------------------------------------------------------------- 1 | mod math_builtins { 2 | 3 | #[import(cc = "builtin")] fn fabs[T](T) -> T; 4 | #[import(cc = "builtin")] fn copysign[T](T, T) -> T; 5 | #[import(cc = "builtin")] fn signbit[T](T) -> bool; 6 | #[import(cc = "builtin")] fn round[T](T) -> T; 7 | #[import(cc = "builtin")] fn ceil[T](T) -> T; 8 | #[import(cc = "builtin")] fn floor[T](T) -> T; 9 | #[import(cc = "builtin")] fn fmin[T](T, T) -> T; 10 | #[import(cc = "builtin")] fn fmax[T](T, T) -> T; 11 | #[import(cc = "builtin")] fn cos[T](T) -> T; 12 | #[import(cc = "builtin")] fn sin[T](T) -> T; 13 | #[import(cc = "builtin")] fn tan[T](T) -> T; 14 | #[import(cc = "builtin")] fn acos[T](T) -> T; 15 | #[import(cc = "builtin")] fn asin[T](T) -> T; 16 | #[import(cc = "builtin")] fn atan[T](T) -> T; 17 | #[import(cc = "builtin")] fn atan2[T](T, T) -> T; 18 | #[import(cc = "builtin")] fn sqrt[T](T) -> T; 19 | #[import(cc = "builtin")] fn cbrt[T](T) -> T; 20 | #[import(cc = "builtin")] fn pow[T](T, T) -> T; 21 | #[import(cc = "builtin")] fn exp[T](T) -> T; 22 | #[import(cc = "builtin")] fn exp2[T](T) -> T; 23 | #[import(cc = "builtin")] fn log[T](T) -> T; 24 | #[import(cc = "builtin")] fn log2[T](T) -> T; 25 | #[import(cc = "builtin")] fn log10[T](T) -> T; 26 | #[import(cc = "builtin")] fn isnan[T](T) -> bool; 27 | #[import(cc 
= "builtin")] fn isfinite[T](T) -> bool; 28 | 29 | } 30 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_opencl.impala: -------------------------------------------------------------------------------- 1 | // no declarations are emitted for "device" functions 2 | #[import(cc = "device", name = "barrier")] fn opencl_barrier(u32) -> (); 3 | #[import(cc = "device", name = "exp")] fn opencl_expf(f32) -> f32; 4 | #[import(cc = "device", name = "exp2")] fn opencl_exp2f(f32) -> f32; 5 | #[import(cc = "device", name = "log")] fn opencl_logf(f32) -> f32; 6 | #[import(cc = "device", name = "log2")] fn opencl_log2f(f32) -> f32; 7 | #[import(cc = "device", name = "pow")] fn opencl_powf(f32, f32) -> f32; 8 | #[import(cc = "device", name = "rsqrt")] fn opencl_rsqrtf(f32) -> f32; 9 | #[import(cc = "device", name = "sqrt")] fn opencl_sqrtf(f32) -> f32; 10 | #[import(cc = "device", name = "fabs")] fn opencl_fabsf(f32) -> f32; 11 | #[import(cc = "device", name = "sin")] fn opencl_sinf(f32) -> f32; 12 | #[import(cc = "device", name = "cos")] fn opencl_cosf(f32) -> f32; 13 | #[import(cc = "device", name = "tan")] fn opencl_tanf(f32) -> f32; 14 | #[import(cc = "device", name = "asin")] fn opencl_asinf(f32) -> f32; 15 | #[import(cc = "device", name = "acos")] fn opencl_acosf(f32) -> f32; 16 | #[import(cc = "device", name = "atan")] fn opencl_atanf(f32) -> f32; 17 | #[import(cc = "device", name = "erf")] fn opencl_erff(f32) -> f32; 18 | #[import(cc = "device", name = "atan2")] fn opencl_atan2f(f32, f32) -> f32; 19 | #[import(cc = "device", name = "fmod")] fn opencl_fmodf(f32, f32) -> f32; 20 | #[import(cc = "device", name = "floor")] fn opencl_floorf(f32) -> f32; 21 | #[import(cc = "device", name = "isinf")] fn opencl_isinff(f32) -> i32; 22 | #[import(cc = "device", name = "isnan")] fn opencl_isnanf(f32) -> i32; 23 | #[import(cc = "device", name = "isfinite")] fn opencl_isfinitef(f32) -> i32; 24 | #[import(cc = "device", name = "fma")] fn opencl_fmaf(f32, f32, f32) -> f32; 25 | #[import(cc = "device", name = "mad")] fn opencl_madf(f32, f32, f32) -> f32; 26 | #[import(cc = "device", name = "copysign")] fn opencl_copysignf(f32, f32) -> f32; 27 | #[import(cc = "device", name = "exp")] fn opencl_exp(f64) -> f64; 28 | #[import(cc = "device", name = "exp2")] fn opencl_exp2(f64) -> f64; 29 | #[import(cc = "device", name = "log")] fn opencl_log(f64) -> f64; 30 | #[import(cc = "device", name = "log2")] fn opencl_log2(f64) -> f64; 31 | #[import(cc = "device", name = "pow")] fn opencl_pow(f64, f64) -> f64; 32 | #[import(cc = "device", name = "rsqrt")] fn opencl_rsqrt(f64) -> f64; 33 | #[import(cc = "device", name = "sqrt")] fn opencl_sqrt(f64) -> f64; 34 | #[import(cc = "device", name = "fabs")] fn opencl_fabs(f64) -> f64; 35 | #[import(cc = "device", name = "sin")] fn opencl_sin(f64) -> f64; 36 | #[import(cc = "device", name = "cos")] fn opencl_cos(f64) -> f64; 37 | #[import(cc = "device", name = "tan")] fn opencl_tan(f64) -> f64; 38 | #[import(cc = "device", name = "asin")] fn opencl_asin(f64) -> f64; 39 | #[import(cc = "device", name = "acos")] fn opencl_acos(f64) -> f64; 40 | #[import(cc = "device", name = "atan")] fn opencl_atan(f64) -> f64; 41 | #[import(cc = "device", name = "erf")] fn opencl_erf(f64) -> f64; 42 | #[import(cc = "device", name = "atan2")] fn opencl_atan2(f64, f64) -> f64; 43 | #[import(cc = "device", name = "fmod")] fn opencl_fmod(f64, f64) -> f64; 44 | #[import(cc = "device", name = "floor")] fn opencl_floor(f64) -> f64; 45 | #[import(cc = 
"device", name = "isinf")] fn opencl_isinf(f64) -> i32; 46 | #[import(cc = "device", name = "isnan")] fn opencl_isnan(f64) -> i32; 47 | #[import(cc = "device", name = "isfinite")] fn opencl_isfinite(f64) -> i32; 48 | #[import(cc = "device", name = "fma")] fn opencl_fma(f64, f64, f64) -> f64; 49 | #[import(cc = "device", name = "mad")] fn opencl_mad(f64, f64, f64) -> f64; 50 | #[import(cc = "device", name = "copysign")] fn opencl_copysign(f64, f64) -> f64; 51 | #[import(cc = "device", name = "fmin")] fn opencl_fminf(f32, f32) -> f32; 52 | #[import(cc = "device", name = "fmax")] fn opencl_fmaxf(f32, f32) -> f32; 53 | #[import(cc = "device", name = "fmin")] fn opencl_fmin(f64, f64) -> f64; 54 | #[import(cc = "device", name = "fmax")] fn opencl_fmax(f64, f64) -> f64; 55 | #[import(cc = "device", name = "min")] fn opencl_min(i32, i32) -> i32; 56 | #[import(cc = "device", name = "max")] fn opencl_max(i32, i32) -> i32; 57 | #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global(&mut addrspace(1)i32, i32) -> i32; 58 | #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global_f32(&mut addrspace(1)f32, f32) -> f32; 59 | #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_shared(&mut addrspace(3)i32, i32) -> i32; 60 | #[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_global(&mut addrspace(1)i32, i32) -> i32; 61 | #[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_shared(&mut addrspace(3)i32, i32) -> i32; 62 | #[import(cc = "device", name = "get_work_dim")] fn opencl_get_work_dim() -> u32; 63 | #[import(cc = "device", name = "get_global_size")] fn opencl_get_global_size(u32) -> u64; 64 | #[import(cc = "device", name = "get_global_id")] fn opencl_get_global_id(u32) -> u64; 65 | #[import(cc = "device", name = "get_local_size")] fn opencl_get_local_size(u32) -> u64; 66 | #[import(cc = "device", name = "get_local_id")] fn opencl_get_local_id(u32) -> u64; 67 | #[import(cc = "device", name = "get_num_groups")] fn opencl_get_num_groups(u32) -> u64; 68 | #[import(cc = "device", name = "get_group_id")] fn opencl_get_group_id(u32) -> u64; 69 | #[import(cc = "device", name = "get_global_offset")] fn opencl_get_global_offset(u32) -> u64; 70 | 71 | static CLK_LOCAL_MEM_FENCE = 1:u32; 72 | static CLK_GLOBAL_MEM_FENCE = 2:u32; 73 | 74 | fn @opencl_accelerator(dev: i32) = Accelerator { 75 | exec = @|body| |grid, block| { 76 | let work_item = WorkItem { 77 | tidx = @|| opencl_get_local_id(0) as i32, 78 | tidy = @|| opencl_get_local_id(1) as i32, 79 | tidz = @|| opencl_get_local_id(2) as i32, 80 | bidx = @|| opencl_get_group_id(0) as i32, 81 | bidy = @|| opencl_get_group_id(1) as i32, 82 | bidz = @|| opencl_get_group_id(2) as i32, 83 | gidx = @|| opencl_get_global_id(0) as i32, 84 | gidy = @|| opencl_get_global_id(1) as i32, 85 | gidz = @|| opencl_get_global_id(2) as i32, 86 | bdimx = @|| opencl_get_local_size(0) as i32, 87 | bdimy = @|| opencl_get_local_size(1) as i32, 88 | bdimz = @|| opencl_get_local_size(2) as i32, 89 | gdimx = @|| opencl_get_global_size(0) as i32, 90 | gdimy = @|| opencl_get_global_size(1) as i32, 91 | gdimz = @|| opencl_get_global_size(2) as i32, 92 | nblkx = @|| opencl_get_num_groups(0) as i32, 93 | nblky = @|| opencl_get_num_groups(1) as i32, 94 | nblkz = @|| opencl_get_num_groups(2) as i32 95 | }; 96 | opencl(dev, grid, block, || @body(work_item)) 97 | }, 98 | sync = @|| synchronize_opencl(dev), 99 | alloc = @|size| alloc_opencl(dev, size), 100 | alloc_unified = @|size| alloc_opencl_unified(dev, size), 101 | 
barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE), 102 | }; 103 | 104 | fn spv_cl_get_num_groups() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](24 /* BuiltInNumWorkgroups */); 105 | fn spv_cl_get_local_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](25 /* BuiltInWorkgroupSize */); 106 | fn spv_cl_get_group_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](26 /* BuiltInWorkgroupId */); 107 | fn spv_cl_get_local_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](27 /* BuiltInLocalInvocationId */); 108 | fn spv_cl_get_global_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](28 /* BuiltInGlobalInvocationId */); 109 | fn spv_cl_get_global_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](31 /* BuiltInGlobalSize */); 110 | 111 | fn @opencl_spirv_accelerator(dev: i32) = Accelerator { 112 | exec = @|body| |grid, block| { 113 | let work_item = WorkItem { 114 | tidx = @|| spv_cl_get_local_id()(0) as i32, 115 | tidy = @|| spv_cl_get_local_id()(1) as i32, 116 | tidz = @|| spv_cl_get_local_id()(2) as i32, 117 | bidx = @|| spv_cl_get_group_id()(0) as i32, 118 | bidy = @|| spv_cl_get_group_id()(1) as i32, 119 | bidz = @|| spv_cl_get_group_id()(2) as i32, 120 | gidx = @|| spv_cl_get_global_id()(0) as i32, 121 | gidy = @|| spv_cl_get_global_id()(1) as i32, 122 | gidz = @|| spv_cl_get_global_id()(2) as i32, 123 | bdimx = @|| spv_cl_get_local_size()(0) as i32, 124 | bdimy = @|| spv_cl_get_local_size()(1) as i32, 125 | bdimz = @|| spv_cl_get_local_size()(2) as i32, 126 | gdimx = @|| spv_cl_get_global_size()(0) as i32, 127 | gdimy = @|| spv_cl_get_global_size()(1) as i32, 128 | gdimz = @|| spv_cl_get_global_size()(2) as i32, 129 | nblkx = @|| spv_cl_get_num_groups()(0) as i32, 130 | nblky = @|| spv_cl_get_num_groups()(1) as i32, 131 | nblkz = @|| spv_cl_get_num_groups()(2) as i32 132 | }; 133 | opencl_spirv(dev, grid, block, || @body(work_item)) 134 | }, 135 | sync = @|| synchronize_opencl(dev), 136 | alloc = @|size| alloc_opencl(dev, size), 137 | alloc_unified = @|size| alloc_opencl_unified(dev, size), 138 | barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE), 139 | }; 140 | 141 | static opencl_intrinsics = Intrinsics { 142 | expf = opencl_expf, 143 | exp2f = opencl_exp2f, 144 | logf = opencl_logf, 145 | log2f = opencl_log2f, 146 | powf = opencl_powf, 147 | rsqrtf = opencl_rsqrtf, 148 | sqrtf = opencl_sqrtf, 149 | fabsf = opencl_fabsf, 150 | sinf = opencl_sinf, 151 | cosf = opencl_cosf, 152 | tanf = opencl_tanf, 153 | asinf = opencl_asinf, 154 | acosf = opencl_acosf, 155 | atanf = opencl_atanf, 156 | erff = opencl_erff, 157 | atan2f = opencl_atan2f, 158 | copysignf = opencl_copysignf, 159 | fmaf = opencl_fmaf, 160 | fmaxf = opencl_fmaxf, 161 | fminf = opencl_fminf, 162 | fmodf = opencl_fmodf, 163 | floorf = opencl_floorf, 164 | isinff = opencl_isinff, 165 | isnanf = opencl_isnanf, 166 | isfinitef = opencl_isfinitef, 167 | exp = opencl_exp, 168 | exp2 = opencl_exp2, 169 | log = opencl_log, 170 | log2 = opencl_log2, 171 | pow = opencl_pow, 172 | rsqrt = opencl_rsqrt, 173 | sqrt = opencl_sqrt, 174 | fabs = opencl_fabs, 175 | sin = opencl_sin, 176 | cos = opencl_cos, 177 | tan = opencl_tan, 178 | asin = opencl_asin, 179 | acos = opencl_acos, 180 | atan = opencl_atan, 181 | erf = opencl_erf, 182 | atan2 = opencl_atan2, 183 | copysign = opencl_copysign, 184 | fma = opencl_fma, 185 | fmax = opencl_fmax, 186 | fmin = opencl_fmin, 187 | fmod = opencl_fmod, 188 | floor = opencl_floor, 189 | isinf = opencl_isinf, 190 | isnan = opencl_isnan, 191 | isfinite = 
opencl_isfinite, 192 | min = opencl_min, 193 | max = opencl_max, 194 | }; 195 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_rv.impala: -------------------------------------------------------------------------------- 1 | #[import(cc = "C")] fn rv_mask() -> bool; 2 | #[import(cc = "C")] fn rv_any(_: bool) -> bool; 3 | #[import(cc = "C")] fn rv_all(_: bool) -> bool; 4 | #[import(cc = "C")] fn rv_ballot(_: bool) -> i32; 5 | #[import(cc = "C")] fn rv_extract(_: f32, _: i32) -> f32; 6 | #[import(cc = "C")] fn rv_insert(_: f32, _: i32, _: f32) -> f32; 7 | #[import(cc = "C")] fn rv_load(_: &f32, _: i32) -> f32; 8 | #[import(cc = "C")] fn rv_store(_: &mut f32, _: i32, _: f32) -> (); 9 | #[import(cc = "C")] fn rv_shuffle(_: f32, _: i32) -> f32; 10 | #[import(cc = "C")] fn rv_align(_: &i8, _: i32)-> &i8; 11 | #[import(cc = "C")] fn rv_compact(_: f32, _: bool) -> f32; 12 | #[import(cc = "C")] fn rv_lane_id() -> i32; 13 | #[import(cc = "C")] fn rv_num_lanes() -> i32; 14 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_spirv.impala: -------------------------------------------------------------------------------- 1 | #[import(cc = "device", name = "spirv.builtin")] fn spirv_get_builtin[T](i32) -> T; -------------------------------------------------------------------------------- /platforms/artic/intrinsics_thorin.impala: -------------------------------------------------------------------------------- 1 | #[import(cc = "builtin")] fn undef[T]() -> T; 2 | #[import(cc = "builtin")] fn sizeof[_]() -> i64; 3 | #[import(cc = "builtin")] fn alignof[_]() -> i64; 4 | #[import(cc = "builtin")] fn bitcast[T, U](_src: U) -> T; 5 | #[import(cc = "builtin")] fn select[T, U](_cond: T, _true: U, _false: U) -> U; 6 | #[import(cc = "builtin")] fn insert[T, U](_tuple: T, _index: i32, _value: U) -> T; 7 | 8 | #[import(cc = "thorin")] fn atomic[T](_binop: u32, _addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> T; // Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub 9 | #[import(cc = "thorin")] fn atomic_load[T](_addr: &T, _order: u32, _scope: &[u8]) -> T; 10 | #[import(cc = "thorin")] fn atomic_store[T](_addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> (); 11 | #[import(cc = "thorin")] fn cmpxchg[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types 12 | #[import(cc = "thorin")] fn cmpxchg_weak[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types 13 | #[import(cc = "thorin")] fn fence(_order: u32, _scope: &[u8]) -> (); 14 | #[import(cc = "thorin")] fn pe_info[T](_src: &[u8], _val: T) -> (); 15 | #[import(cc = "thorin")] fn cuda(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 16 | #[import(cc = "thorin")] fn nvvm(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 17 | #[import(cc = "thorin")] fn opencl(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 18 | #[import(cc = "thorin")] fn opencl_spirv(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 19 | #[import(cc = "thorin")] fn amdgpu_hsa(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 20 | #[import(cc = "thorin")] fn amdgpu_pal(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), 
_body: fn() -> ()) -> (); 21 | #[import(cc = "thorin")] fn levelzero(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 22 | #[import(cc = "thorin")] fn reserve_shared[T](_size: i32) -> &mut addrspace(3)[T]; 23 | #[import(cc = "thorin")] fn hls(_dev: i32, _body: fn() -> ()) -> (); 24 | #[import(cc = "thorin", name = "pipeline")] fn thorin_pipeline(_initiation_interval: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> (); // only for HLS/OpenCL backend 25 | #[import(cc = "thorin", name = "parallel")] fn thorin_parallel(_num_threads: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> (); 26 | #[import(cc = "thorin", name = "spawn")] fn thorin_spawn(_body: fn() -> ()) -> i32; 27 | #[import(cc = "thorin")] fn sync(_id: i32) -> (); 28 | #[import(cc = "thorin")] fn vectorize(_vector_length: i32, _body: fn(i32) -> ()) -> (); 29 | 30 | #[import(cc = "thorin", name = "atomic")] fn atomic_p1[T](_binop: u32, _addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> T; 31 | #[import(cc = "thorin", name = "atomic")] fn atomic_p3[T](_binop: u32, _addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> T; 32 | #[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p1[T](_addr: &addrspace(1)T, _order: u32, _scope: &[u8]) -> T; 33 | #[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p3[T](_addr: &addrspace(3)T, _order: u32, _scope: &[u8]) -> T; 34 | #[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p1[T](_addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> (); 35 | #[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p3[T](_addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> (); 36 | #[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); 37 | #[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); 38 | #[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); 39 | #[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); 40 | 41 | fn @pipeline(body: fn(i32) -> ()) = @|initiation_interval: i32, lower: i32, upper: i32| thorin_pipeline(initiation_interval, lower, upper, body); 42 | fn @parallel(body: fn(i32) -> ()) = @|num_threads: i32, lower: i32, upper: i32| thorin_parallel(num_threads, lower, upper, body); 43 | fn @spawn(body: fn() -> ()) = @|| thorin_spawn(body); 44 | -------------------------------------------------------------------------------- /platforms/artic/runtime.impala: -------------------------------------------------------------------------------- 1 | #[import(cc = "C", name = "anydsl_info")] fn runtime_info() -> (); 2 | #[import(cc = "C", name = "anydsl_device_name")] fn runtime_device_name(_device: i32) -> &[u8]; 3 | #[import(cc = "C", name = "anydsl_device_check_feature_support")] fn runtime_device_check_feature_support(_device: i32, _feature: &[u8]) -> bool; 4 | 5 | #[import(cc = "C", name = "anydsl_alloc")] fn runtime_alloc(_device: i32, _size: i64) -> &mut [i8]; 6 | #[import(cc = "C", name = "anydsl_alloc_host")] fn runtime_alloc_host(_device: i32, _size: i64) -> 
&mut [i8]; 7 | #[import(cc = "C", name = "anydsl_alloc_unified")] fn runtime_alloc_unified(_device: i32, _size: i64) -> &mut [i8]; 8 | #[import(cc = "C", name = "anydsl_copy")] fn runtime_copy(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> (); 9 | #[import(cc = "C", name = "anydsl_get_device_ptr")] fn runtime_get_device_ptr(_device: i32, _ptr: &[i8]) -> &[i8]; 10 | #[import(cc = "C", name = "anydsl_synchronize")] fn runtime_synchronize(_device: i32) -> (); 11 | #[import(cc = "C", name = "anydsl_release")] fn runtime_release(_device: i32, _ptr: &[i8]) -> (); 12 | #[import(cc = "C", name = "anydsl_release_host")] fn runtime_release_host(_device: i32, _ptr: &[i8]) -> (); 13 | 14 | #[import(cc = "C", name = "anydsl_random_seed")] fn random_seed(_: u32) -> (); 15 | #[import(cc = "C", name = "anydsl_random_val_f32")] fn random_val_f32() -> f32; 16 | #[import(cc = "C", name = "anydsl_random_val_u64")] fn random_val_u64() -> u64; 17 | 18 | #[import(cc = "C", name = "anydsl_get_micro_time")] fn get_micro_time() -> i64; 19 | #[import(cc = "C", name = "anydsl_get_nano_time")] fn get_nano_time() -> i64; 20 | #[import(cc = "C", name = "anydsl_get_kernel_time")] fn get_kernel_time() -> i64; 21 | 22 | #[import(cc = "C", name = "anydsl_print_i16")] fn print_i16(_: i16) -> (); 23 | #[import(cc = "C", name = "anydsl_print_i32")] fn print_i32(_: i32) -> (); 24 | #[import(cc = "C", name = "anydsl_print_i64")] fn print_i64(_: i64) -> (); 25 | #[import(cc = "C", name = "anydsl_print_u16")] fn print_u16(_: u16) -> (); 26 | #[import(cc = "C", name = "anydsl_print_u32")] fn print_u32(_: u32) -> (); 27 | #[import(cc = "C", name = "anydsl_print_u64")] fn print_u64(_: u64) -> (); 28 | #[import(cc = "C", name = "anydsl_print_f32")] fn print_f32(_: f32) -> (); 29 | #[import(cc = "C", name = "anydsl_print_f64")] fn print_f64(_: f64) -> (); 30 | #[import(cc = "C", name = "anydsl_print_char")] fn print_char(_: u8) -> (); 31 | #[import(cc = "C", name = "anydsl_print_string")] fn print_string(_: &[u8]) -> (); 32 | #[import(cc = "C", name = "anydsl_print_flush")] fn print_flush() -> (); 33 | 34 | // TODO 35 | //struct Buffer[T] { 36 | // data : &mut [T], 37 | // size : i64, 38 | // device : i32 39 | //} 40 | // 41 | //fn @alloc[T](device: i32, size: i64) = Buffer[T] { 42 | // data = runtime_alloc(device, size * sizeof[T]()) as &mut [T], 43 | // size = size, 44 | // device = device 45 | //}; 46 | //fn @alloc_host[T](device: i32, size: i64) = Buffer[T] { 47 | // data = runtime_alloc_host(device, size * sizeof[T]()) as &mut [T], 48 | // size = size, 49 | // device = device 50 | //}; 51 | //fn @alloc_unified[T](device: i32, size: i64) = Buffer[T] { 52 | // data = runtime_alloc_unified(device, size * sizeof[T]()) as &mut [T], 53 | // size = size, 54 | // device = device 55 | //}; 56 | // 57 | //fn @release[T](buf: Buffer[T]) = runtime_release(buf.device, buf.data as &[i8]); 58 | //fn @alloc_cpu[T](size: i64) = alloc[T](0, size); 59 | //fn @alloc_cuda[T](dev: i32, size: i64) = alloc[T](runtime_device(1, dev), size); 60 | //fn @alloc_cuda_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(1, dev), size); 61 | //fn @alloc_cuda_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(1, dev), size); 62 | //fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev)); 63 | //fn @alloc_opencl[T](dev: i32, size: i64) = alloc[T](runtime_device(2, dev), size); 64 | //fn @alloc_opencl_unified[T](dev: i32, size: i64) = 
alloc_unified[T](runtime_device(2, dev), size); 65 | //fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev)); 66 | //fn @alloc_hsa[T](dev: i32, size: i64) = alloc[T](runtime_device(3, dev), size); 67 | //fn @alloc_hsa_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(3, dev), size); 68 | //fn @alloc_hsa_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(3, dev), size); 69 | //fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev)); 70 | //fn @alloc_pal[T](dev: i32, size: i64) = alloc[T](runtime_device(4, dev), size); 71 | //fn @alloc_pal_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(4, dev), size); 72 | //fn @alloc_pal_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(4, dev), size); 73 | //fn @synchronize_pal(dev: i32) = runtime_synchronize(runtime_device(4, dev)); 74 | // 75 | //fn @copy[T](src: Buffer[T], dst: Buffer[T]) = runtime_copy(src.device, src.data as &[i8], 0, dst.device, dst.data as &mut [i8], 0, src.size); 76 | //fn @copy_offset[T](src: Buffer[T], off_src: i64, dst: Buffer[T], off_dst: i64, size: i64) = runtime_copy(src.device, src.data as &[i8], off_src, dst.device, dst.data as &mut [i8], off_dst, size); 77 | 78 | struct Buffer { 79 | data : &mut [i8], 80 | size : i64, 81 | device : i32 82 | } 83 | 84 | fn @alloc(device: i32, size: i64) = Buffer { 85 | data = runtime_alloc(device, size), 86 | size = size, 87 | device = device 88 | }; 89 | fn @alloc_host(device: i32, size: i64) = Buffer { 90 | data = runtime_alloc_host(device, size), 91 | size = size, 92 | device = device 93 | }; 94 | fn @alloc_unified(device: i32, size: i64) = Buffer { 95 | data = runtime_alloc_unified(device, size), 96 | size = size, 97 | device = device 98 | }; 99 | fn @release(buf: Buffer) = runtime_release(buf.device, buf.data); 100 | 101 | fn @runtime_device(platform: i32, device: i32) -> i32 { platform | (device << 4) } 102 | 103 | fn @alloc_cpu(size: i64) = alloc(0, size); 104 | fn @alloc_cuda(dev: i32, size: i64) = alloc(runtime_device(1, dev), size); 105 | fn @alloc_cuda_host(dev: i32, size: i64) = alloc_host(runtime_device(1, dev), size); 106 | fn @alloc_cuda_unified(dev: i32, size: i64) = alloc_unified(runtime_device(1, dev), size); 107 | fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev)); 108 | fn @alloc_opencl(dev: i32, size: i64) = alloc(runtime_device(2, dev), size); 109 | fn @alloc_opencl_unified(dev: i32, size: i64) = alloc_unified(runtime_device(2, dev), size); 110 | fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev)); 111 | fn @alloc_hls(dev: i32, size: i64) -> Buffer { alloc(runtime_device(2, dev), size) } 112 | fn @alloc_hls_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(2, dev), size) } 113 | fn @synchronize_hls(dev: i32) -> () { runtime_synchronize(runtime_device(2, dev)) } 114 | fn @alloc_hsa(dev: i32, size: i64) = alloc(runtime_device(3, dev), size); 115 | fn @alloc_hsa_host(dev: i32, size: i64) = alloc_host(runtime_device(3, dev), size); 116 | fn @alloc_hsa_unified(dev: i32, size: i64) = alloc_unified(runtime_device(3, dev), size); 117 | fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev)); 118 | fn @alloc_pal(dev: i32, size: i64) = alloc(runtime_device(4, dev), size); 119 | fn @alloc_pal_host(dev: i32, size: i64) = alloc_host(runtime_device(4, dev), size); 120 | fn @alloc_pal_unified(dev: i32, size: i64) = alloc_unified(runtime_device(4, dev), size); 121 | fn @synchronize_pal(dev: i32) 
= runtime_synchronize(runtime_device(4, dev)); 122 | fn @alloc_levelzero(dev: i32, size: i64) = alloc(runtime_device(5, dev), size); 123 | fn @alloc_levelzero_host(dev: i32, size: i64) = alloc_host(runtime_device(5, dev), size); 124 | fn @alloc_levelzero_unified(dev: i32, size: i64) = alloc_unified(runtime_device(5, dev), size); 125 | fn @synchronize_levelzero(dev: i32) = runtime_synchronize(runtime_device(5, dev)); 126 | 127 | fn @copy(src: Buffer, dst: Buffer) = runtime_copy(src.device, src.data, 0, dst.device, dst.data, 0, src.size); 128 | fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) = runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size); 129 | 130 | 131 | // range, range_step, unroll, unroll_step, etc. 132 | fn @unroll_step(body: fn(i32) -> ()) { 133 | fn @(?beg & ?end & ?step) loop(beg: i32, end: i32, step: i32) -> () { 134 | if beg < end { 135 | @body(beg); 136 | loop(beg + step, end, step) 137 | } 138 | } 139 | loop 140 | } 141 | 142 | fn @unroll_step_rev(body: fn(i32) -> ()) { 143 | fn @(?beg & ?end & ?step) loop(end: i32, beg: i32, step: i32) -> () { 144 | if end > beg { 145 | @body(end); 146 | loop(end - step, beg, step) 147 | } 148 | } 149 | loop 150 | } 151 | 152 | fn @range(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)($lower, $upper, 1); 153 | fn @range_step(body: fn(i32) -> ()) = @|lower: i32, upper: i32, step: i32| unroll_step(body)($lower, $upper, step); 154 | fn @range_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)(upper, lower, 1); 155 | 156 | fn @unroll(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)(lower, upper, 1); 157 | fn @unroll_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)(upper, lower, 1); 158 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics.impala: -------------------------------------------------------------------------------- 1 | struct WorkItem { 2 | tidx : fn() -> i32, 3 | tidy : fn() -> i32, 4 | tidz : fn() -> i32, 5 | bidx : fn() -> i32, 6 | bidy : fn() -> i32, 7 | bidz : fn() -> i32, 8 | gidx : fn() -> i32, 9 | gidy : fn() -> i32, 10 | gidz : fn() -> i32, 11 | bdimx : fn() -> i32, 12 | bdimy : fn() -> i32, 13 | bdimz : fn() -> i32, 14 | gdimx : fn() -> i32, 15 | gdimy : fn() -> i32, 16 | gdimz : fn() -> i32, 17 | nblkx : fn() -> i32, 18 | nblky : fn() -> i32, 19 | nblkz : fn() -> i32 20 | } 21 | 22 | struct Accelerator { 23 | exec : fn((i32, i32, i32), // grid 24 | (i32, i32, i32), // block 25 | fn(WorkItem) -> ()) -> (), 26 | sync : fn() -> (), 27 | alloc : fn(i64) -> Buffer, 28 | alloc_unified : fn(i64) -> Buffer, 29 | barrier : fn() -> () 30 | } 31 | 32 | struct Intrinsics { 33 | expf : fn(f32) -> f32, 34 | exp2f : fn(f32) -> f32, 35 | logf : fn(f32) -> f32, 36 | log2f : fn(f32) -> f32, 37 | powf : fn(f32, f32) -> f32, 38 | rsqrtf : fn(f32) -> f32, 39 | sqrtf : fn(f32) -> f32, 40 | fabsf : fn(f32) -> f32, 41 | sinf : fn(f32) -> f32, 42 | cosf : fn(f32) -> f32, 43 | tanf : fn(f32) -> f32, 44 | asinf : fn(f32) -> f32, 45 | acosf : fn(f32) -> f32, 46 | atanf : fn(f32) -> f32, 47 | erff : fn(f32) -> f32, 48 | atan2f : fn(f32, f32) -> f32, 49 | copysignf : fn(f32, f32) -> f32, 50 | fmaf : fn(f32, f32, f32) -> f32, 51 | fmaxf : fn(f32, f32) -> f32, 52 | fminf : fn(f32, f32) -> f32, 53 | fmodf : fn(f32, f32) -> f32, 54 | floorf : fn(f32) -> f32, 55 | isinff : fn(f32) -> i32, 56 | isnanf : fn(f32) -> i32, 57 | isfinitef : fn(f32) -> i32, 
58 | exp : fn(f64) -> f64, 59 | exp2 : fn(f64) -> f64, 60 | log : fn(f64) -> f64, 61 | log2 : fn(f64) -> f64, 62 | pow : fn(f64, f64) -> f64, 63 | rsqrt : fn(f64) -> f64, 64 | sqrt : fn(f64) -> f64, 65 | fabs : fn(f64) -> f64, 66 | sin : fn(f64) -> f64, 67 | cos : fn(f64) -> f64, 68 | tan : fn(f64) -> f64, 69 | asin : fn(f64) -> f64, 70 | acos : fn(f64) -> f64, 71 | atan : fn(f64) -> f64, 72 | erf : fn(f64) -> f64, 73 | atan2 : fn(f64, f64) -> f64, 74 | copysign : fn(f64, f64) -> f64, 75 | fma : fn(f64, f64, f64) -> f64, 76 | fmax : fn(f64, f64) -> f64, 77 | fmin : fn(f64, f64) -> f64, 78 | fmod : fn(f64, f64) -> f64, 79 | floor : fn(f64) -> f64, 80 | isinf : fn(f64) -> i32, 81 | isnan : fn(f64) -> i32, 82 | isfinite : fn(f64) -> i32, 83 | min : fn(i32, i32) -> i32, 84 | max : fn(i32, i32) -> i32, 85 | } 86 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_cpu.impala: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | //fn "sinf" cpu_sinf(f32) -> f32; 3 | //fn "cosf" cpu_cosf(f32) -> f32; 4 | fn "tanf" cpu_tanf(f32) -> f32; 5 | fn "asinf" cpu_asinf(f32) -> f32; 6 | fn "acosf" cpu_acosf(f32) -> f32; 7 | fn "atanf" cpu_atanf(f32) -> f32; 8 | fn "erff" cpu_erff(f32) -> f32; 9 | fn "fmodf" cpu_fmodf(f32, f32) -> f32; 10 | fn "atan2f" cpu_atan2f(f32, f32) -> f32; 11 | fn "anydsl_isinff" cpu_isinff(f32) -> i32; 12 | fn "anydsl_isnanf" cpu_isnanf(f32) -> i32; 13 | fn "anydsl_isfinitef" cpu_isfinitef(f32) -> i32; 14 | //fn "sin" cpu_sin(f64) -> f64; 15 | //fn "cos" cpu_cos(f64) -> f64; 16 | fn "tan" cpu_tan(f64) -> f64; 17 | fn "asin" cpu_asin(f64) -> f64; 18 | fn "acos" cpu_acos(f64) -> f64; 19 | fn "atan" cpu_atan(f64) -> f64; 20 | fn "erf" cpu_erf(f64) -> f64; 21 | fn "fmod" cpu_fmod(f64, f64) -> f64; 22 | fn "atan2" cpu_atan2(f64, f64) -> f64; 23 | fn "anydsl_isinf" cpu_isinf(f64) -> i32; 24 | fn "anydsl_isnan" cpu_isnan(f64) -> i32; 25 | fn "anydsl_isfinite" cpu_isfinite(f64) -> i32; 26 | } 27 | 28 | extern "device" { 29 | fn "llvm.exp.f32" cpu_expf(f32) -> f32; 30 | fn "llvm.exp2.f32" cpu_exp2f(f32) -> f32; 31 | fn "llvm.log.f32" cpu_logf(f32) -> f32; 32 | fn "llvm.log2.f32" cpu_log2f(f32) -> f32; 33 | fn "llvm.pow.f32" cpu_powf(f32, f32) -> f32; 34 | fn "llvm.sqrt.f32" cpu_sqrtf(f32) -> f32; 35 | fn "llvm.fabs.f32" cpu_fabsf(f32) -> f32; 36 | fn "llvm.sin.f32" cpu_sinf(f32) -> f32; 37 | fn "llvm.cos.f32" cpu_cosf(f32) -> f32; 38 | fn "llvm.floor.f32" cpu_floorf(f32) -> f32; 39 | fn "llvm.fma.f32" cpu_fmaf(f32, f32, f32) -> f32; 40 | fn "llvm.fmuladd.f32" cpu_madf(f32, f32, f32) -> f32; 41 | fn "llvm.copysign.f32" cpu_copysignf(f32, f32) -> f32; 42 | fn "llvm.minnum.f32" cpu_fminf(f32, f32) -> f32; 43 | fn "llvm.maxnum.f32" cpu_fmaxf(f32, f32) -> f32; 44 | fn "llvm.exp.f64" cpu_exp(f64) -> f64; 45 | fn "llvm.exp2.f64" cpu_exp2(f64) -> f64; 46 | fn "llvm.log.f64" cpu_log(f64) -> f64; 47 | fn "llvm.log2.f64" cpu_log2(f64) -> f64; 48 | fn "llvm.pow.f64" cpu_pow(f64, f64) -> f64; 49 | fn "llvm.sqrt.f64" cpu_sqrt(f64) -> f64; 50 | fn "llvm.fabs.f64" cpu_fabs(f64) -> f64; 51 | fn "llvm.sin.f64" cpu_sin(f64) -> f64; 52 | fn "llvm.cos.f64" cpu_cos(f64) -> f64; 53 | fn "llvm.floor.f64" cpu_floor(f64) -> f64; 54 | fn "llvm.fma.f64" cpu_fma(f64, f64, f64) -> f64; 55 | fn "llvm.fmuladd.f64" cpu_mad(f64, f64, f64) -> f64; 56 | fn "llvm.copysign.f64" cpu_copysign(f64, f64) -> f64; 57 | fn "llvm.minnum.f64" cpu_fmin(f64, f64) -> f64; 58 | fn "llvm.maxnum.f64" cpu_fmax(f64, f64) -> f64; 59 | fn 
"llvm.ctpop.i32" cpu_popcount32(i32) -> i32; 60 | fn "llvm.ctpop.i64" cpu_popcount64(i64) -> i64; 61 | fn "llvm.ctlz.i32" cpu_clz32(i32, bool) -> i32; 62 | fn "llvm.ctlz.i64" cpu_clz64(i64, bool) -> i64; 63 | fn "llvm.cttz.i32" cpu_ctz32(i32, bool) -> i32; 64 | fn "llvm.cttz.i64" cpu_ctz64(i64, bool) -> i64; 65 | fn "llvm.x86.bmi.pext.32" cpu_pext32(i32, i32) -> i32; 66 | fn "llvm.x86.bmi.pext.64" cpu_pext64(i64, i64) -> i64; 67 | fn "llvm.x86.bmi.pdep.32" cpu_pdep32(i32, i32) -> i32; 68 | fn "llvm.x86.bmi.pdep.64" cpu_pdep64(i64, i64) -> i64; 69 | fn "llvm.prefetch.p0" cpu_prefetch(&u8, i32, i32, i32) -> (); 70 | } 71 | 72 | // 73 | // atomics 74 | // 0 1 2 3 4 5 6 7 8 9 10 11 12 75 | // operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub 76 | // 0 1 2 4 5 6 7 77 | // ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent 78 | // syncscope: singlethread "" (system) 79 | // 80 | 81 | fn @cpu_atomic_xchg(a: &mut i32, b: i32) -> i32 { atomic(0u32, a, b, 7u32, "") } 82 | fn @cpu_atomic_add(a: &mut i32, b: i32) -> i32 { atomic(1u32, a, b, 7u32, "") } 83 | fn @cpu_atomic_sub(a: &mut i32, b: i32) -> i32 { atomic(2u32, a, b, 7u32, "") } 84 | fn @cpu_atomic_max(a: &mut i32, b: i32) -> i32 { atomic(7u32, a, b, 7u32, "") } 85 | fn @cpu_atomic_min(a: &mut i32, b: i32) -> i32 { atomic(8u32, a, b, 7u32, "") } 86 | 87 | static cpu_intrinsics = Intrinsics { 88 | expf : cpu_expf, 89 | exp2f : cpu_exp2f, 90 | logf : cpu_logf, 91 | log2f : cpu_log2f, 92 | powf : cpu_powf, 93 | rsqrtf : @|a| { 1.0f / cpu_sqrtf(a) }, 94 | sqrtf : cpu_sqrtf, 95 | fabsf : cpu_fabsf, 96 | sinf : cpu_sinf, 97 | cosf : cpu_cosf, 98 | tanf : cpu_tanf, 99 | asinf : cpu_asinf, 100 | acosf : cpu_acosf, 101 | atanf : cpu_atanf, 102 | erff : cpu_erff, 103 | atan2f : cpu_atan2f, 104 | copysignf : cpu_copysignf, 105 | fmaf : cpu_fmaf, 106 | fmaxf : cpu_fmaxf, 107 | fminf : cpu_fminf, 108 | fmodf : cpu_fmodf, 109 | floorf : cpu_floorf, 110 | isinff : cpu_isinff, 111 | isnanf : cpu_isnanf, 112 | isfinitef : cpu_isfinitef, 113 | exp : cpu_exp, 114 | exp2 : cpu_exp2, 115 | log : cpu_log, 116 | log2 : cpu_log2, 117 | pow : cpu_pow, 118 | rsqrt : @|a| { 1.0 / cpu_sqrt(a) }, 119 | sqrt : cpu_sqrt, 120 | fabs : cpu_fabs, 121 | sin : cpu_sin, 122 | cos : cpu_cos, 123 | tan : cpu_tan, 124 | asin : cpu_asin, 125 | acos : cpu_acos, 126 | atan : cpu_atan, 127 | erf : cpu_erf, 128 | atan2 : cpu_atan2, 129 | copysign : cpu_copysign, 130 | fma : cpu_fma, 131 | fmax : cpu_fmax, 132 | fmin : cpu_fmin, 133 | fmod : cpu_fmod, 134 | floor : cpu_floor, 135 | isinf : cpu_isinf, 136 | isnan : cpu_isnan, 137 | isfinite : cpu_isfinite, 138 | min : @|a, b| { if a < b { a } else { b } }, 139 | max : @|a, b| { if a > b { a } else { b } }, 140 | }; 141 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_hls.impala: -------------------------------------------------------------------------------- 1 | extern "device" { 2 | // no declarations are emitted for "device" functions 3 | fn "exp" hls_expf(f32) -> f32; 4 | fn "exp2" hls_exp2f(f32) -> f32; 5 | fn "log" hls_logf(f32) -> f32; 6 | fn "log2" hls_log2f(f32) -> f32; 7 | fn "pow" hls_powf(f32, f32) -> f32; 8 | fn "rsqrt" hls_rsqrtf(f32) -> f32; 9 | fn "sqrt" hls_sqrtf(f32) -> f32; 10 | fn "fabs" hls_fabsf(f32) -> f32; 11 | fn "sin" hls_sinf(f32) -> f32; 12 | fn "cos" hls_cosf(f32) -> f32; 13 | fn "tan" hls_tanf(f32) -> f32; 14 | fn "asin" hls_asinf(f32) -> f32; 15 | fn "acos" hls_acosf(f32) -> f32; 16 | fn 
"atan" hls_atanf(f32) -> f32; 17 | fn "erf" hls_erff(f32) -> f32; 18 | fn "atan2" hls_atan2f(f32, f32) -> f32; 19 | fn "fmod" hls_fmodf(f32, f32) -> f32; 20 | fn "floor" hls_floorf(f32) -> f32; 21 | fn "isinf" hls_isinff(f32) -> i32; 22 | fn "isnan" hls_isnanf(f32) -> i32; 23 | fn "isfinite" hls_isfinitef(f32) -> i32; 24 | fn "fma" hls_fmaf(f32, f32, f32) -> f32; 25 | fn "mad" hls_madf(f32, f32, f32) -> f32; 26 | fn "copysign" hls_copysignf(f32, f32) -> f32; 27 | fn "exp" hls_exp(f64) -> f64; 28 | fn "exp2" hls_exp2(f64) -> f64; 29 | fn "log" hls_log(f64) -> f64; 30 | fn "log2" hls_log2(f64) -> f64; 31 | fn "pow" hls_pow(f64, f64) -> f64; 32 | fn "rsqrt" hls_rsqrt(f64) -> f64; 33 | fn "sqrt" hls_sqrt(f64) -> f64; 34 | fn "fabs" hls_fabs(f64) -> f64; 35 | fn "sin" hls_sin(f64) -> f64; 36 | fn "cos" hls_cos(f64) -> f64; 37 | fn "tan" hls_tan(f64) -> f64; 38 | fn "asin" hls_asin(f64) -> f64; 39 | fn "acos" hls_acos(f64) -> f64; 40 | fn "atan" hls_atan(f64) -> f64; 41 | fn "erf" hls_erf(f64) -> f64; 42 | fn "atan2" hls_atan2(f64, f64) -> f64; 43 | fn "fmod" hls_fmod(f64, f64) -> f64; 44 | fn "floor" hls_floor(f64) -> f64; 45 | fn "isinf" hls_isinf(f64) -> i32; 46 | fn "isnan" hls_isnan(f64) -> i32; 47 | fn "isfinite" hls_isfinite(f64) -> i32; 48 | fn "fma" hls_fma(f64, f64, f64) -> f64; 49 | fn "mad" hls_mad(f64, f64, f64) -> f64; 50 | fn "copysign" hls_copysign(f64, f64) -> f64; 51 | fn "fmin" hls_fminf(f32, f32) -> f32; 52 | fn "fmax" hls_fmaxf(f32, f32) -> f32; 53 | fn "fmin" hls_fmin(f64, f64) -> f64; 54 | fn "fmax" hls_fmax(f64, f64) -> f64; 55 | fn "min" hls_min(i32, i32) -> i32; 56 | fn "max" hls_max(i32, i32) -> i32; 57 | } 58 | 59 | // channel scalar types 60 | struct channel_u8 { data : u8 }; 61 | struct channel_i32 { data : i32 }; 62 | struct channel_f32 { data : f32 }; 63 | 64 | // channel array types 65 | struct channel1_u8 { data : [u8 * 1 ] }; 66 | struct channel2_u8 { data : [u8 * 2 ] }; 67 | struct channel4_u8 { data : [u8 * 4 ] }; 68 | struct channel8_u8 { data : [u8 * 8 ] }; 69 | struct channel16_u8 { data : [u8 * 16 ] }; 70 | struct channel32_u8 { data : [u8 * 32 ] }; 71 | struct channel64_u8 { data : [u8 * 64 ] }; 72 | struct channel128_u8 { data : [u8 * 128] }; 73 | 74 | struct channel1_i32 { data : [i32 * 1 ] }; 75 | struct channel2_i32 { data : [i32 * 2 ] }; 76 | struct channel4_i32 { data : [i32 * 4 ] }; 77 | struct channel8_i32 { data : [i32 * 8 ] }; 78 | struct channel16_i32 { data : [i32 * 16 ] }; 79 | struct channel32_i32 { data : [i32 * 32 ] }; 80 | struct channel64_i32 { data : [i32 * 64 ] }; 81 | struct channel128_i32 { data : [i32 * 128] }; 82 | 83 | struct channel1_f32 { data : [f32 * 1 ] }; 84 | struct channel2_f32 { data : [f32 * 2 ] }; 85 | struct channel4_f32 { data : [f32 * 4 ] }; 86 | struct channel8_f32 { data : [f32 * 8 ] }; 87 | struct channel16_f32 { data : [f32 * 16 ] }; 88 | struct channel32_f32 { data : [f32 * 32 ] }; 89 | struct channel64_f32 { data : [f32 * 64 ] }; 90 | struct channel128_f32 { data : [f32 * 128] }; 91 | 92 | extern "device" { 93 | fn print_pragma(&[u8]) -> (); 94 | // u8 scalar 95 | fn "read_channel" read_channel_u8 ( &mut channel_u8 ) -> u8 ; 96 | fn "write_channel" write_channel_u8 ( &mut channel_u8, u8) -> (); 97 | 98 | // u8 array 99 | fn "read_channel" read_channel1_u8 ( &mut channel1_u8 ) -> [u8 * 1 ]; 100 | fn "read_channel" read_channel2_u8 ( &mut channel2_u8 ) -> [u8 * 2 ]; 101 | fn "read_channel" read_channel4_u8 ( &mut channel4_u8 ) -> [u8 * 4 ]; 102 | fn "read_channel" read_channel8_u8 ( &mut channel8_u8 ) -> [u8 * 8 
]; 103 | fn "read_channel" read_channel16_u8 ( &mut channel16_u8 ) -> [u8 * 16 ]; 104 | fn "read_channel" read_channel32_u8 ( &mut channel32_u8 ) -> [u8 * 32 ]; 105 | fn "read_channel" read_channel64_u8 ( &mut channel64_u8 ) -> [u8 * 64 ]; 106 | fn "read_channel" read_channel128_u8( &mut channel128_u8) -> [u8 * 128]; 107 | 108 | fn "write_channel" write_channel1_u8 ( &mut channel1_u8, [u8 * 1 ] ) -> (); 109 | fn "write_channel" write_channel2_u8 ( &mut channel2_u8, [u8 * 2 ] ) -> (); 110 | fn "write_channel" write_channel4_u8 ( &mut channel4_u8, [u8 * 4 ] ) -> (); 111 | fn "write_channel" write_channel8_u8 ( &mut channel8_u8, [u8 * 8 ] ) -> (); 112 | fn "write_channel" write_channel16_u8 ( &mut channel16_u8, [u8 * 16 ] ) -> (); 113 | fn "write_channel" write_channel32_u8 ( &mut channel32_u8, [u8 * 32 ] ) -> (); 114 | fn "write_channel" write_channel64_u8 ( &mut channel64_u8, [u8 * 64 ] ) -> (); 115 | fn "write_channel" write_channel128_u8( &mut channel128_u8, [u8 * 128] ) -> (); 116 | fn " " bitcast_channel_u8 ( &mut channel1_u8) -> [u8 * 2 ]; 117 | 118 | // i32 scalar 119 | fn "read_channel" read_channel_i32 ( &mut channel_i32 ) -> i32; 120 | fn "write_channel" write_channel_i32 ( &mut channel_i32, i32 ) -> (); 121 | 122 | // i32 array 123 | fn "read_channel" read_channel1_i32 ( &mut channel1_i32 ) -> [i32 * 1 ]; 124 | fn "read_channel" read_channel2_i32 ( &mut channel2_i32 ) -> [i32 * 2 ]; 125 | fn "read_channel" read_channel4_i32 ( &mut channel4_i32 ) -> [i32 * 4 ]; 126 | fn "read_channel" read_channel8_i32 ( &mut channel8_i32 ) -> [i32 * 8 ]; 127 | fn "read_channel" read_channel16_i32 ( &mut channel16_i32 ) -> [i32 * 16 ]; 128 | fn "read_channel" read_channel32_i32 ( &mut channel32_i32 ) -> [i32 * 32 ]; 129 | fn "read_channel" read_channel64_i32 ( &mut channel64_i32 ) -> [i32 * 64 ]; 130 | fn "read_channel" read_channel128_i32( &mut channel128_i32) -> [i32 * 128]; 131 | 132 | fn "write_channel" write_channel1_i32 ( &mut channel1_i32, [i32 * 1 ] )-> (); 133 | fn "write_channel" write_channel2_i32 ( &mut channel2_i32, [i32 * 2 ] ) -> (); 134 | fn "write_channel" write_channel4_i32 ( &mut channel4_i32, [i32 * 4 ] ) -> (); 135 | fn "write_channel" write_channel8_i32 ( &mut channel8_i32, [i32 * 8 ] ) -> (); 136 | fn "write_channel" write_channel16_i32 ( &mut channel16_i32, [i32 * 16 ] ) -> (); 137 | fn "write_channel" write_channel32_i32 ( &mut channel32_i32, [i32 * 32 ] ) -> (); 138 | fn "write_channel" write_channel64_i32 ( &mut channel64_i32, [i32 * 64 ] ) -> (); 139 | fn "write_channel" write_channel128_i32( &mut channel128_i32, [i32 * 128]) -> (); 140 | fn " " bitcast_channel_i32 ( &mut channel1_i32) -> [i32 * 2 ]; 141 | 142 | // f32 scalar 143 | fn "read_channel" read_channel_f32 ( &mut channel_f32 ) -> f32; 144 | fn "write_channel" write_channel_f32 ( &mut channel_f32, f32 ) -> (); 145 | 146 | // f32 array 147 | fn "read_channel" read_channel1_f32 ( &mut channel1_f32 ) -> [f32 * 1 ]; 148 | fn "read_channel" read_channel2_f32 ( &mut channel2_f32 ) -> [f32 * 2 ]; 149 | fn "read_channel" read_channel4_f32 ( &mut channel4_f32 ) -> [f32 * 4 ]; 150 | fn "read_channel" read_channel8_f32 ( &mut channel8_f32 ) -> [f32 * 8 ]; 151 | fn "read_channel" read_channel16_f32 ( &mut channel16_f32 ) -> [f32 * 16 ]; 152 | fn "read_channel" read_channel32_f32 ( &mut channel32_f32 ) -> [f32 * 32 ]; 153 | fn "read_channel" read_channel64_f32 ( &mut channel64_f32 ) -> [f32 * 64 ]; 154 | fn "read_channel" read_channel128_f32( &mut channel128_f32) -> [f32 * 128]; 155 | 156 | fn "write_channel" 
write_channel1_f32 ( &mut channel1_f32, [f32 * 1 ]) -> (); 157 | fn "write_channel" write_channel2_f32 ( &mut channel2_f32, [f32 * 2 ]) -> (); 158 | fn "write_channel" write_channel4_f32 ( &mut channel4_f32, [f32 * 4 ]) -> (); 159 | fn "write_channel" write_channel8_f32 ( &mut channel8_f32, [f32 * 8 ]) -> (); 160 | fn "write_channel" write_channel16_f32 ( &mut channel16_f32, [f32 * 16 ]) -> (); 161 | fn "write_channel" write_channel32_f32 ( &mut channel32_f32, [f32 * 32 ]) -> (); 162 | fn "write_channel" write_channel64_f32 ( &mut channel64_f32, [f32 * 64 ]) -> (); 163 | fn "write_channel" write_channel128_f32( &mut channel128_f32, [f32 * 128]) -> (); 164 | fn " " bitcast_channel_f32 ( &mut channel1_f32) -> [f32 * 2 ]; 165 | } 166 | 167 | fn @hls_accelerator(dev: i32) -> Accelerator { 168 | Accelerator { 169 | exec : @|grid, block, body| { 170 | let work_item = WorkItem { 171 | tidx : @|| 0, tidy : @|| 0, tidz : @|| 0, 172 | bidx : @|| 0, bidy : @|| 0, bidz : @|| 0, 173 | gidx : @|| 0, gidy : @|| 0, gidz : @|| 0, 174 | bdimx : @|| 1, bdimy : @|| 1, bdimz : @|| 1, 175 | gdimx : @|| 1, gdimy : @|| 1, gdimz : @|| 1, 176 | nblkx : @|| 1, nblky : @|| 1, nblkz : @|| 1 177 | }; 178 | hls(dev, || @@body(work_item)); 179 | }, 180 | sync : @|| synchronize_hls(dev), 181 | alloc : @|size| alloc_hls(dev, size), 182 | alloc_unified : @|size| alloc_hls_unified(dev, size), 183 | barrier : @|| () 184 | } 185 | }; 186 | 187 | static hls_intrinsics = Intrinsics { 188 | expf : hls_expf, 189 | exp2f : hls_exp2f, 190 | logf : hls_logf, 191 | log2f : hls_log2f, 192 | powf : hls_powf, 193 | rsqrtf : hls_rsqrtf, 194 | sqrtf : hls_sqrtf, 195 | fabsf : hls_fabsf, 196 | sinf : hls_sinf, 197 | cosf : hls_cosf, 198 | tanf : hls_tanf, 199 | asinf : hls_asinf, 200 | acosf : hls_acosf, 201 | atanf : hls_atanf, 202 | erff : hls_erff, 203 | atan2f : hls_atan2f, 204 | copysignf : hls_copysignf, 205 | fmaf : hls_fmaf, 206 | fmaxf : hls_fmaxf, 207 | fminf : hls_fminf, 208 | fmodf : hls_fmodf, 209 | floorf : hls_floorf, 210 | isinff : hls_isinff, 211 | isnanf : hls_isnanf, 212 | isfinitef : hls_isfinitef, 213 | exp : hls_exp, 214 | exp2 : hls_exp2, 215 | log : hls_log, 216 | log2 : hls_log2, 217 | pow : hls_pow, 218 | rsqrt : hls_rsqrt, 219 | sqrt : hls_sqrt, 220 | fabs : hls_fabs, 221 | sin : hls_sin, 222 | cos : hls_cos, 223 | tan : hls_tan, 224 | asin : hls_asin, 225 | acos : hls_acos, 226 | atan : hls_atan, 227 | erf : hls_erf, 228 | atan2 : hls_atan2, 229 | copysign : hls_copysign, 230 | fma : hls_fma, 231 | fmax : hls_fmax, 232 | fmin : hls_fmin, 233 | fmod : hls_fmod, 234 | floor : hls_floor, 235 | isinf : hls_isinf, 236 | isnan : hls_isnan, 237 | isfinite : hls_isfinite, 238 | min : hls_min, 239 | max : hls_max, 240 | }; 241 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_opencl.impala: -------------------------------------------------------------------------------- 1 | extern "device" { 2 | // no declarations are emitted for "device" functions 3 | fn "barrier" opencl_barrier(u32) -> (); 4 | fn "exp" opencl_expf(f32) -> f32; 5 | fn "exp2" opencl_exp2f(f32) -> f32; 6 | fn "log" opencl_logf(f32) -> f32; 7 | fn "log2" opencl_log2f(f32) -> f32; 8 | fn "pow" opencl_powf(f32, f32) -> f32; 9 | fn "rsqrt" opencl_rsqrtf(f32) -> f32; 10 | fn "sqrt" opencl_sqrtf(f32) -> f32; 11 | fn "fabs" opencl_fabsf(f32) -> f32; 12 | fn "sin" opencl_sinf(f32) -> f32; 13 | fn "cos" opencl_cosf(f32) -> f32; 14 | fn "tan" opencl_tanf(f32) -> f32; 15 | fn "asin" opencl_asinf(f32) -> 
f32; 16 | fn "acos" opencl_acosf(f32) -> f32; 17 | fn "atan" opencl_atanf(f32) -> f32; 18 | fn "erf" opencl_erff(f32) -> f32; 19 | fn "atan2" opencl_atan2f(f32, f32) -> f32; 20 | fn "fmod" opencl_fmodf(f32, f32) -> f32; 21 | fn "floor" opencl_floorf(f32) -> f32; 22 | fn "isinf" opencl_isinff(f32) -> i32; 23 | fn "isnan" opencl_isnanf(f32) -> i32; 24 | fn "isfinite" opencl_isfinitef(f32) -> i32; 25 | fn "fma" opencl_fmaf(f32, f32, f32) -> f32; 26 | fn "mad" opencl_madf(f32, f32, f32) -> f32; 27 | fn "copysign" opencl_copysignf(f32, f32) -> f32; 28 | fn "exp" opencl_exp(f64) -> f64; 29 | fn "exp2" opencl_exp2(f64) -> f64; 30 | fn "log" opencl_log(f64) -> f64; 31 | fn "log2" opencl_log2(f64) -> f64; 32 | fn "pow" opencl_pow(f64, f64) -> f64; 33 | fn "rsqrt" opencl_rsqrt(f64) -> f64; 34 | fn "sqrt" opencl_sqrt(f64) -> f64; 35 | fn "fabs" opencl_fabs(f64) -> f64; 36 | fn "sin" opencl_sin(f64) -> f64; 37 | fn "cos" opencl_cos(f64) -> f64; 38 | fn "tan" opencl_tan(f64) -> f64; 39 | fn "asin" opencl_asin(f64) -> f64; 40 | fn "acos" opencl_acos(f64) -> f64; 41 | fn "atan" opencl_atan(f64) -> f64; 42 | fn "erf" opencl_erf(f64) -> f64; 43 | fn "atan2" opencl_atan2(f64, f64) -> f64; 44 | fn "fmod" opencl_fmod(f64, f64) -> f64; 45 | fn "floor" opencl_floor(f64) -> f64; 46 | fn "isinf" opencl_isinf(f64) -> i32; 47 | fn "isnan" opencl_isnan(f64) -> i32; 48 | fn "isfinite" opencl_isfinite(f64) -> i32; 49 | fn "fma" opencl_fma(f64, f64, f64) -> f64; 50 | fn "mad" opencl_mad(f64, f64, f64) -> f64; 51 | fn "copysign" opencl_copysign(f64, f64) -> f64; 52 | fn "fmin" opencl_fminf(f32, f32) -> f32; 53 | fn "fmax" opencl_fmaxf(f32, f32) -> f32; 54 | fn "fmin" opencl_fmin(f64, f64) -> f64; 55 | fn "fmax" opencl_fmax(f64, f64) -> f64; 56 | fn "min" opencl_min(i32, i32) -> i32; 57 | fn "max" opencl_max(i32, i32) -> i32; 58 | fn "atomic_add" opencl_atomic_add_global(&mut[1]i32, i32) -> i32; 59 | fn "atomic_add" opencl_atomic_add_shared(&mut[3]i32, i32) -> i32; 60 | fn "atomic_min" opencl_atomic_min_global(&mut[1]i32, i32) -> i32; 61 | fn "atomic_min" opencl_atomic_min_shared(&mut[3]i32, i32) -> i32; 62 | fn "get_work_dim" opencl_get_work_dim() -> u32; 63 | fn "get_global_size" opencl_get_global_size(u32) -> u64; 64 | fn "get_global_id" opencl_get_global_id(u32) -> u64; 65 | fn "get_local_size" opencl_get_local_size(u32) -> u64; 66 | fn "get_local_id" opencl_get_local_id(u32) -> u64; 67 | fn "get_num_groups" opencl_get_num_groups(u32) -> u64; 68 | fn "get_group_id" opencl_get_group_id(u32) -> u64; 69 | fn "get_global_offset" opencl_get_global_offset(u32) -> u64; 70 | } 71 | 72 | fn @opencl_accelerator(dev: i32) -> Accelerator { 73 | Accelerator { 74 | exec : @|grid, block, body| { 75 | let work_item = WorkItem { 76 | tidx : @|| opencl_get_local_id(0u32) as i32, 77 | tidy : @|| opencl_get_local_id(1u32) as i32, 78 | tidz : @|| opencl_get_local_id(2u32) as i32, 79 | bidx : @|| opencl_get_group_id(0u32) as i32, 80 | bidy : @|| opencl_get_group_id(1u32) as i32, 81 | bidz : @|| opencl_get_group_id(2u32) as i32, 82 | gidx : @|| opencl_get_global_id(0u32) as i32, 83 | gidy : @|| opencl_get_global_id(1u32) as i32, 84 | gidz : @|| opencl_get_global_id(2u32) as i32, 85 | bdimx : @|| opencl_get_local_size(0u32) as i32, 86 | bdimy : @|| opencl_get_local_size(1u32) as i32, 87 | bdimz : @|| opencl_get_local_size(2u32) as i32, 88 | gdimx : @|| opencl_get_global_size(0u32) as i32, 89 | gdimy : @|| opencl_get_global_size(1u32) as i32, 90 | gdimz : @|| opencl_get_global_size(2u32) as i32, 91 | nblkx : @|| opencl_get_num_groups(0u32) 
as i32, 92 | nblky : @|| opencl_get_num_groups(1u32) as i32, 93 | nblkz : @|| opencl_get_num_groups(2u32) as i32 94 | }; 95 | opencl(dev, grid, block, || @@body(work_item)) 96 | }, 97 | sync : @|| synchronize_opencl(dev), 98 | alloc : @|size| alloc_opencl(dev, size), 99 | alloc_unified : @|size| alloc_opencl_unified(dev, size), 100 | barrier : @|| opencl_barrier(1u32), // CLK_LOCAL_MEM_FENCE -> 1 // CLK_GLOBAL_MEM_FENCE -> 2 101 | } 102 | } 103 | 104 | static opencl_intrinsics = Intrinsics { 105 | expf : opencl_expf, 106 | exp2f : opencl_exp2f, 107 | logf : opencl_logf, 108 | log2f : opencl_log2f, 109 | powf : opencl_powf, 110 | rsqrtf : opencl_rsqrtf, 111 | sqrtf : opencl_sqrtf, 112 | fabsf : opencl_fabsf, 113 | sinf : opencl_sinf, 114 | cosf : opencl_cosf, 115 | tanf : opencl_tanf, 116 | asinf : opencl_asinf, 117 | acosf : opencl_acosf, 118 | atanf : opencl_atanf, 119 | erff : opencl_erff, 120 | atan2f : opencl_atan2f, 121 | copysignf : opencl_copysignf, 122 | fmaf : opencl_fmaf, 123 | fmaxf : opencl_fmaxf, 124 | fminf : opencl_fminf, 125 | fmodf : opencl_fmodf, 126 | floorf : opencl_floorf, 127 | isinff : opencl_isinff, 128 | isnanf : opencl_isnanf, 129 | isfinitef : opencl_isfinitef, 130 | exp : opencl_exp, 131 | exp2 : opencl_exp2, 132 | log : opencl_log, 133 | log2 : opencl_log2, 134 | pow : opencl_pow, 135 | rsqrt : opencl_rsqrt, 136 | sqrt : opencl_sqrt, 137 | fabs : opencl_fabs, 138 | sin : opencl_sin, 139 | cos : opencl_cos, 140 | tan : opencl_tan, 141 | asin : opencl_asin, 142 | acos : opencl_acos, 143 | atan : opencl_atan, 144 | erf : opencl_erf, 145 | atan2 : opencl_atan2, 146 | copysign : opencl_copysign, 147 | fma : opencl_fma, 148 | fmax : opencl_fmax, 149 | fmin : opencl_fmin, 150 | fmod : opencl_fmod, 151 | floor : opencl_floor, 152 | isinf : opencl_isinf, 153 | isnan : opencl_isnan, 154 | isfinite : opencl_isfinite, 155 | min : opencl_min, 156 | max : opencl_max, 157 | }; 158 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_rv.impala: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | fn rv_mask() -> bool; 3 | fn rv_any(bool) -> bool; 4 | fn rv_all(bool) -> bool; 5 | fn rv_ballot(bool) -> i32; 6 | fn rv_extract(f32, i32) -> f32; 7 | fn rv_insert(f32, i32, f32) -> f32; 8 | fn rv_load(&f32, i32) -> f32; 9 | fn rv_store(&mut f32, i32, f32) -> (); 10 | fn rv_shuffle(f32, i32) -> f32; 11 | fn rv_align(&i8, i32)-> &i8; 12 | fn rv_compact(f32, bool) -> f32; 13 | fn rv_lane_id() -> i32; 14 | fn rv_num_lanes() -> i32; 15 | } 16 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_thorin.impala: -------------------------------------------------------------------------------- 1 | extern "thorin" { 2 | fn pe_info[T](&[u8], T) -> (); 3 | 4 | fn alignof[T]() -> i64; 5 | fn sizeof[T]() -> i64; 6 | fn undef[T]() -> T; 7 | 8 | fn bitcast[D, S](S) -> D; 9 | fn select[T, U](T, U, U) -> U; 10 | fn insert[T, U](T, i32, U) -> T; 11 | //fn shuffle[T](T, T, T) -> T; 12 | 13 | fn cuda(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 14 | fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 15 | fn opencl(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 16 | fn amdgpu_hsa(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 17 | fn amdgpu_pal(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 18 | fn reserve_shared[T](i32) -> &mut[3][T]; 19 | 20 | fn hls(dev: i32, body: fn() -> ()) -> 
(); 21 | fn pipeline(i32, i32, i32, fn(i32) -> ()) -> (); // only for HLS/OpenCL backend 22 | fn parallel(num_threads: i32, lower: i32, upper: i32, body: fn(i32) -> ()) -> (); 23 | fn spawn(body: fn() -> ()) -> i32; 24 | fn sync(id: i32) -> (); 25 | 26 | fn atomic[T](binop: u32, addr: &mut T, val: T, order: u32, scope: &[u8]) -> T; // Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub 27 | fn atomic_load[T](addr: &T, order: u32, scope: &[u8]) -> T; 28 | fn atomic_store[T](addr: &mut T, val: T, order: u32, scope: &[u8]) -> (); 29 | fn cmpxchg[T](addr: &mut T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); // only for integer data types 30 | fn cmpxchg_weak[T](addr: &mut T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); // only for integer data types 31 | fn fence(order: u32, scope: &[u8]) -> (); 32 | 33 | fn "atomic" atomic_p1[T](binop: u32, addr: &mut [1]T, val: T, order: u32, scope: &[u8]) -> T; 34 | fn "atomic" atomic_p3[T](binop: u32, addr: &mut [3]T, val: T, order: u32, scope: &[u8]) -> T; 35 | fn "atomic_load" atomic_load_p1[T](addr: &[1]T, order: u32, scope: &[u8]) -> T; 36 | fn "atomic_load" atomic_load_p3[T](addr: &[3]T, order: u32, scope: &[u8]) -> T; 37 | fn "atomic_store" atomic_store_p1[T](addr: &mut [1]T, val: T, order: u32, scope: &[u8]) -> (); 38 | fn "atomic_store" atomic_store_p3[T](addr: &mut [3]T, val: T, order: u32, scope: &[u8]) -> (); 39 | fn "cmpxchg" cmpxchg_p1[T](addr: &mut [1]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); 40 | fn "cmpxchg" cmpxchg_p3[T](addr: &mut [3]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); 41 | fn "cmpxchg_weak" cmpxchg_weak_p1[T](addr: &mut [1]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); 42 | fn "cmpxchg_weak" cmpxchg_weak_p3[T](addr: &mut [3]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); 43 | 44 | fn vectorize(vector_length: i32, body: fn(i32) -> ()) -> (); 45 | } 46 | -------------------------------------------------------------------------------- /platforms/impala/runtime.impala: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | fn "anydsl_info" runtime_info() -> (); 3 | fn "anydsl_device_name" runtime_device_name(_device: i32) -> &[u8]; 4 | fn "anydsl_device_check_feature_support" runtime_device_check_feature_support(_device: i32, _feature: &[u8]) -> bool; 5 | 6 | fn "anydsl_alloc" runtime_alloc(i32, i64) -> &[i8]; 7 | fn "anydsl_alloc_host" runtime_alloc_host(i32, i64) -> &[i8]; 8 | fn "anydsl_alloc_unified" runtime_alloc_unified(i32, i64) -> &[i8]; 9 | fn "anydsl_copy" runtime_copy(i32, &[i8], i64, i32, &[i8], i64, i64) -> (); 10 | fn "anydsl_get_device_ptr" runtime_get_device_ptr(i32, &[i8]) -> &[i8]; 11 | fn "anydsl_release" runtime_release(i32, &[i8]) -> (); 12 | fn "anydsl_release_host" runtime_release_host(i32, &[i8]) -> (); 13 | fn "anydsl_synchronize" runtime_synchronize(i32) -> (); 14 | 15 | fn "anydsl_random_seed" random_seed(u32) -> (); 16 | fn "anydsl_random_val_f32" random_val_f32() -> f32; 17 | fn "anydsl_random_val_u64" random_val_u64() -> u64; 18 | 19 | fn "anydsl_get_micro_time" get_micro_time() -> i64; 20 | fn "anydsl_get_nano_time" get_nano_time() -> i64; 21 | fn "anydsl_get_kernel_time" get_kernel_time() -> i64; 22 | 23 | fn "anydsl_print_i16" print_i16(i16) -> (); 24 | fn "anydsl_print_i32" print_i32(i32) -> (); 25 | fn 
"anydsl_print_i64" print_i64(i64) -> (); 26 | fn "anydsl_print_u16" print_u16(u16) -> (); 27 | fn "anydsl_print_u32" print_u32(u32) -> (); 28 | fn "anydsl_print_u64" print_u64(u64) -> (); 29 | fn "anydsl_print_f32" print_f32(f32) -> (); 30 | fn "anydsl_print_f64" print_f64(f64) -> (); 31 | fn "anydsl_print_char" print_char(u8) -> (); 32 | fn "anydsl_print_string" print_string(&[u8]) -> (); 33 | fn "anydsl_print_flush" print_flush() -> (); 34 | } 35 | 36 | struct Buffer { 37 | data : &[i8], 38 | size : i64, 39 | device : i32 40 | } 41 | 42 | fn @alloc(dev: i32, size: i64) -> Buffer { 43 | Buffer { 44 | device : dev, 45 | data : runtime_alloc(dev, size), 46 | size : size 47 | } 48 | } 49 | fn @alloc_host(dev: i32, size: i64) -> Buffer { 50 | Buffer { 51 | device : dev, 52 | data : runtime_alloc_host(dev, size), 53 | size : size 54 | } 55 | } 56 | fn @alloc_unified(dev: i32, size: i64) -> Buffer { 57 | Buffer { 58 | device : dev, 59 | data : runtime_alloc_unified(dev, size), 60 | size : size 61 | } 62 | } 63 | fn @release(buf: Buffer) -> () { runtime_release(buf.device, buf.data) } 64 | 65 | fn @runtime_device(platform: i32, device: i32) -> i32 { platform | (device << 4) } 66 | 67 | fn @alloc_cpu(size: i64) -> Buffer { alloc(0, size) } 68 | fn @alloc_cuda(dev: i32, size: i64) -> Buffer { alloc(runtime_device(1, dev), size) } 69 | fn @alloc_cuda_host(dev: i32, size: i64) -> Buffer { alloc_host(runtime_device(1, dev), size) } 70 | fn @alloc_cuda_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(1, dev), size) } 71 | fn @synchronize_cuda(dev: i32) -> () { runtime_synchronize(runtime_device(1, dev)) } 72 | fn @alloc_opencl(dev: i32, size: i64) -> Buffer { alloc(runtime_device(2, dev), size) } 73 | fn @alloc_opencl_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(2, dev), size) } 74 | fn @synchronize_opencl(dev: i32) -> () { runtime_synchronize(runtime_device(2, dev)) } 75 | fn @alloc_hls(dev: i32, size: i64) -> Buffer { alloc(runtime_device(2, dev), size) } 76 | fn @alloc_hls_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(2, dev), size) } 77 | fn @synchronize_hls(dev: i32) -> () { runtime_synchronize(runtime_device(2, dev)) } 78 | fn @alloc_hsa(dev: i32, size: i64) -> Buffer { alloc(runtime_device(3, dev), size) } 79 | fn @alloc_hsa_host(dev: i32, size: i64) -> Buffer { alloc_host(runtime_device(3, dev), size) } 80 | fn @alloc_hsa_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(3, dev), size) } 81 | fn @synchronize_hsa(dev: i32) -> () { runtime_synchronize(runtime_device(3, dev)) } 82 | fn @alloc_pal(dev: i32, size: i64) -> Buffer { alloc(runtime_device(4, dev), size) } 83 | fn @alloc_pal_host(dev: i32, size: i64) -> Buffer { alloc_host(runtime_device(4, dev), size) } 84 | fn @alloc_pal_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(4, dev), size) } 85 | fn @synchronize_pal(dev: i32) -> () { runtime_synchronize(runtime_device(4, dev)) } 86 | 87 | fn @copy(src: Buffer, dst: Buffer) -> () { 88 | runtime_copy(src.device, src.data, 0i64, dst.device, dst.data, 0i64, src.size) 89 | } 90 | 91 | fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) -> () { 92 | runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size) 93 | } 94 | 95 | 96 | // range, range_step, unroll, unroll_step, etc. 
97 | fn @(?lower & ?upper & ?step) unroll_step(lower: i32, upper: i32, @step: i32, body: fn(i32) -> ()) -> () { 98 | if lower < upper { 99 | @@body(lower); 100 | unroll_step(lower+step, upper, step, body) 101 | } 102 | } 103 | 104 | fn @(?upper & ?lower & ?step) unroll_step_rev(upper: i32, lower: i32, @step: i32, body: fn(i32) -> ()) -> () { 105 | if upper > lower { 106 | @@body(upper); 107 | unroll_step_rev(upper-step, lower, step, body) 108 | } 109 | } 110 | 111 | fn @range(lower: i32, upper: i32, body: fn(i32) -> ()) -> () { unroll_step($lower, $upper, 1, body) } 112 | fn @range_step(lower: i32, upper: i32, step: i32, body: fn(i32) -> ()) -> () { unroll_step($lower, $upper, step, body) } 113 | fn @range_rev(upper: i32, lower: i32, body: fn(i32) -> ()) -> () { unroll_step_rev($upper, $lower, 1, body) } 114 | 115 | fn @unroll(lower: i32, upper: i32, body: fn(i32) -> ()) -> () { unroll_step(lower, upper, 1, body) } 116 | fn @unroll_rev(upper: i32, lower: i32, body: fn(i32) -> ()) -> () { unroll_step_rev(upper, lower, 1, body) } 117 | -------------------------------------------------------------------------------- /post-patcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys, re, os 3 | basename = sys.argv[1] 4 | def patch_llvmir(rttype): 5 | # we need to patch 6 | result = [] 7 | filename = basename+"."+rttype 8 | if os.path.isfile(filename): 9 | with open(filename) as f: 10 | for line in f: 11 | if rttype=="amdgpu" or rttype=="nvvm" or rttype=="ll": 12 | # patch to opaque identity functions 13 | m = re.match(r'^declare (.*) @(magic_.*_id)\((.*)\) (?:local_)?unnamed_addr(?: #[0-9]+)?\n$', line) 14 | if m is not None: 15 | ty1, fname, ty2 = m.groups() 16 | assert ty1 == ty2, "Argument and return types of magic IDs must match" 17 | print("Patching magic ID {0} in {1}".format(fname, filename)) 18 | # emit definition instead 19 | result.append('define {0} @{1}({0} %name) {{\n'.format(ty1, fname)) 20 | result.append(' ret {0} %name\n'.format(ty1)) 21 | result.append('}\n') 22 | continue 23 | 24 | result.append(line) 25 | # we have the patched thing, write it 26 | with open(filename, "w") as f: 27 | for line in result: 28 | f.write(line) 29 | return 30 | 31 | def patch_cfiles(rttype): 32 | # we need to patch 33 | channel_line = {} 34 | channel_type = {} 35 | result = [] 36 | channel_decl_name = None 37 | channel_decl_type = None 38 | channel_decl_line = 0 39 | if rttype == "cuda": 40 | filename = basename+"."+"cu" 41 | elif rttype == "opencl": 42 | filename = basename+"."+"cl" 43 | elif rttype == "hls": 44 | filename = basename+"."+"hls" 45 | 46 | if os.path.isfile(filename): 47 | with open(filename) as f: 48 | for line in f: 49 | # patch to opaque identity functions 50 | m = re.match(r'^(.*) = (magic_.*_id)\((.*)\);\n$', line) 51 | if m is not None: 52 | lhs, fname, arg = m.groups() 53 | print("Patching magic ID {0} in {1}".format(fname, filename)) 54 | # emit definition instead 55 | result.append('{0} = {1};\n'.format(lhs, arg)) 56 | else: 57 | result.append(line) 58 | 59 | # we have the patched thing, write it 60 | with open(filename, "w") as f: 61 | for line in result: 62 | f.write(line) 63 | return 64 | 65 | def patch_defs(rttype): 66 | nvvm_defs = { 67 | } 68 | 69 | if rttype == "nvvm": 70 | result = [] 71 | filename = basename+".nvvm" 72 | if os.path.isfile(filename): 73 | with open(filename) as f: 74 | for line in f: 75 | matched = False 76 | 77 | for (func, code) in iter(nvvm_defs.items()): 78 | m = 
re.match(r'^declare (.*) (@' + func + r')\((.*)\)\n$', line)
79 |                         if m is not None:
80 |                             result.append(code)
81 |                             matched = True
82 |                             break
83 | 
84 |                     if not matched:
85 |                         result.append(line)
86 | 
87 |         with open(filename, "w") as f:
88 |             for line in result:
89 |                 f.write(line)
90 |     return
91 | 
92 | patch_llvmir("ll")
93 | patch_llvmir("amdgpu")
94 | patch_llvmir("nvvm")
95 | patch_cfiles("cuda")
96 | patch_cfiles("opencl")
97 | patch_cfiles("hls")
98 | patch_defs("nvvm")
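# For reference, the magic-ID patching performed by patch_llvmir above turns
# an opaque declaration in the generated IR into an identity definition.
# A sketch of the rewrite (illustrative IR, not taken from a real build):
#
#   declare float @magic_f32_id(float)           ; before
#
#   define float @magic_f32_id(float %name) {    ; after
#     ret float %name
#   }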
99 | -------------------------------------------------------------------------------- /src/anydsl_jit.h: --------------------------------------------------------------------------------
1 | #ifndef ANYDSL_JIT_H
2 | #define ANYDSL_JIT_H
3 | 
4 | #include <stdint.h>
5 | 
6 | #include "anydsl_runtime_config.h"
7 | 
8 | class Runtime;
9 | 
10 | AnyDSL_runtime_API Runtime& runtime();
11 | 
12 | #ifdef AnyDSL_runtime_HAS_JIT_SUPPORT
13 | AnyDSL_runtime_jit_API void anydsl_set_cache_directory(const char*);
14 | AnyDSL_runtime_jit_API const char* anydsl_get_cache_directory();
15 | AnyDSL_runtime_jit_API void anydsl_link(const char*);
16 | AnyDSL_runtime_jit_API int32_t anydsl_compile(const char*, uint32_t, uint32_t);
17 | AnyDSL_runtime_jit_API void *anydsl_lookup_function(int32_t, const char*);
18 | AnyDSL_runtime_jit_API void anydsl_set_log_level(uint32_t /* log level (4=error only, 3=warn, 2=info, 1=verbose, 0=debug) */);
19 | #endif
20 | 
21 | #endif
22 | -------------------------------------------------------------------------------- /src/anydsl_runtime.cpp: --------------------------------------------------------------------------------
1 | #include <cmath>
2 | #include <cstdlib>
3 | #include <iostream>
4 | #include <random>
5 | #include <sstream>
6 | 
7 | #include "anydsl_runtime.h"
8 | // Make sure the definition for runtime() matches
9 | // the declaration in anydsl_jit.h
10 | #include "anydsl_jit.h"
11 | 
12 | #include "runtime.h"
13 | #include "platform.h"
14 | #include "dummy_platform.h"
15 | #include "cpu_platform.h"
16 | 
17 | #ifdef AnyDSL_runtime_HAS_TBB_SUPPORT
18 | #define NOMINMAX
19 | #include <tbb/concurrent_queue.h>
20 | #include <tbb/concurrent_unordered_map.h>
21 | #include <tbb/parallel_for.h>
22 | #include <tbb/task_arena.h>
23 | #include <tbb/task_group.h>
24 | #else
25 | #include <thread>
26 | #endif
27 | 
28 | struct RuntimeSingleton {
29 |     Runtime runtime;
30 | 
31 |     RuntimeSingleton()
32 |         : runtime(detect_profile_level())
33 |     {
34 |         runtime.register_platform<CpuPlatform>();
35 |         register_cuda_platform(&runtime);
36 |         register_opencl_platform(&runtime);
37 |         register_hsa_platform(&runtime);
38 |         register_pal_platform(&runtime);
39 |         register_levelzero_platform(&runtime);
40 |     }
41 | 
42 |     static std::pair<ProfileLevel, ProfileLevel> detect_profile_level() {
43 |         auto profile = std::make_pair(ProfileLevel::None, ProfileLevel::None);
44 |         const char* env_var = std::getenv("ANYDSL_PROFILE");
45 |         if (env_var) {
46 |             std::string env_str = env_var;
47 |             for (auto& c: env_str)
48 |                 c = std::toupper(c, std::locale());
49 |             std::stringstream profile_levels(env_str);
50 |             std::string level;
51 |             while (profile_levels >> level) {
52 |                 if (level == "FULL")
53 |                     profile.first = ProfileLevel::Full;
54 |                 else if (level == "FPGA_DYNAMIC")
55 |                     profile.second = ProfileLevel::Fpga_dynamic;
56 |             }
57 |         }
58 |         return profile;
59 |     }
60 | };
61 | 
62 | Runtime& runtime() {
63 |     static RuntimeSingleton singleton;
64 |     return singleton.runtime;
65 | }
66 | 
67 | inline PlatformId to_platform(int32_t m) {
68 |     return PlatformId(m & 0x0F);
69 | }
70 | 
71 | inline DeviceId to_device(int32_t m) {
72 |     return DeviceId(m >> 4);
73 | }
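// Worked example of the device-mask encoding (it mirrors the ANYDSL_DEVICE
// macro in anydsl_runtime.h): the low 4 bits select the platform, the
// remaining bits the device, so CUDA (platform 1) device 2 is encoded as
// mask = 1 | (2 << 4) == 33, and decoding gives
// to_platform(33) == PlatformId(1) and to_device(33) == DeviceId(2).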
74 | 
75 | void anydsl_info(void) {
76 |     runtime().display_info();
77 | }
78 | 
79 | const char* anydsl_device_name(int32_t mask) {
80 |     return runtime().device_name(to_platform(mask), to_device(mask));
81 | }
82 | 
83 | bool anydsl_device_check_feature_support(int32_t mask, const char* feature) {
84 |     return runtime().device_check_feature_support(to_platform(mask), to_device(mask), feature);
85 | }
86 | 
87 | void* anydsl_alloc(int32_t mask, int64_t size) {
88 |     return runtime().alloc(to_platform(mask), to_device(mask), size);
89 | }
90 | 
91 | void* anydsl_alloc_host(int32_t mask, int64_t size) {
92 |     return runtime().alloc_host(to_platform(mask), to_device(mask), size);
93 | }
94 | 
95 | void* anydsl_alloc_unified(int32_t mask, int64_t size) {
96 |     return runtime().alloc_unified(to_platform(mask), to_device(mask), size);
97 | }
98 | 
99 | void* anydsl_get_device_ptr(int32_t mask, void* ptr) {
100 |     return runtime().get_device_ptr(to_platform(mask), to_device(mask), ptr);
101 | }
102 | 
103 | void anydsl_release(int32_t mask, void* ptr) {
104 |     runtime().release(to_platform(mask), to_device(mask), ptr);
105 | }
106 | 
107 | void anydsl_release_host(int32_t mask, void* ptr) {
108 |     runtime().release_host(to_platform(mask), to_device(mask), ptr);
109 | }
110 | 
111 | void anydsl_copy(
112 |     int32_t mask_src, const void* src, int64_t offset_src,
113 |     int32_t mask_dst, void* dst, int64_t offset_dst, int64_t size) {
114 |     runtime().copy(
115 |         to_platform(mask_src), to_device(mask_src), src, offset_src,
116 |         to_platform(mask_dst), to_device(mask_dst), dst, offset_dst, size);
117 | }
118 | 
119 | void anydsl_launch_kernel(
120 |     int32_t mask, const char* file_name, const char* kernel_name,
121 |     const uint32_t* grid, const uint32_t* block,
122 |     void** arg_data,
123 |     const uint32_t* arg_sizes,
124 |     const uint32_t* arg_aligns,
125 |     const uint32_t* arg_alloc_sizes,
126 |     const uint8_t* arg_types,
127 |     uint32_t num_args) {
128 |     LaunchParams launch_params = {
129 |         file_name,
130 |         kernel_name,
131 |         grid,
132 |         block,
133 |         {
134 |             arg_data,
135 |             arg_sizes,
136 |             arg_aligns,
137 |             arg_alloc_sizes,
138 |             reinterpret_cast<const KernelArgType*>(arg_types),
139 |         },
140 |         num_args
141 |     };
142 |     runtime().launch_kernel(to_platform(mask), to_device(mask), launch_params);
143 | }
144 | 
145 | void anydsl_synchronize(int32_t mask) {
146 |     runtime().synchronize(to_platform(mask), to_device(mask));
147 | }
148 | 
149 | uint64_t anydsl_get_micro_time() {
150 |     using namespace std::chrono;
151 |     return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
152 | }
153 | 
154 | uint64_t anydsl_get_nano_time() {
155 |     using namespace std::chrono;
156 |     return duration_cast<nanoseconds>(steady_clock::now().time_since_epoch()).count();
157 | }
158 | 
159 | uint64_t anydsl_get_kernel_time() {
160 |     return runtime().kernel_time().load();
161 | }
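// Minimal host-side timing sketch using the hooks above (illustrative only;
// launch_my_kernel and mask are hypothetical):
//
//   uint64_t t0 = anydsl_get_micro_time();
//   launch_my_kernel();
//   anydsl_synchronize(mask);
//   uint64_t host_us = anydsl_get_micro_time() - t0;
//   uint64_t kernel_time = anydsl_get_kernel_time();  // accumulated across launches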
247 | 
248 | int32_t anydsl_spawn_thread(void* args, void* fun) {
249 |     std::lock_guard<std::mutex> lock(thread_lock);
250 | 
251 |     int32_t (*fun_ptr) (void*) = reinterpret_cast<int32_t (*)(void*)>(fun);
252 | 
253 |     int32_t id;
254 |     if (free_ids.size()) {
255 |         id = free_ids.back();
256 |         free_ids.pop_back();
257 |     } else {
258 |         id = static_cast<int32_t>(thread_pool.size());
259 |     }
260 | 
261 |     auto spawned = std::make_pair(id, std::thread([=](){ fun_ptr(args); }));
262 |     thread_pool.emplace(std::move(spawned));
263 |     return id;
264 | }
265 | 
266 | void anydsl_sync_thread(int32_t id) {
267 |     auto thread = thread_pool.end();
268 |     {
269 |         std::lock_guard<std::mutex> lock(thread_lock);
270 |         thread = thread_pool.find(id);
271 |     }
272 |     if (thread != thread_pool.end()) {
273 |         thread->second.join();
274 |         {
275 |             std::lock_guard<std::mutex> lock(thread_lock);
276 |             free_ids.push_back(thread->first);
277 |             thread_pool.erase(thread);
278 |         }
279 |     } else {
280 |         assert(0 && "Trying to synchronize on invalid thread id");
281 |     }
282 | }
283 | #else // TBB version
284 | void anydsl_parallel_for(int32_t num_threads, int32_t lower, int32_t upper, void* args, void* fun) {
285 |     tbb::task_arena limited((num_threads == 0) ? tbb::task_arena::automatic : num_threads);
286 |     tbb::task_group tg;
287 | 
288 |     void (*fun_ptr) (void*, int32_t, int32_t) = reinterpret_cast<void (*)(void*, int32_t, int32_t)>(fun);
289 | 
290 |     limited.execute([&] {
291 |         tg.run([&] {
292 |             tbb::parallel_for(tbb::blocked_range<int32_t>(lower, upper),
293 |                 [=] (const tbb::blocked_range<int32_t>& range) {
294 |                     fun_ptr(args, range.begin(), range.end());
295 |                 });
296 |         });
297 |     });
298 | 
299 |     limited.execute([&] { tg.wait(); });
300 | }
301 | 
302 | typedef tbb::concurrent_unordered_map<int32_t, tbb::task_group> task_group_map;
303 | typedef std::pair<task_group_map::iterator, bool> task_group_node_ref;
304 | static task_group_map task_pool;
305 | static tbb::concurrent_queue<int32_t> free_ids;
306 | static std::mutex thread_lock;
307 | 
308 | int32_t anydsl_spawn_thread(void* args, void* fun) {
309 |     std::lock_guard<std::mutex> lock(thread_lock);
310 |     int32_t id = -1;
311 |     if (!free_ids.try_pop(id)) {
312 |         id = int32_t(task_pool.size());
313 |     }
314 | 
315 |     int32_t (*fun_ptr) (void*) = reinterpret_cast<int32_t (*)(void*)>(fun);
316 | 
317 |     assert(id >= 0);
318 | 
319 |     task_group_node_ref p = task_pool.emplace(std::piecewise_construct, std::forward_as_tuple(id), std::forward_as_tuple());
320 |     tbb::task_group& tg = p.first->second;
321 | 
322 |     tg.run([=] { fun_ptr(args); });
323 | 
324 |     return id;
325 | }
326 | 
327 | void anydsl_sync_thread(int32_t id) {
328 |     auto task = task_pool.end();
329 |     {
330 |         std::lock_guard<std::mutex> lock(thread_lock);
331 |         task = task_pool.find(id);
332 |     }
333 |     if (task != task_pool.end()) {
334 |         task->second.wait();
335 |         {
336 |             std::lock_guard<std::mutex> lock(thread_lock);
337 |             free_ids.push(task->first);
338 |         }
339 |     } else {
340 |         assert(0 && "Trying to synchronize on invalid task id");
341 |     }
342 | }
343 | #endif
344 | 
--------------------------------------------------------------------------------
/src/anydsl_runtime.h:
--------------------------------------------------------------------------------
1 | #ifndef ANYDSL_RUNTIME_H
2 | #define ANYDSL_RUNTIME_H
3 | 
4 | #include <stdint.h>
5 | #include <stdlib.h>
6 | 
7 | #include "anydsl_runtime_config.h"
8 | 
9 | #ifdef __cplusplus
10 | extern "C" {
11 | #endif
12 | 
13 | #define ANYDSL_DEVICE(p, d) ((p) | ((d) << 4))
14 | 
15 | enum {
16 |     ANYDSL_HOST = 0,
17 |     ANYDSL_CUDA = 1,
18 |     ANYDSL_OPENCL = 2,
19 |     ANYDSL_HSA = 3,
20 |     ANYDSL_PAL = 4,
21 |     ANYDSL_LEVELZERO = 5
22 | };
23 | 
24 | AnyDSL_runtime_API void anydsl_info(void);
25 | 
26 | AnyDSL_runtime_API const char* anydsl_device_name(int32_t);
27 | AnyDSL_runtime_API bool anydsl_device_check_feature_support(int32_t, const char*);
28 | 
29 | AnyDSL_runtime_API void* anydsl_alloc(int32_t, int64_t);
30 | AnyDSL_runtime_API void* anydsl_alloc_host(int32_t, int64_t);
31 | AnyDSL_runtime_API void* anydsl_alloc_unified(int32_t, int64_t);
32 | AnyDSL_runtime_API void* anydsl_get_device_ptr(int32_t, void*);
33 | AnyDSL_runtime_API void anydsl_release(int32_t, void*);
34 | AnyDSL_runtime_API void anydsl_release_host(int32_t, void*);
35 | 
36 | AnyDSL_runtime_API void anydsl_copy(int32_t, const void*, int64_t, int32_t, void*, int64_t, int64_t);
37 | 
38 | AnyDSL_runtime_API void anydsl_launch_kernel(
39 |     int32_t, const char*, const char*,
40 |     const uint32_t*, const uint32_t*,
41 |     void**, const uint32_t*, const uint32_t*, const uint32_t*, const uint8_t*,
42 |     uint32_t);
43 | AnyDSL_runtime_API void anydsl_synchronize(int32_t);
44 | 
45 | AnyDSL_runtime_API void anydsl_random_seed(uint32_t);
46 | AnyDSL_runtime_API float anydsl_random_val_f32();
47 | AnyDSL_runtime_API uint64_t anydsl_random_val_u64();
48 | 
49 | AnyDSL_runtime_API uint64_t anydsl_get_micro_time();
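/* [Editor's sketch, not part of the original header] The device-mask
 * convention consumed by the declarations above: the platform id from the
 * enum at the top of this file occupies bits 0-3, the device id starts at
 * bit 4, as defined by ANYDSL_DEVICE. */
static inline int32_t anydsl_example_mask(void) {
    return ANYDSL_DEVICE(ANYDSL_CUDA, 2); /* == 1 | (2 << 4) == 0x21 */
}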
50 | AnyDSL_runtime_API uint64_t anydsl_get_nano_time();
51 | AnyDSL_runtime_API uint64_t anydsl_get_kernel_time();
52 | 
53 | AnyDSL_runtime_API int32_t anydsl_isinff(float);
54 | AnyDSL_runtime_API int32_t anydsl_isnanf(float);
55 | AnyDSL_runtime_API int32_t anydsl_isfinitef(float);
56 | AnyDSL_runtime_API int32_t anydsl_isinf(double);
57 | AnyDSL_runtime_API int32_t anydsl_isnan(double);
58 | AnyDSL_runtime_API int32_t anydsl_isfinite(double);
59 | 
60 | AnyDSL_runtime_API void anydsl_print_i16(int16_t);
61 | AnyDSL_runtime_API void anydsl_print_i32(int32_t);
62 | AnyDSL_runtime_API void anydsl_print_i64(int64_t);
63 | AnyDSL_runtime_API void anydsl_print_u16(uint16_t);
64 | AnyDSL_runtime_API void anydsl_print_u32(uint32_t);
65 | AnyDSL_runtime_API void anydsl_print_u64(uint64_t);
66 | AnyDSL_runtime_API void anydsl_print_f32(float);
67 | AnyDSL_runtime_API void anydsl_print_f64(double);
68 | AnyDSL_runtime_API void anydsl_print_char(char);
69 | AnyDSL_runtime_API void anydsl_print_string(char*);
70 | AnyDSL_runtime_API void anydsl_print_flush();
71 | 
72 | AnyDSL_runtime_API void* anydsl_aligned_malloc(size_t, size_t);
73 | AnyDSL_runtime_API void anydsl_aligned_free(void*);
74 | 
75 | AnyDSL_runtime_API void anydsl_parallel_for(int32_t, int32_t, int32_t, void*, void*);
76 | AnyDSL_runtime_API int32_t anydsl_spawn_thread(void*, void*);
77 | AnyDSL_runtime_API void anydsl_sync_thread(int32_t);
78 | 
79 | struct AnyDSL_runtime_API Closure {
80 |     void (*fn)(uint64_t);
81 |     uint64_t payload;
82 | };
83 | 
84 | AnyDSL_runtime_API int32_t anydsl_create_graph();
85 | AnyDSL_runtime_API int32_t anydsl_create_task(int32_t, Closure);
86 | AnyDSL_runtime_API void anydsl_create_edge(int32_t, int32_t);
87 | AnyDSL_runtime_API void anydsl_execute_graph(int32_t, int32_t);
88 | 
89 | #ifdef __cplusplus
90 | }
91 | #include "anydsl_runtime.hpp"
92 | #endif
93 | 
94 | #endif
95 | 
--------------------------------------------------------------------------------
/src/anydsl_runtime.hpp:
--------------------------------------------------------------------------------
1 | #ifndef ANYDSL_RUNTIME_HPP
2 | #define ANYDSL_RUNTIME_HPP
3 | 
4 | #ifndef ANYDSL_RUNTIME_H
5 | #include "anydsl_runtime.h"
6 | #endif
7 | 
8 | namespace anydsl {
9 | 
10 | enum class Platform : int32_t {
11 |     Host = ANYDSL_HOST,
12 |     Cuda = ANYDSL_CUDA,
13 |     OpenCL = ANYDSL_OPENCL,
14 |     HSA = ANYDSL_HSA,
15 |     PAL = ANYDSL_PAL,
16 |     LevelZero = ANYDSL_LEVELZERO
17 | };
18 | 
19 | struct Device {
20 |     Device(int32_t id) : id(id) {}
21 |     int32_t id;
22 | };
23 | 
24 | inline int32_t make_device(Platform p, Device d) {
25 |     return ANYDSL_DEVICE((int32_t)p, d.id);
26 | }
27 | 
28 | template <typename T>
29 | class Array {
30 | public:
31 |     Array()
32 |         : data_(nullptr), size_(0), dev_(0)
33 |     {}
34 | 
35 |     Array(int64_t size)
36 |         : Array(Platform::Host, Device(0), size)
37 |     {}
38 | 
39 |     Array(int32_t dev, T* ptr, int64_t size)
40 |         : data_(ptr), size_(size), dev_(dev)
41 |     {}
42 | 
43 |     Array(Platform p, Device d, int64_t size)
44 |         : dev_(make_device(p, d)) {
45 |         allocate(size);
46 |     }
47 | 
48 |     Array(Array&& other)
49 |         : data_(other.data_),
50 |           size_(other.size_),
51 |           dev_(other.dev_) {
52 |         other.data_ = nullptr;
53 |     }
54 | 
55 |     Array& operator = (Array&& other) {
56 |         deallocate();
57 |         dev_ = other.dev_;
58 |         size_ = other.size_;
59 |         data_ = other.data_;
60 |         other.data_ = nullptr;
61 |         return *this;
62 |     }
63 | 
64 |     Array(const Array&) = delete;
65 |     Array& operator = (const Array&) = delete;
66 | 
67 |     ~Array() { deallocate(); }
68 | 
69 |     T* begin() { return data_; }
70 |     const T* begin() const { return data_; }
71 | 
72 |     T* end() { return data_ + size_; }
73 |     const T* end() const { return data_ + size_; }
74 | 
75 |     T* data() { return data_; }
76 |     const T* data() const { return data_; }
77 | 
78 |     int64_t size() const { return size_; }
79 |     int32_t device() const { return dev_; }
80 | 
81 |     const T& operator [] (int i) const { return data_[i]; }
82 |     T& operator [] (int i) { return data_[i]; }
83 | 
84 |     T* release() {
85 |         T* ptr = data_;
86 |         data_ = nullptr;
87 |         size_ = 0;
88 |         dev_ = 0;
89 |         return ptr;
90 |     }
91 | 
92 | protected:
93 |     void allocate(int64_t size) {
94 |         size_ = size;
95 |         data_ = (T*)anydsl_alloc(dev_, sizeof(T) * size);
96 |     }
97 | 
98 |     void deallocate() {
99 |         if (data_) anydsl_release(dev_, (void*)data_);
100 |     }
101 | 
102 |     T* data_;
103 |     int64_t size_;
104 |     int32_t dev_;
105 | };
106 | 
107 | template <typename T>
108 | void copy(const Array<T>& a, Array<T>& b) {
109 |     anydsl_copy(a.device(), (const void*)a.data(), 0,
110 |                 b.device(), (void*)b.data(), 0,
111 |                 a.size() * sizeof(T));
112 | }
113 | 
114 | template <typename T>
115 | void copy(const Array<T>& a, Array<T>& b, int64_t size) {
116 |     anydsl_copy(a.device(), (const void*)a.data(), 0,
117 |                 b.device(), (void*)b.data(), 0,
118 |                 size * sizeof(T));
119 | }
120 | 
121 | template <typename T>
122 | void copy(const Array<T>& a, int64_t offset_a, Array<T>& b, int64_t offset_b, int64_t size) {
123 |     anydsl_copy(a.device(), (const void*)a.data(), offset_a * sizeof(T),
124 |                 b.device(), (void*)b.data(), offset_b * sizeof(T),
125 |                 size * sizeof(T));
126 | }
127 | 
128 | } // namespace anydsl
129 | 
130 | #endif
131 | 
--------------------------------------------------------------------------------
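A brief usage sketch for the Array wrapper and copy helpers above (editor's addition, not a repository file; the CUDA platform and buffer sizes are assumptions for illustration):

    anydsl::Array<float> host(1024);                                         // host allocation
    anydsl::Array<float> gpu(anydsl::Platform::Cuda, anydsl::Device(0), 1024);
    anydsl::copy(host, gpu);            // full host -> device transfer
    anydsl::copy(gpu, 0, host, 0, 512); // copy the first 512 elements back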
"@AnyDSL_runtime_NVCC_INC@" 46 | 47 | // HSA support 48 | 49 | #define AnyDSL_runtime_HSA_BITCODE_PATH "@AnyDSL_runtime_HSA_BITCODE_PATH@/" 50 | #define AnyDSL_runtime_HSA_BITCODE_SUFFIX "@AnyDSL_runtime_HSA_BITCODE_SUFFIX@" 51 | 52 | // PAL support 53 | 54 | #define AnyDSL_runtime_PAL_BITCODE_PATH "@AnyDSL_runtime_PAL_BITCODE_PATH@/" 55 | #define AnyDSL_runtime_PAL_BITCODE_SUFFIX "@AnyDSL_runtime_PAL_BITCODE_SUFFIX@" 56 | 57 | // jit support 58 | 59 | #define AnyDSL_runtime_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@" 60 | 61 | // debug output 62 | 63 | #cmakedefine AnyDSL_runtime_ENABLE_DEBUG_OUTPUT 64 | 65 | 66 | #endif // ANYDSL_RUNTIME_CONFIG_H 67 | -------------------------------------------------------------------------------- /src/cpu_platform.cpp: -------------------------------------------------------------------------------- 1 | #include "cpu_platform.h" 2 | #include "runtime.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #if defined(__APPLE__) 11 | #include 12 | #include 13 | #elif defined(_WIN32) 14 | #define WIN32_LEAN_AND_MEAN 15 | #define NOMINMAX 16 | #include 17 | #endif 18 | 19 | CpuPlatform::CpuPlatform(Runtime* runtime) 20 | : Platform(runtime) 21 | { 22 | #if defined(__APPLE__) 23 | size_t buf_len; 24 | sysctlbyname("machdep.cpu.brand_string", nullptr, &buf_len, nullptr, 0); 25 | device_name_.resize(buf_len, '\0'); 26 | sysctlbyname("machdep.cpu.brand_string", device_name_.data(), &buf_len, nullptr, 0); 27 | #elif defined(_WIN32) 28 | HKEY key; 29 | if (RegOpenKeyExW(HKEY_LOCAL_MACHINE, L"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0U, KEY_QUERY_VALUE, &key) != ERROR_SUCCESS) 30 | error("failed to open processor information registry key"); 31 | 32 | DWORD cpu_name_type, cpu_name_size; 33 | if (RegQueryValueExW(key, L"ProcessorNameString", nullptr, &cpu_name_type, nullptr, &cpu_name_size) != ERROR_SUCCESS) 34 | error("failed to query processor name string length"); 35 | 36 | if (cpu_name_type != REG_SZ) 37 | error("unexpected type for processor name string"); 38 | 39 | int cpu_name_length = cpu_name_size / sizeof(wchar_t); 40 | 41 | std::wstring buffer(cpu_name_length, '\0'); 42 | if (RegQueryValueExW(key, L"ProcessorNameString", nullptr, &cpu_name_type, reinterpret_cast(buffer.data()), &cpu_name_size) != ERROR_SUCCESS) 43 | error("failed to query processor name string"); 44 | 45 | RegCloseKey(key); 46 | 47 | int u8_cpu_name_length = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, buffer.data(), cpu_name_length, nullptr, 0, nullptr, nullptr); 48 | 49 | if (u8_cpu_name_length <= 0) 50 | error("failed to compute converted UTF-8 CPU name string length"); 51 | 52 | device_name_.resize(u8_cpu_name_length, '\0'); 53 | 54 | if (WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, buffer.data(), cpu_name_length, device_name_.data(), u8_cpu_name_length, nullptr, nullptr) <= 0) 55 | error("failed to convert CPU name string to UTF-8"); 56 | #else 57 | std::ifstream cpuinfo("/proc/cpuinfo"); 58 | 59 | if (!cpuinfo) 60 | error("failed to open /proc/cpuinfo"); 61 | 62 | #if defined __arm__ || __aarch64__ 63 | std::string model_string = "CPU part\t: "; 64 | #else // x86, x86_64 65 | std::string model_string = "model name\t: "; 66 | #endif 67 | 68 | std::search(std::istreambuf_iterator(cpuinfo), {}, model_string.begin(), model_string.end()); 69 | std::getline(cpuinfo >> std::ws, device_name_); 70 | #endif 71 | } 72 | -------------------------------------------------------------------------------- /src/cpu_platform.h: 
/src/cpu_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef CPU_PLATFORM_H
2 | #define CPU_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | 
6 | #ifndef PAGE_SIZE
7 | #define PAGE_SIZE 4096
8 | #endif
9 | 
10 | #include <cstring>
11 | 
12 | /// CPU platform. alloc_host and alloc_unified return page-aligned (4096-byte) memory; plain alloc is 32-byte aligned.
13 | class CpuPlatform : public Platform {
14 | public:
15 |     CpuPlatform(Runtime* runtime);
16 | 
17 | protected:
18 |     void* alloc(DeviceId, int64_t size) override {
19 |         return Runtime::aligned_malloc(size, 32);
20 |     }
21 | 
22 |     void* alloc_host(DeviceId, int64_t size) override {
23 |         return Runtime::aligned_malloc(size, PAGE_SIZE);
24 |     }
25 | 
26 |     void* alloc_unified(DeviceId, int64_t size) override {
27 |         return Runtime::aligned_malloc(size, PAGE_SIZE);
28 |     }
29 | 
30 |     void* get_device_ptr(DeviceId, void* ptr) override {
31 |         return ptr;
32 |     }
33 | 
34 |     void release(DeviceId, void* ptr) override {
35 |         Runtime::aligned_free(ptr);
36 |     }
37 | 
38 |     void release_host(DeviceId dev, void* ptr) override {
39 |         release(dev, ptr);
40 |     }
41 | 
42 |     void no_kernel() {
43 |         error("Kernels are not supported on the CPU");
44 |     }
45 | 
46 |     void launch_kernel(DeviceId, const LaunchParams&) override { no_kernel(); }
47 |     void synchronize(DeviceId) override { no_kernel(); }
48 | 
49 |     void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) {
50 |         memcpy((char*)dst + offset_dst, (char*)src + offset_src, size);
51 |     }
52 | 
53 |     void copy(DeviceId, const void* src, int64_t offset_src,
54 |               DeviceId, void* dst, int64_t offset_dst, int64_t size) override {
55 |         copy(src, offset_src, dst, offset_dst, size);
56 |     }
57 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId,
58 |                         void* dst, int64_t offset_dst, int64_t size) override {
59 |         copy(src, offset_src, dst, offset_dst, size);
60 |     }
61 |     void copy_to_host(DeviceId, const void* src, int64_t offset_src,
62 |                       void* dst, int64_t offset_dst, int64_t size) override {
63 |         copy(src, offset_src, dst, offset_dst, size);
64 |     }
65 | 
66 |     std::string device_name_;
67 |     size_t dev_count() const override { return 1; }
68 |     std::string name() const override { return "CPU"; }
69 |     const char* device_name(DeviceId) const override { return device_name_.c_str(); }
70 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
71 | };
72 | 
73 | #endif
74 | 
--------------------------------------------------------------------------------
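Through the C API, the differing alignment guarantees above look like this (editor's sketch; mask 0 addresses the host platform, device 0):

    void* a = anydsl_alloc(0, 1024);      // 32-byte aligned
    void* b = anydsl_alloc_host(0, 1024); // PAGE_SIZE (4096-byte) aligned
    anydsl_release(0, a);
    anydsl_release_host(0, b);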
/src/cuda_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef CUDA_PLATFORM_H
2 | #define CUDA_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | #include "runtime.h"
6 | 
7 | #include <atomic>
8 | #include <forward_list>
9 | #include <mutex>
10 | #include <string>
11 | #include <unordered_map>
12 | #include <vector>
13 | 
14 | #define CUDA_API_PER_THREAD_DEFAULT_STREAM
15 | #include <cuda.h>
16 | #include <cudaProfiler.h>
17 | #include <nvvm.h>
18 | 
19 | #if CUDA_VERSION < 10000
20 | #error "CUDA 10.0 or higher required!"
21 | #endif
22 | 
23 | /// CUDA platform. Has the same number of devices as that of the CUDA implementation.
24 | class CudaPlatform : public Platform {
25 | public:
26 |     CudaPlatform(Runtime* runtime);
27 |     ~CudaPlatform();
28 | 
29 | protected:
30 |     void* alloc(DeviceId dev, int64_t size) override;
31 |     void* alloc_host(DeviceId dev, int64_t size) override;
32 |     void* alloc_unified(DeviceId dev, int64_t size) override;
33 |     void* get_device_ptr(DeviceId, void* ptr) override;
34 |     void release(DeviceId dev, void* ptr) override;
35 |     void release_host(DeviceId dev, void* ptr) override;
36 | 
37 |     void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
38 |     void synchronize(DeviceId dev) override;
39 | 
40 |     void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
41 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
42 |     void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
43 | 
44 |     size_t dev_count() const override { return devices_.size(); }
45 |     std::string name() const override { return "CUDA"; }
46 |     const char* device_name(DeviceId dev) const override;
47 |     bool device_check_feature_support(DeviceId dev, const char* feature) const override;
48 | 
49 |     typedef std::unordered_map<std::string, CUfunction> FunctionMap;
50 | 
51 |     struct DeviceData {
52 |         CUdevice dev;
53 |         CUcontext ctx;
54 |         CUjit_target compute_capability;
55 |         std::atomic_flag locked = ATOMIC_FLAG_INIT;
56 |         std::unordered_map<std::string, CUmodule> modules;
57 |         std::unordered_map<CUmodule, FunctionMap> functions;
58 |         std::string name;
59 | 
60 |         DeviceData() {}
61 |         DeviceData(const DeviceData&) = delete;
62 |         DeviceData(DeviceData&& data)
63 |             : dev(data.dev)
64 |             , ctx(data.ctx)
65 |             , compute_capability(data.compute_capability)
66 |             , modules(std::move(data.modules))
67 |             , functions(std::move(data.functions))
68 |             , name(std::move(data.name))
69 |         {}
70 | 
71 |         void lock() {
72 |             while (locked.test_and_set(std::memory_order_acquire)) ;
73 |         }
74 | 
75 |         void unlock() {
76 |             locked.clear(std::memory_order_release);
77 |         }
78 |     };
79 | 
80 |     std::vector<DeviceData> devices_;
81 | 
82 |     bool dump_binaries = false;
83 | 
84 |     struct ProfileData {
85 |         CudaPlatform* platform;
86 |         CUcontext ctx;
87 |         CUevent start;
88 |         CUevent end;
89 |     };
90 | 
91 |     std::mutex profile_lock_;
92 |     std::forward_list<ProfileData*> profiles_;
93 |     void erase_profiles(bool);
94 | 
95 |     CUfunction load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
96 | 
97 |     std::string compile_nvptx(DeviceId dev, const std::string& filename, const std::string& program_string) const;
98 |     std::string compile_nvvm(DeviceId dev, const std::string& filename, const std::string& program_string) const;
99 |     std::string compile_cuda(DeviceId dev, const std::string& filename, const std::string& program_string) const;
100 |     CUmodule create_module(DeviceId dev, const std::string& filename, const std::string& ptx_string) const;
101 | };
102 | 
103 | #endif
104 | 
--------------------------------------------------------------------------------
/src/dummy_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef DUMMY_PLATFORM_H
2 | #define DUMMY_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | #include "runtime.h"
6 | 
7 | #include <string>
8 | 
9 | /// Dummy platform, implemented to report an error for any attempted device operation.
10 | class DummyPlatform : public Platform {
11 | public:
12 |     DummyPlatform(Runtime* runtime, const std::string& name)
13 |         : Platform(runtime), name_(name)
14 |     {}
15 | 
16 | protected:
17 |     void* alloc(DeviceId, int64_t) override { platform_error(); }
18 |     void* alloc_host(DeviceId, int64_t) override { platform_error(); }
19 |     void* alloc_unified(DeviceId, int64_t) override { platform_error(); }
20 |     void* get_device_ptr(DeviceId, void*) override { platform_error(); }
21 |     void release(DeviceId, void*) override { platform_error(); }
22 |     void release_host(DeviceId, void*) override { platform_error(); }
23 | 
24 |     void launch_kernel(DeviceId, const LaunchParams&) override { platform_error(); }
25 |     void synchronize(DeviceId) override { platform_error(); }
26 | 
27 |     void copy(DeviceId, const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); }
28 |     void copy_from_host(const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); }
29 |     void copy_to_host(DeviceId, const void*, int64_t, void*, int64_t, int64_t) override { platform_error(); }
30 | 
31 |     size_t dev_count() const override { return 0; }
32 |     std::string name() const override { return name_; }
33 |     const char* device_name(DeviceId) const override { return "Dummy"; }
34 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
35 | 
36 |     std::string name_;
37 | };
38 | 
39 | #endif
40 | 
--------------------------------------------------------------------------------
/src/extract_runtime_srcs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | def main():
5 |     col, maxcols = 0, 10
6 |     for f in sys.argv[1:]:
7 |         with open(f, "r") as fd:
8 |             for b in fd.read():
9 |                 sys.stdout.write("{:3}, ".format(ord(b)))
10 |                 col += 1
11 |                 if col == maxcols:
12 |                     sys.stdout.write("\n")
13 |                     col = 0
14 | 
15 | if __name__ == "__main__":
16 |     main()
17 | 
--------------------------------------------------------------------------------
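The script above flattens source files into a comma-separated list of byte values; jit.cpp includes that output to embed the Impala runtime sources into the library. A usage sketch (editor's addition; the exact file list comes from the build setup, so the arguments here are illustrative):

    python3 extract_runtime_srcs.py platforms/artic/runtime.impala > runtime_srcs.inc

and on the consuming side, exactly as jit.cpp does below:

    static const char runtime_srcs[] = {
    #include "runtime_srcs.inc"
    0
    };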
/src/hsa_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef HSA_PLATFORM_H
2 | #define HSA_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | #include "runtime.h"
6 | 
7 | #include <atomic>
8 | #include <string>
9 | #include <unordered_map>
10 | #include <vector>
11 | 
12 | #include <hsa/hsa.h>
13 | #include <hsa/hsa_ext_amd.h>
14 | 
15 | namespace llvm {
16 | class OptimizationLevel;
17 | }
18 | 
19 | /// HSA platform. Has the same number of devices as that of the HSA implementation.
20 | class HSAPlatform : public Platform {
21 | public:
22 |     HSAPlatform(Runtime* runtime);
23 |     ~HSAPlatform();
24 | 
25 | protected:
26 |     void* alloc(DeviceId dev, int64_t size) override { return alloc_hsa(size, devices_[dev].amd_coarsegrained_pool); }
27 |     void* alloc_host(DeviceId dev, int64_t size) override { return alloc_hsa(size, devices_[dev].amd_coarsegrained_pool); }
28 |     void* alloc_unified(DeviceId dev, int64_t size) override { return alloc_hsa(size, devices_[dev].finegrained_region); }
29 |     void* get_device_ptr(DeviceId, void* ptr) override { return ptr; }
30 |     void release(DeviceId dev, void* ptr) override;
31 |     void release_host(DeviceId dev, void* ptr) override { release(dev, ptr); }
32 | 
33 |     void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
34 |     void synchronize(DeviceId dev) override;
35 | 
36 |     void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size);
37 |     void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
38 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
39 |     void copy_to_host(DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
40 | 
41 |     size_t dev_count() const override { return devices_.size(); }
42 |     std::string name() const override { return "HSA"; }
43 |     const char* device_name(DeviceId dev) const override;
44 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
45 | 
46 |     struct KernelInfo {
47 |         uint64_t kernel;
48 |         uint32_t kernarg_segment_size;
49 |         uint32_t group_segment_size;
50 |         uint32_t private_segment_size;
51 |         void* kernarg_segment;
52 |     };
53 | 
54 |     typedef std::unordered_map<std::string, KernelInfo> KernelMap;
55 | 
56 |     struct DeviceData {
57 |         hsa_agent_t agent;
58 |         hsa_profile_t profile;
59 |         hsa_default_float_rounding_mode_t float_mode;
60 |         std::string isa;
61 |         hsa_queue_t* queue;
62 |         hsa_signal_t signal;
63 |         hsa_region_t kernarg_region, finegrained_region, coarsegrained_region;
64 |         hsa_amd_memory_pool_t amd_kernarg_pool, amd_finegrained_pool, amd_coarsegrained_pool;
65 |         std::atomic_flag locked = ATOMIC_FLAG_INIT;
66 |         std::unordered_map<std::string, hsa_executable_t> programs;
67 |         std::unordered_map<uint64_t, KernelMap> kernels;
68 |         std::string name;
69 | 
70 |         DeviceData() {}
71 |         DeviceData(const DeviceData&) = delete;
72 |         DeviceData(DeviceData&& data)
73 |             : agent(data.agent)
74 |             , profile(data.profile)
75 |             , float_mode(data.float_mode)
76 |             , isa(data.isa)
77 |             , queue(data.queue)
78 |             , signal(data.signal)
79 |             , kernarg_region(data.kernarg_region)
80 |             , finegrained_region(data.finegrained_region)
81 |             , coarsegrained_region(data.coarsegrained_region)
82 |             , amd_kernarg_pool(data.amd_kernarg_pool)
83 |             , amd_finegrained_pool(data.amd_finegrained_pool)
84 |             , amd_coarsegrained_pool(data.amd_coarsegrained_pool)
85 |             , programs(std::move(data.programs))
86 |             , kernels(std::move(data.kernels))
87 |             , name(data.name)
88 |         {}
89 | 
90 |         void lock() {
91 |             while (locked.test_and_set(std::memory_order_acquire)) ;
92 |         }
93 | 
94 |         void unlock() {
95 |             locked.clear(std::memory_order_release);
96 |         }
97 |     };
98 | 
99 |     uint64_t frequency_;
100 |     std::vector<DeviceData> devices_;
101 | 
102 |     void* alloc_hsa(int64_t, hsa_region_t);
103 |     void* alloc_hsa(int64_t, hsa_amd_memory_pool_t);
104 |     static hsa_status_t iterate_agents_callback(hsa_agent_t, void*);
105 |     static hsa_status_t iterate_regions_callback(hsa_region_t, void*);
106 |     static hsa_status_t iterate_memory_pools_callback(hsa_amd_memory_pool_t, void*);
107 |     KernelInfo& load_kernel(DeviceId, const std::string&, const std::string&);
108 |     std::string compile_gcn(DeviceId, const std::string&, const std::string&) const;
109 |     std::string emit_gcn(const std::string&, const std::string&, const std::string&, llvm::OptimizationLevel) const;
110 | };
111 | 
112 | #endif
113 | 
--------------------------------------------------------------------------------
/src/jit.cpp:
--------------------------------------------------------------------------------
1 | #include <memory>
2 | #include <sstream>
3 | #include <vector>
4 | 
5 | #include <llvm/ExecutionEngine/ExecutionEngine.h>
6 | #include <llvm/ExecutionEngine/MCJIT.h>
7 | #include <llvm/IR/LLVMContext.h>
8 | #include <llvm/IR/Module.h>
9 | #include <llvm/IRReader/IRReader.h>
10 | #include <llvm/Support/DynamicLibrary.h>
11 | #include <llvm/Support/Host.h>
12 | #include <llvm/Support/SourceMgr.h>
13 | #include <llvm/Support/TargetSelect.h>
14 | #include <llvm/Support/raw_os_ostream.h>
15 | 
16 | #include <thorin/world.h>
17 | #include <thorin/be/llvm/cpu.h>
18 | #include <thorin/be/llvm/llvm.h>
19 | 
20 | #include "anydsl_jit.h"
21 | #include "log.h"
22 | #include "runtime.h"
23 | 
24 | bool compile(
25 |     const std::vector<std::string>& file_names,
26 |     const std::vector<std::string>& file_data,
27 |     thorin::World& world,
28 |     std::ostream& error_stream);
29 | 
30 | static const char runtime_srcs[] = {
31 | #include "runtime_srcs.inc"
32 | 0
33 | };
34 | 
35 | struct JIT {
36 |     struct Program {
37 |         Program(llvm::ExecutionEngine* engine) : engine(engine) {}
38 |         llvm::ExecutionEngine* engine;
39 |     };
40 | 
41 |     std::vector<Program> programs;
42 |     Runtime* runtime;
43 |     thorin::LogLevel log_level;
44 | 
45 |     JIT(Runtime* runtime) : runtime(runtime), log_level(thorin::LogLevel::Warn) {
46 |         llvm::InitializeNativeTarget();
47 |         llvm::InitializeNativeTargetAsmPrinter();
48 |     }
49 | 
50 |     int32_t compile(const char* program_src, uint32_t size, uint32_t opt) {
51 |         // The LLVM context and module have to be alive for the duration of this function
52 |         std::unique_ptr<llvm::LLVMContext> llvm_context;
53 |         std::unique_ptr<llvm::Module> llvm_module;
54 | 
55 |         std::string program_str = std::string(program_src, size);
56 |         size_t prog_key = std::hash<std::string>{}(program_str);
57 |         std::stringstream hex_stream;
58 |         hex_stream << std::hex << prog_key;
59 |         std::string cached_llvm = runtime->load_from_cache(program_str, ".llvm");
60 |         std::string module_name = "jit_" + hex_stream.str();
61 |         if (cached_llvm.empty()) {
62 |             bool debug = false;
63 |             assert(opt <= 3);
64 | 
65 |             thorin::Thorin thorin(module_name);
66 |             thorin.world().set(log_level);
67 |             thorin.world().set(std::make_shared<thorin::Stream>(std::cerr));
68 |             if (!::compile(
69 |                 { "runtime", module_name },
70 |                 { std::string(runtime_srcs), program_str },
71 |                 thorin.world(), std::cerr))
72 |                 error("JIT: error while compiling sources");
73 | 
74 |             thorin.opt();
75 | 
76 |             std::string host_triple, host_cpu, host_attr, hls_flags;
77 |             thorin::DeviceBackends backends(thorin.world(), opt, debug, hls_flags);
78 | 
79 |             thorin::llvm::CPUCodeGen cg(thorin, opt, debug, host_triple, host_cpu, host_attr);
80 |             std::tie(llvm_context, llvm_module) = cg.emit_module();
81 |             std::stringstream stream;
82 |             llvm::raw_os_ostream llvm_stream(stream);
83 |             llvm_module->print(llvm_stream, nullptr);
84 |             runtime->store_to_cache(program_str, stream.str(), ".llvm");
85 | 
86 |             for (auto& cg : backends.cgs) {
87 |                 if (cg) {
88 |                     if (std::string(cg->file_ext()) == ".hls")
89 |                         error("JIT compilation of hls not supported!");
90 |                     std::ostringstream stream;
91 |                     cg->emit_stream(stream);
92 |                     runtime->store_to_cache(cg->file_ext() + program_str, stream.str(), cg->file_ext());
93 |                     runtime->register_file(module_name + cg->file_ext(), stream.str());
94 |                 }
95 |             }
96 |         } else {
97 |             llvm::SMDiagnostic diagnostic_err;
98 |             llvm_context = std::make_unique<llvm::LLVMContext>();
99 |             llvm_module = llvm::parseIR(llvm::MemoryBuffer::getMemBuffer(cached_llvm)->getMemBufferRef(), diagnostic_err, *llvm_context);
100 | 
101 |             auto load_backend_src = [&](std::string ext) {
102 |                 std::string cached_src = runtime->load_from_cache(ext + program_str, ext);
103 |                 if (!cached_src.empty())
104 |                     runtime->register_file(module_name + ext, cached_src);
105 |             };
106 |             load_backend_src(".cl");
107 |             load_backend_src(".cu");
108 |             load_backend_src(".nvvm");
109 |             load_backend_src(".amdgpu");
110 |         }
111 | 
112 |         llvm::TargetOptions options;
113 |         options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
114 | 
115 |         auto engine = llvm::EngineBuilder(std::move(llvm_module))
116 |             .setEngineKind(llvm::EngineKind::JIT)
117 |             .setMCPU(llvm::sys::getHostCPUName())
118 |             .setTargetOptions(options)
119 |             .setOptLevel( opt == 0  ? llvm::CodeGenOptLevel::None    :
120 |                           opt == 1  ? llvm::CodeGenOptLevel::Less    :
121 |                           opt == 2  ? llvm::CodeGenOptLevel::Default :
122 |                        /* opt == 3 */ llvm::CodeGenOptLevel::Aggressive)
123 |             .create();
124 |         if (!engine)
125 |             return -1;
126 | 
127 |         engine->finalizeObject();
128 |         programs.push_back(Program(engine));
129 | 
130 |         return (int32_t)programs.size() - 1;
131 |     }
132 | 
133 |     void* lookup_function(int32_t key, const char* fn_name) {
134 |         if (key == -1)
135 |             return nullptr;
136 | 
137 |         return (void *)programs[key].engine->getFunctionAddress(fn_name);
138 |     }
139 | 
140 |     void link(const char* lib) {
141 |         llvm::sys::DynamicLibrary::LoadLibraryPermanently(lib);
142 |     }
143 | };
144 | 
145 | JIT& jit() {
146 |     static std::unique_ptr<JIT> jit(new JIT(&runtime()));
147 |     return *jit;
148 | }
149 | 
150 | void anydsl_set_cache_directory(const char* dir) {
151 |     jit().runtime->set_cache_directory(dir == nullptr ? std::string() : dir);
152 | }
153 | 
154 | const char* anydsl_get_cache_directory() {
155 |     static std::string dir;
156 |     dir = jit().runtime->get_cache_directory();
157 |     return dir.c_str();
158 | }
159 | 
160 | void anydsl_link(const char* lib) {
161 |     jit().link(lib);
162 | }
163 | 
164 | int32_t anydsl_compile(const char* program, uint32_t size, uint32_t opt) {
165 |     return jit().compile(program, size, opt);
166 | }
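// [Editor's sketch, not part of the original file] A typical round-trip
// through the JIT C API defined here; the exported symbol name "main" is an
// assumption for illustration.
static void example_jit_roundtrip(const char* src, uint32_t size) {
    int32_t key = anydsl_compile(src, size, /*opt=*/2);
    if (auto entry = reinterpret_cast<void (*)()>(anydsl_lookup_function(key, "main")))
        entry();
}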
168 | void anydsl_set_log_level(uint32_t log_level) {
169 |     jit().log_level = log_level <= 4 ? static_cast<thorin::LogLevel>(log_level) : thorin::LogLevel::Warn;
170 | }
171 | 
172 | void* anydsl_lookup_function(int32_t key, const char* fn_name) {
173 |     return jit().lookup_function(key, fn_name);
174 | }
175 | 
--------------------------------------------------------------------------------
/src/levelzero_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef LEVEL_ZERO_PLATFORM_H
2 | #define LEVEL_ZERO_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | 
6 | #include <cstdint>
7 | #include <string>
8 | #include <unordered_map>
9 | #include <vector>
10 | 
11 | #include <level_zero/ze_api.h>
12 | 
13 | 
14 | /// oneAPI Level Zero platform
15 | class LevelZeroPlatform : public Platform {
16 | public:
17 |     LevelZeroPlatform(Runtime* runtime);
18 |     ~LevelZeroPlatform();
19 | 
20 | protected:
21 |     void* alloc(DeviceId dev, int64_t size) override;
22 |     void* alloc_host(DeviceId, int64_t) override;
23 |     void* alloc_unified(DeviceId, int64_t) override;
24 |     void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); }
25 |     void release(DeviceId dev, void* ptr) override;
26 |     void release_host(DeviceId, void*) override;
27 | 
28 |     void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
29 |     void synchronize(DeviceId dev) override;
30 | 
31 |     void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
32 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
33 |     void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
34 | 
35 |     size_t dev_count() const override { return devices_.size(); }
36 |     std::string name() const override { return "oneAPI Level Zero"; }
37 |     const char* device_name(DeviceId dev) const override;
38 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
39 | 
40 |     typedef std::unordered_map<std::string, ze_kernel_handle_t> KernelMap;
41 | 
42 |     struct DeviceData {
43 |         LevelZeroPlatform* parent;
44 |         ze_driver_handle_t driver;
45 |         ze_device_handle_t device;
46 |         std::string device_name;
47 |         ze_command_list_handle_t queue = nullptr;
48 |         ze_context_handle_t ctx = nullptr;
49 |         std::unordered_map<std::string, ze_module_handle_t> modules;
50 |         std::unordered_map<ze_module_handle_t, KernelMap> kernels;
51 |         double timerResolution;
52 | 
53 |         DeviceData(
54 |             LevelZeroPlatform* parent,
55 |             ze_driver_handle_t driver,
56 |             ze_device_handle_t device,
57 |             const std::string& device_name)
58 |             : parent(parent)
59 |             , driver(driver)
60 |             , device(device)
61 |             , device_name(device_name)
62 |         {}
63 |         DeviceData(DeviceData&&) = default;
64 |         DeviceData(const DeviceData&) = delete;
65 |     };
66 | 
67 |     std::vector<DeviceData> devices_;
68 |     std::vector<ze_context_handle_t> contexts_;
69 | 
70 |     ze_kernel_handle_t load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
71 |     friend void determineDeviceCapabilities(ze_device_handle_t hDevice, LevelZeroPlatform::DeviceData& device);
72 | };
73 | 
74 | #endif
75 | 
--------------------------------------------------------------------------------
/src/log.h:
--------------------------------------------------------------------------------
1 | #ifndef LOG_H
2 | #define LOG_H
3 | 
4 | #include <cassert>
5 | #include <cstdlib>
6 | #include <cstring>
7 | #include <iostream>
8 | 
9 | inline void unused() {}
10 | template <typename T, typename... Args>
11 | inline void unused(const T& t, Args... args) { (void)t; unused(args...); }
12 | 
13 | inline void print(std::ostream& os, const char* fmt) {
14 |     assert(!strchr(fmt, '%') && "Not enough arguments to print");
15 |     os << fmt << std::endl;
16 | }
17 | 
18 | template <typename T, typename... Args>
19 | void print(std::ostream& os, const char* fmt, const T& t, Args... args) {
20 |     auto ptr = strchr(fmt, '%');
21 |     while (ptr && ptr[1] == '%') ptr = strchr(ptr + 2, '%');
22 |     assert(ptr && "Too many arguments to print");
23 |     os.write(fmt, ptr - fmt);
24 |     os << t;
25 |     print(os, ptr + 1, args...);
26 | }
27 | 
28 | template <typename... Args>
29 | [[noreturn]] void error(Args... args) {
30 |     print(std::cerr, args...);
31 |     std::abort();
32 | }
33 | 
34 | template <typename... Args>
35 | void info(Args... args) {
36 |     print(std::cout, args...);
37 | }
38 | 
39 | template <typename... Args>
40 | void debug(Args... args) {
41 | #ifdef AnyDSL_runtime_ENABLE_DEBUG_OUTPUT
42 |     print(std::cout, args...);
43 | #else
44 |     unused(args...);
45 | #endif
46 | }
47 | 
48 | #endif
49 | 
--------------------------------------------------------------------------------
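A usage sketch for the minimal '%'-placeholder formatter above (editor's addition; the message text is illustrative):

    debug("loading kernel % from %", kernel_name, file_name); // each '%' consumes one argument
    error("unsupported feature: %", feature);                 // formats to stderr, then aborts

Doubled percent signs are skipped while scanning for the next placeholder, so they are emitted verbatim rather than consuming an argument.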
/src/opencl_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef OPENCL_PLATFORM_H
2 | #define OPENCL_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | 
6 | #include <atomic>
7 | #include <string>
8 | #include <unordered_map>
9 | #include <vector>
10 | 
11 | #ifdef __APPLE__
12 | #include <OpenCL/cl.h>
13 | #include <OpenCL/cl_ext.h>
14 | #else
15 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
16 | #include <CL/cl.h>
17 | #include <CL/cl_ext.h>
18 | #endif
19 | 
20 | /// OpenCL platform. Has the same number of devices as that of the OpenCL implementation.
21 | class OpenCLPlatform : public Platform {
22 | public:
23 |     OpenCLPlatform(Runtime* runtime);
24 |     ~OpenCLPlatform();
25 | 
26 | protected:
27 |     void* alloc(DeviceId dev, int64_t size) override;
28 |     void* alloc_host(DeviceId, int64_t) override { command_unavailable("alloc_host"); }
29 |     void* alloc_unified(DeviceId, int64_t) override;
30 |     void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); }
31 |     void release(DeviceId dev, void* ptr) override;
32 |     void release_host(DeviceId, void*) override { command_unavailable("release_host"); }
33 | 
34 |     void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
35 |     void synchronize(DeviceId dev) override;
36 | 
37 |     void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
38 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
39 |     void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
40 |     void copy_svm(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size);
41 |     void dynamic_profile(DeviceId dev, const std::string& filename);
42 | 
43 |     size_t dev_count() const override { return devices_.size(); }
44 |     std::string name() const override { return "OpenCL"; }
45 |     const char* device_name(DeviceId dev) const override;
46 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
47 | 
48 |     typedef std::unordered_map<std::string, cl_kernel> KernelMap;
49 | 
50 |     struct DeviceData {
51 |         OpenCLPlatform* parent;
52 |         cl_platform_id platform;
53 |         cl_device_id dev;
54 |         cl_uint version_major;
55 |         cl_uint version_minor;
56 |         std::string platform_name;
57 |         std::string device_name;
58 |         cl_command_queue queue = nullptr;
59 |         cl_context ctx = nullptr;
60 | #ifdef CL_VERSION_2_0
61 |         cl_device_svm_capabilities svm_caps;
62 | #endif
63 |         bool is_intel_fpga = false;
64 |         bool is_xilinx_fpga = false;
65 | 
66 |         std::unordered_map<std::string, cl_program> programs;
67 |         std::unordered_map<cl_program, KernelMap> kernels;
68 |         std::unordered_map<cl_kernel, cl_command_queue> kernels_queue;
69 | 
70 |         // Atomics do not have a move constructor. This structure introduces one.
71 |         struct AtomicData {
72 |             std::atomic_int timings_counter {};
73 |             std::atomic_flag lock = ATOMIC_FLAG_INIT;
74 |             AtomicData() = default;
75 |             AtomicData(AtomicData&&) {}
76 |         } atomic_data;
77 | 
78 |         DeviceData(
79 |             OpenCLPlatform* parent,
80 |             cl_platform_id platform,
81 |             cl_device_id dev,
82 |             cl_uint version_major,
83 |             cl_uint version_minor,
84 |             const std::string& platform_name,
85 |             const std::string& device_name)
86 |             : parent(parent)
87 |             , platform(platform)
88 |             , dev(dev)
89 |             , version_major(version_major)
90 |             , version_minor(version_minor)
91 |             , platform_name(platform_name)
92 |             , device_name(device_name)
93 |         {}
94 |         DeviceData(DeviceData&&) = default;
95 |         DeviceData(const DeviceData&) = delete;
96 | 
97 |         void lock() {
98 |             while (atomic_data.lock.test_and_set(std::memory_order_acquire)) ;
99 |         }
100 | 
101 |         void unlock() {
102 |             atomic_data.lock.clear(std::memory_order_release);
103 |         }
104 |     };
105 | 
106 |     std::vector<DeviceData> devices_;
107 | 
108 |     cl_kernel load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
109 |     cl_program load_program_binary(DeviceId dev, const std::string& filename, const std::string& program_string) const;
110 |     cl_program load_program_il(DeviceId dev, const std::string& filename, const std::string& program_string) const;
111 |     cl_program load_program_source(DeviceId dev, const std::string& filename, const std::string& program_string) const;
112 |     cl_program compile_program(DeviceId dev, cl_program program, const std::string& filename) const;
113 | 
114 |     friend void time_kernel_callback(cl_event, cl_int, void*);
115 | };
116 | 
117 | #endif
118 | 
--------------------------------------------------------------------------------
/src/pal/pal_device.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_DEVICE_DATA_H
2 | #define PAL_DEVICE_DATA_H
3 | 
4 | #include "../runtime.h"
5 | #include "pal_utils.h"
6 | 
7 | #include <atomic>
8 | #include <functional>
9 | #include <string>
10 | #include <unordered_map>
11 | 
12 | #include <palCmdBuffer.h>
13 | #include <palDevice.h>
14 | #include <palGpuMemory.h>
15 | #include <palPipeline.h>
16 | #include <palQueue.h>
17 | 
18 | class PALPlatform;
19 | 
20 | class PalDevice {
21 | public:
22 |     typedef Pal::gpusize GpuVirtAddr_t;
23 |     typedef std::unordered_map<std::string, Pal::IPipeline*> KernelMap;
24 | 
25 |     enum class queue_and_cmd_buffer_type { Compute, Universal, Dma };
26 | 
27 |     PalDevice() {}
28 |     PalDevice(Pal::IDevice* base_device, Runtime* runtime);
29 |     PalDevice(const PalDevice&) = delete;
30 |     PalDevice(PalDevice&& other)
31 |         : runtime_(other.runtime_)
32 |         , device_(other.device_)
33 |         , cmd_allocator_(other.cmd_allocator_)
34 |         , queue_(other.queue_)
35 |         , cmd_buffer_(other.cmd_buffer_)
36 |         , profiling_timestamps_(other.profiling_timestamps_)
37 |         , timestamps_frequency_(other.timestamps_frequency_)
38 |         , programs_(std::move(other.programs_))
39 |         , kernels_(std::move(other.kernels_))
40 |         , memory_objects_(std::move(other.memory_objects_))
41 |         , gfx_level(other.gfx_level)
42 |         , isa(std::move(other.isa))
43 |         , name(std::move(other.name)) {}
44 | 
45 |     ~PalDevice();
46 | 
47 |     void lock() {
48 |         while (locked_.test_and_set(std::memory_order_acquire))
49 |             ;
50 |     }
51 | 
52 |     void unlock() { locked_.clear(std::memory_order_release); }
53 | 
54 |     Pal::IPipeline* create_pipeline(const void* elf_data, size_t elf_data_size);
55 | 
56 |     // Allocates memory of the requested size on the requested gpu heap (controls visibility).
57 |     // Returns the virtual gpu address of the allocated memory.
58 |     GpuVirtAddr_t allocate_gpu_memory(Pal::gpusize size_in_bytes, Pal::GpuHeap heap);
59 | 
60 |     GpuVirtAddr_t allocate_shared_virtual_memory(Pal::gpusize sizeInBytes);
61 | 
62 |     void release_gpu_memory(GpuVirtAddr_t virtual_address);
63 |     void release_gpu_memory(void* virtual_address) {
64 |         release_gpu_memory(reinterpret_cast<GpuVirtAddr_t>(virtual_address));
65 |     }
66 | 
67 |     void copy_gpu_data(
68 |         const GpuVirtAddr_t source, GpuVirtAddr_t destination, const Pal::MemoryCopyRegion& copy_region);
69 |     void copy_gpu_data(const void* source, void* destination, const Pal::MemoryCopyRegion& copy_region) {
70 |         copy_gpu_data(reinterpret_cast<GpuVirtAddr_t>(source),
71 |             reinterpret_cast<GpuVirtAddr_t>(destination), copy_region);
72 |     }
73 | 
74 |     void dispatch(const Pal::CmdBufferBuildInfo& cmd_buffer_build_info,
75 |         const Pal::PipelineBindParams& pipeline_bind_params, const Pal::BarrierInfo& barrier_info,
76 |         const LaunchParams& launch_params);
77 | 
78 |     void WaitIdle();
79 | 
80 | private:
81 |     friend PALPlatform;
82 | 
83 |     Pal::Result init();
84 | 
85 |     // Creates a PAL queue object and corresponding command buffer object into the given pointers.
86 |     Pal::Result init_queue_and_cmd_buffer(queue_and_cmd_buffer_type type, Pal::IQueue*& queue, Pal::ICmdBuffer*& cmd_buffer);
87 | 
88 |     // Creates a PAL command allocator which is needed to allocate memory for all
89 |     // command buffer objects.
90 |     Pal::Result init_cmd_allocator();
91 | 
92 |     Pal::Result allocate_memory(Pal::gpusize size_in_bytes, Pal::GpuHeap heap,
93 |         Pal::IGpuMemory** gpu_memory_pp, Pal::gpusize alignment = 256 * 1024);
94 | 
95 |     // Returns the key to the new map entry. This key is the virtual gpu memory address.
96 |     GpuVirtAddr_t track_memory(Pal::IGpuMemory* memory);
97 |     void forget_memory(GpuVirtAddr_t gpu_address);
98 |     // Returns nullptr if key is not present in memory_objects map.
99 |     Pal::IGpuMemory* get_memory_object(const GpuVirtAddr_t gpu_address) const;
100 |     Pal::IGpuMemory* get_memory_object(const void* gpu_address) const {
101 |         return get_memory_object(reinterpret_cast<GpuVirtAddr_t>(gpu_address));
102 |     }
103 | 
104 |     // Build a buffer holding the kernel arguments and upload to the GPU.
105 |     // Returns the address of the buffer on the gpu.
106 |     GpuVirtAddr_t build_kernargs_buffer(const ParamsArgs& params_args, int num_args, const char* kernel_name);
107 | 
108 |     // Helper function that allocates a gpu-only buffer of the given size and uploads the data written by the
109 |     // write_callback
110 |     PalDevice::GpuVirtAddr_t write_data_to_gpu(
111 |         Pal::gpusize byte_size, std::function<void(void*)> write_callback);
112 | 
113 |     uint32_t calculate_launch_params_size(const ParamsArgs& params_args, uint32_t num_args);
114 |     // Write kernel arguments to memory. Returns the number of bytes occupied by the passed in kernel arguments.
115 |     size_t write_launch_params(const ParamsArgs& params_args, uint32_t num_args, void* memory, size_t memory_size);
116 | 
117 | private:
118 |     Runtime* runtime_ = nullptr;
119 | 
120 |     Pal::IDevice* device_ = nullptr;
121 |     Pal::ICmdAllocator* cmd_allocator_ = nullptr;
122 |     Pal::IQueue* queue_ = nullptr;
123 |     Pal::ICmdBuffer* cmd_buffer_ = nullptr;
124 | 
125 |     Pal::IQueue* dma_queue_ = nullptr;
126 |     Pal::ICmdBuffer* dma_cmd_buffer_ = nullptr;
127 | 
128 |     struct ProfilingTimestamps {
129 |         uint64_t start;
130 |         uint64_t end;
131 |     };
132 |     Pal::IGpuMemory* profiling_timestamps_ = nullptr;
133 |     uint64_t timestamps_frequency_ = 0;
134 | 
135 |     std::atomic_flag locked_ = ATOMIC_FLAG_INIT;
136 | 
137 |     std::unordered_map<std::string, std::string> programs_;
138 |     std::unordered_map<std::string, KernelMap> kernels_;
139 | 
140 |     // Map virtual addresses on the GPU to the PAL objects representing the memory.
141 |     // This is needed because AnyDSL assumes it deals with gpu-legal addresses in its API.
142 |     // However, to interact with PAL we need to have the wrapper objects at hand.
143 |     // The IGpuMemory objects should not be used outside of this class.
144 |     std::unordered_map<GpuVirtAddr_t, Pal::IGpuMemory*> memory_objects_;
145 | 
146 | public:
147 |     Pal::GfxIpLevel gfx_level;
148 |     std::string isa;
149 |     std::string name;
150 | };
151 | 
152 | #endif
--------------------------------------------------------------------------------
/src/pal/pal_fix_calling_convention_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_fix_calling_convention_pass.h"
2 | #include "pal_utils.h"
3 | 
4 | #include <unordered_set>
5 | 
6 | #include <llvm/IR/Function.h>
7 | #include <llvm/IR/Instructions.h>
8 | #include <llvm/IR/Module.h>
9 | 
10 | using namespace llvm;
11 | 
12 | void fix_calling_conv(Module* m, Function* f, std::unordered_set<Function*>& traversed_functions) {
13 |     if (traversed_functions.find(f) != traversed_functions.end()) {
14 |         // already visited this function -> prevent recursive loop
15 |         return;
16 |     }
17 | 
18 |     traversed_functions.insert(f);
19 |     f->addFnAttr(llvm::Attribute::AlwaysInline);
20 | 
21 |     // Find and inspect all function calls inside of this function
22 |     for (auto& bb : *f) {
23 |         for (auto& instruction : bb) {
24 |             if (CallInst* call_inst = dyn_cast<CallInst>(&instruction)) {
25 |                 if (call_inst->getCallingConv() != CallingConv::AMDGPU_Gfx) {
26 |                     call_inst->setCallingConv(CallingConv::AMDGPU_Gfx);
27 |                 }
28 | 
29 |                 if (Function* called_function = call_inst->getCalledFunction()) {
30 |                     fix_calling_conv(m, called_function, traversed_functions);
31 |                 }
32 |             }
33 |         }
34 |     }
35 | }
36 | 
37 | PreservedAnalyses PalPlatformFixCallingConventionPass::run(Module& M, ModuleAnalysisManager&) {
38 |     std::unordered_set<Function*> traversed_functions = {};
39 |     for (Function& entrypoint_fn : M) {
40 |         fix_calling_conv(&M, &entrypoint_fn, traversed_functions);
41 |     }
42 |     return PreservedAnalyses::all();
43 | }
44 | 
--------------------------------------------------------------------------------
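In LLVM IR terms, the pass above performs the following rewrite on every call reachable from a module entry point (editor's sketch, illustrative IR):

    ; before
    %r = call float @helper(float %x)
    ; after: calling convention forced, and @helper marked alwaysinline
    %r = call amdgpu_gfx float @helper(float %x)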
/src/pal/pal_fix_calling_convention_pass.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_PLATFORM_FIX_CALLING_CONVENTION_H
2 | #define PAL_PLATFORM_FIX_CALLING_CONVENTION_H
3 | 
4 | #include <llvm/IR/PassManager.h>
5 | 
6 | /// This pass sets the calling convention to AMDGPU_Gfx for all calls in the given module and sets the AlwaysInline
7 | /// Attribute on every called function in the module to avoid the LLVM AMDGPU backend throwing errors.
8 | struct PalPlatformFixCallingConventionPass : llvm::PassInfoMixin<PalPlatformFixCallingConventionPass> {
9 |     llvm::PreservedAnalyses run(llvm::Module& M, llvm::ModuleAnalysisManager&);
10 | };
11 | 
12 | #endif // PAL_PLATFORM_FIX_CALLING_CONVENTION_H
--------------------------------------------------------------------------------
/src/pal/pal_insert_halt_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_insert_halt_pass.h"
2 | #include "pal_utils.h"
3 | 
4 | #include <llvm/IR/Function.h>
5 | #include <llvm/IR/IRBuilder.h>
6 | #include <llvm/IR/InlineAsm.h>
7 | #include <llvm/IR/Module.h>
8 | 
9 | #include <cstdlib>
10 | 
11 | using namespace llvm;
12 | 
13 | PreservedAnalyses PalPlatformInsertHaltPass::run(Function& F, FunctionAnalysisManager&) {
14 |     char* halt_immediately = std::getenv("HALT_IMMEDIATELY");
15 |     if (F.getName() != pal_utils::ComputeShaderMainFnName || !halt_immediately
16 |         || strcmp(halt_immediately, "ON") != 0) {
17 |         return PreservedAnalyses::all();
18 |     }
19 |     assert(F.getCallingConv() == CallingConv::AMDGPU_CS);
20 |     LLVMContext& Ctx = F.getParent()->getContext();
21 |     BasicBlock& EntryBlock = *F.begin();
22 |     IRBuilder<> Builder(&(*EntryBlock.getFirstInsertionPt()));
23 |     ArrayRef<Value*> inline_asm_args;
24 |     InlineAsm* inline_assembly = InlineAsm::get(
25 |         FunctionType::get(Type::getVoidTy(Ctx), false), "s_sethalt 1", "", true, false, InlineAsm::AD_ATT);
26 |     Builder.CreateCall(inline_assembly, inline_asm_args);
27 |     return PreservedAnalyses::none();
28 | }
--------------------------------------------------------------------------------
/src/pal/pal_insert_halt_pass.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_PLATFORM_INSERT_HALT_PASS_H
2 | #define PAL_PLATFORM_INSERT_HALT_PASS_H
3 | 
4 | #include <llvm/IR/Function.h>
5 | #include <llvm/IR/PassManager.h>
6 | 
7 | /// Pass that inserts RDNA specific assembly to halt a shader as soon as it starts if the environment variable
8 | /// "HALT_IMMEDIATELY" is set to the value "ON"
9 | struct PalPlatformInsertHaltPass : llvm::PassInfoMixin<PalPlatformInsertHaltPass> {
10 |     llvm::PreservedAnalyses run(llvm::Function& F, llvm::FunctionAnalysisManager& FAM);
11 | };
12 | 
13 | #endif // PAL_PLATFORM_INSERT_HALT_PASS_H
--------------------------------------------------------------------------------
/src/pal/pal_lower_builtins_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_lower_builtins_pass.h"
2 | #include "pal_utils.h"
3 | 
4 | #include <array>
5 | #include <string>
6 | #include <unordered_set>
7 | #include <vector>
8 | 
9 | #include <llvm/ADT/ArrayRef.h>
10 | #include <llvm/ADT/StringRef.h>
11 | #include <llvm/IR/Function.h>
12 | #include <llvm/IR/IRBuilder.h>
13 | #include <llvm/IR/InlineAsm.h>
14 | #include <llvm/IR/Instructions.h>
15 | #include <llvm/IR/Module.h>
16 | 
17 | using namespace llvm;
18 | 
19 | namespace {
20 | // anonymous namespace to avoid name clashes
21 | 
22 | enum Builtins : int8_t {
23 |     workitem_id_x = 0,
24 |     workitem_id_y,
25 |     workitem_id_z,
26 |     workgroup_id_x,
27 |     workgroup_id_y,
28 |     workgroup_id_z,
29 |     nblk_x,
30 |     nblk_y,
31 |     nblk_z,
32 |     // Dynamic builtins (i.e., inlined code based on supplied metadata)
33 |     workgroup_size_x,
34 |     workgroup_size_y,
35 |     workgroup_size_z,
36 |     count
37 | };
38 | 
39 | constexpr const char* BuiltinNames[] = {
40 |     "anydsl.amdpal.workitem.id.x",
41 |     "anydsl.amdpal.workitem.id.y",
42 |     "anydsl.amdpal.workitem.id.z",
43 |     "anydsl.amdpal.workgroup.id.x",
44 |     "anydsl.amdpal.workgroup.id.y",
45 |     "anydsl.amdpal.workgroup.id.z",
46 |     "anydsl.amdpal.nblk.x",
47 |     "anydsl.amdpal.nblk.y",
48 |     "anydsl.amdpal.nblk.z",
49 |     // Dynamic builtins (i.e., inlined code based on supplied metadata)
50 |     "anydsl.amdpal.workgroup.size.x",
51 |     "anydsl.amdpal.workgroup.size.y",
"anydsl.amdpal.workgroup.size.z", 53 | }; 54 | 55 | struct BuiltinAssemblyInfo { 56 | const char* asmString; 57 | const char* asmConstraints; 58 | }; 59 | 60 | // PAL SGPR layout: 61 | // s0-1: PAL reserved data -> set up by PAL because of pipeline register configuration in PALPlatform 62 | // s2-3: pointer to pal kernel args (for compute shader) 63 | // -> set up by AnyDSL PALPlatform 64 | // s4-5: pointer to NumWorkGroups struct (i.e., nblk) 65 | // s6-12: reserved for future use 66 | // s13-15: work group id x, y, and z -> set up by AnyDSL PALPlatform by supplying pgm_rsrc2 67 | // ENABLE_SGPR_WORKGROUP_ID_ to PAL pipeline setup 68 | 69 | const BuiltinAssemblyInfo BuiltinAssemblyInfos[]{ 70 | // workitem_id_x 71 | { 72 | .asmString = "; local thread id x is in v0", 73 | .asmConstraints = "={v0}", 74 | }, 75 | // workitem_id_y 76 | { 77 | .asmString = "; local thread id y is in v1", 78 | .asmConstraints = "={v1}", 79 | }, 80 | // workitem_id_z 81 | { 82 | .asmString = "; local thread id z is in v2", 83 | .asmConstraints = "={v2}", 84 | }, 85 | // workgroup_id_x 86 | { 87 | .asmString = "; workgroup id x is in s13", 88 | .asmConstraints = "={s13}", 89 | }, 90 | // workgroup_id_y 91 | { 92 | .asmString = "; workgroup id y is in s14", 93 | .asmConstraints = "={s14}", 94 | }, 95 | // workgroup_id_z 96 | { 97 | .asmString = "; workgroup id z is in s15", 98 | .asmConstraints = "={s15}", 99 | }, 100 | // nblk_x 101 | { 102 | .asmString = "s_load_dword $0, s[4:5], 0x00", 103 | .asmConstraints = "=s", 104 | }, 105 | // nblk_y 106 | { 107 | .asmString = "s_load_dword $0, s[4:5], 0x04", 108 | .asmConstraints = "=s", 109 | }, 110 | // nblk_z 111 | { 112 | .asmString = "s_load_dword $0, s[4:5], 0x08", 113 | .asmConstraints = "=s", 114 | }, 115 | }; 116 | 117 | typedef std::array, static_cast(Builtins::count)> BuiltinsCallInstMap; 118 | 119 | Builtins GetBuiltinID(Function* f) { 120 | const StringRef f_name = f->getName(); 121 | for (int8_t i = 0; i < Builtins::count; ++i) { 122 | if (f_name == BuiltinNames[i]) { 123 | return Builtins(i); 124 | } 125 | } 126 | return Builtins::count; 127 | } 128 | 129 | const BuiltinAssemblyInfo& GetAssemblyInfo(Builtins builtinID) { 130 | return BuiltinAssemblyInfos[static_cast(builtinID)]; 131 | } 132 | 133 | bool IsBuiltin(Function* f) { return GetBuiltinID(f) < Builtins::count; } 134 | 135 | void find_builtins_calls(Module* m, Function* f, BuiltinsCallInstMap& builtins_call_instances, 136 | std::unordered_set& traversed_functions) { 137 | if (traversed_functions.find(f) != traversed_functions.end()) { 138 | // already visited this function -> prevent recursive loop 139 | return; 140 | } 141 | 142 | traversed_functions.insert(f); 143 | 144 | // Find and inspect all function calls inside of this function 145 | for (auto& bb : *f) { 146 | for (auto& instruction : bb) { 147 | CallInst* callInst = dyn_cast(&instruction); 148 | if (!callInst) { 149 | continue; 150 | } 151 | 152 | Function* calledFunction = callInst->getCalledFunction(); 153 | if (!calledFunction) { 154 | continue; 155 | } 156 | 157 | if (IsBuiltin(calledFunction)) { 158 | // If the call we found is calling a builtin, record the builtins usage 159 | Builtins builtinID = GetBuiltinID(calledFunction); 160 | builtins_call_instances[static_cast(builtinID)].push_back(callInst); 161 | } else if (calledFunction->getParent() == m) { 162 | // If the called function is within this module, recursively search it for 163 | // builtins used 164 | find_builtins_calls(m, calledFunction, builtins_call_instances, 
165 |             }
166 |         }
167 |     }
168 | }
169 | 
170 | Function* find_entrypoint(Module& M) {
171 |     for (Function& F : M) {
172 |         const auto name = F.getName();
173 |         if (name.equals(pal_utils::ComputeShaderMainFnName))
174 |             return &F;
175 |     }
176 | 
177 |     return nullptr;
178 | }
179 | 
180 | // Function taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
181 | BasicBlock::iterator getInsertPt(BasicBlock& BB) {
182 |     BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
183 |     for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
184 |         AllocaInst* AI = dyn_cast<AllocaInst>(&*InsPt);
185 | 
186 |         // If this is a dynamic alloca, the value may depend on the loaded kernargs,
187 |         // so loads will need to be inserted before it.
188 |         if (!AI || !AI->isStaticAlloca())
189 |             break;
190 |     }
191 | 
192 |     return InsPt;
193 | }
194 | 
195 | CallInst* insert_asm(
196 |     IRBuilder<>& Builder, LLVMContext& Ctx, const char* asm_string, const char* asm_constraint) {
197 |     ArrayRef<Value*> inline_asm_args;
198 |     InlineAsm* inline_assembly = InlineAsm::get(FunctionType::get(Type::getInt32Ty(Ctx), false), asm_string,
199 |         asm_constraint, true, false, InlineAsm::AD_ATT);
200 |     return Builder.CreateCall(inline_assembly, inline_asm_args);
201 | }
202 | 
203 | // Inserts assembly code to split the local thread id from v0 into v0(x), v1(y) and v2(z).
204 | // This is only applicable for GPUs >= gfx 11.
205 | void insert_asm_to_split_local_thread_id(IRBuilder<>& Builder, LLVMContext& Ctx,
206 |     const BuiltinsCallInstMap& builtins_call_instances, Pal::GfxIpLevel gfx_level) {
207 |     assert(gfx_level >= Pal::GfxIpLevel::GfxIp11_0);
208 |     // Write local thread id z into v2.
209 |     if (!builtins_call_instances[Builtins::workitem_id_z].empty()) {
210 |         insert_asm(Builder, Ctx,
211 |             "; def v2 local thread id z is in v0[29:20] (v0[31:30] set to 0 by hardware)\n\t"
212 |             "V_LSHRREV_B32 v2 20 v0",
213 |             "={v2}");
214 |     }
215 |     // Write local thread id y into v1.
216 |     if (!builtins_call_instances[Builtins::workitem_id_y].empty()) {
217 |         insert_asm(Builder, Ctx,
218 |             "; def v1 local thread id y is in v0[19:10]\n\t"
219 |             "V_LSHRREV_B32 v1 10 v0\n\t"
220 |             "V_AND_B32 v1 v1 0x3FF",
221 |             "={v1}");
222 |     }
223 |     // Write local thread id x into v0 last to make sure v0 is not overwritten yet.
224 |     if (!builtins_call_instances[Builtins::workitem_id_x].empty()) {
225 |         insert_asm(Builder, Ctx,
226 |             "; def v0 local thread id x is in v0[9:0]\n\t"
227 |             "V_AND_B32 v0 v0 0x3FF",
228 |             "={v0}");
229 |     }
230 | }
231 | 
232 | } // namespace
233 | 
234 | PreservedAnalyses PalPlatformLowerBuiltinsPass::run(Module& M, ModuleAnalysisManager&) {
235 |     Function* entrypoint_fn = find_entrypoint(M);
236 |     assert(entrypoint_fn);
237 | 
238 |     /*
239 |     Find all calls to builtins and unique them
240 |     -> i.e. every builtin is only called exactly once right at the beginning of the shader.
241 | 
242 |     for each instruction in entrypoint:
243 |         if call to builtin:
244 |             record builtin (unique set of used_builtins + all separate calls to them!)
245 | elif call to another function inside this module: 246 | recursively find all calls of used built_ins 247 | else: don't care 248 | 249 | for each used_builtin: 250 | Value* real_builtin = insert inline_asm at beginning of entrypoint 251 | for each call instance of the builtin: 252 | replace all uses of call instance with real_builtin 253 | remove old call instance 254 | */ 255 | 256 | BuiltinsCallInstMap builtins_call_instances; 257 | std::unordered_set traversed_functions = {}; 258 | find_builtins_calls(&M, entrypoint_fn, builtins_call_instances, traversed_functions); 259 | 260 | LLVMContext& Ctx = M.getContext(); 261 | BasicBlock& EntryBlock = *entrypoint_fn->begin(); 262 | IRBuilder<> Builder(&*getInsertPt(EntryBlock)); 263 | 264 | if (gfx_level_ >= Pal::GfxIpLevel::GfxIp11_0) { 265 | insert_asm_to_split_local_thread_id(Builder, Ctx, builtins_call_instances, gfx_level_); 266 | } 267 | 268 | int builtins_count = static_cast(Builtins::count); 269 | for (int i = 0; i < builtins_count; ++i) { 270 | const Builtins builtin_id = Builtins(i); 271 | const std::vector builtin_call_instances = builtins_call_instances[i]; 272 | if (builtin_call_instances.empty()) { 273 | continue; 274 | } 275 | 276 | CallInst* lowered_unique_builtin = nullptr; 277 | switch (builtin_id) { 278 | case Builtins::workgroup_size_x: 279 | lowered_unique_builtin = insert_asm(Builder, Ctx, 280 | ("s_mov_b32 $0, " + std::to_string(tg_dims_[0]) + "; workgroup size x").c_str(), "=s"); 281 | break; 282 | case Builtins::workgroup_size_y: 283 | lowered_unique_builtin = insert_asm(Builder, Ctx, 284 | ("s_mov_b32 $0, " + std::to_string(tg_dims_[1]) + "; workgroup size y").c_str(), "=s"); 285 | break; 286 | case Builtins::workgroup_size_z: 287 | lowered_unique_builtin = insert_asm(Builder, Ctx, 288 | ("s_mov_b32 $0, " + std::to_string(tg_dims_[2]) + "; workgroup size z").c_str(), "=s"); 289 | break; 290 | default: 291 | const auto& assemblyInfo = GetAssemblyInfo(builtin_id); 292 | lowered_unique_builtin = 293 | insert_asm(Builder, Ctx, assemblyInfo.asmString, assemblyInfo.asmConstraints); 294 | } 295 | 296 | for (CallInst* call_to_builtin : builtin_call_instances) { 297 | call_to_builtin->replaceAllUsesWith(lowered_unique_builtin); 298 | } 299 | } 300 | 301 | for (int i = 0; i < static_cast(builtins_count); ++i) { 302 | const std::vector builtin_call_instances = builtins_call_instances[i]; 303 | for (CallInst* call_to_builtin : builtin_call_instances) { 304 | call_to_builtin->eraseFromParent(); 305 | } 306 | } 307 | // All uncalled functions from the module have to be removed because any kernels other than the one 308 | // marked as entrypoint may contain calls to builtins which have not been resolved by this pass but 309 | // may trip up linkers/relocations. Therefore we set all functions to internal linkage, except the 310 | // known entrypoint. This way, the global dead code elimination pass can remove them for us. 311 | for (Function& F : M) { 312 | if (F.getName().startswith("llvm")) { 313 | // Don't mark llvm intrinsics as internal linkage, otherwise they get 314 | // altered/removed which breaks backend codegen. 
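// (Intrinsics are declarations without a body, and a declaration with
// internal linkage is not valid LLVM IR, so they must keep their linkage.)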
315 | continue; 316 | } 317 | F.setLinkage(GlobalValue::LinkageTypes::InternalLinkage); 318 | } 319 | entrypoint_fn->setLinkage(GlobalValue::LinkageTypes::ExternalLinkage); 320 | 321 | return PreservedAnalyses::none(); 322 | } -------------------------------------------------------------------------------- /src/pal/pal_lower_builtins_pass.h: -------------------------------------------------------------------------------- 1 | #ifndef PAL_PLATFORM_LOWER_BUILTINS_H 2 | #define PAL_PLATFORM_LOWER_BUILTINS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | /// This pass takes care of replacing calls to so-called "builtins" (i.e. local/global thread indices, and similar 14 | /// compute shader builtin values) with the appropriate amdgpu inline assembly that extracts the values from 15 | /// prepopulated SGPRs according to the RDNA2 or RDNA3 ABI. This pass only supports gfx-levels 10 and 11. 16 | struct PalPlatformLowerBuiltinsPass : llvm::PassInfoMixin { 17 | PalPlatformLowerBuiltinsPass( 18 | Pal::GfxIpLevel gfx_level, std::array tg_dims) 19 | : gfx_level_(gfx_level) 20 | , tg_dims_(tg_dims) {} 21 | 22 | PalPlatformLowerBuiltinsPass(const PalPlatformLowerBuiltinsPass& other) = default; 23 | PalPlatformLowerBuiltinsPass& operator=(const PalPlatformLowerBuiltinsPass& other) = default; 24 | PalPlatformLowerBuiltinsPass(PalPlatformLowerBuiltinsPass&& other) = default; 25 | PalPlatformLowerBuiltinsPass& operator=(PalPlatformLowerBuiltinsPass&& other) = default; 26 | ~PalPlatformLowerBuiltinsPass() = default; 27 | 28 | llvm::PreservedAnalyses run(llvm::Module& M, llvm::ModuleAnalysisManager&); 29 | 30 | private: 31 | Pal::GfxIpLevel gfx_level_; 32 | std::array tg_dims_; 33 | }; 34 | 35 | #endif // PAL_PLATFORM_LOWER_BUILTINS_H -------------------------------------------------------------------------------- /src/pal/pal_lower_kernel_arguments_pass.cpp: -------------------------------------------------------------------------------- 1 | #include "pal_lower_kernel_arguments_pass.h" 2 | #include "pal_utils.h" 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace llvm; 15 | 16 | namespace { 17 | // anonymous namespace to avoid name clashes 18 | 19 | // Function taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp 20 | BasicBlock::iterator getInsertPt(BasicBlock& BB) { 21 | BasicBlock::iterator InsPt = BB.getFirstInsertionPt(); 22 | for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) { 23 | AllocaInst* AI = dyn_cast(&*InsPt); 24 | 25 | // If this is a dynamic alloca, the value may depend on the loaded kernargs, 26 | // so loads will need to be inserted before it. 27 | if (!AI || !AI->isStaticAlloca()) 28 | break; 29 | } 30 | 31 | return InsPt; 32 | } 33 | 34 | // Function based on AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, Align &MaxAlign) 35 | // Taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp 36 | uint64_t getExplicitKernArgSize(const Function& F, Align& MaxAlign) { 37 | assert(F.getCallingConv() == CallingConv::AMDGPU_CS); 38 | 39 | const DataLayout& DL = F.getParent()->getDataLayout(); 40 | uint64_t ExplicitArgBytes = 0; 41 | MaxAlign = Align(1); 42 | 43 | for (const Argument& Arg : F.args()) { 44 | const bool IsByRef = Arg.hasByRefAttr(); 45 | Type* ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); 46 | MaybeAlign ParamAlign = IsByRef ? 
Arg.getParamAlign() : std::nullopt; 47 | Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy); 48 | 49 | uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); 50 | ExplicitArgBytes = alignTo(ExplicitArgBytes, ABITypeAlign) + AllocSize; 51 | MaxAlign = std::max(MaxAlign, ABITypeAlign); 52 | } 53 | 54 | return ExplicitArgBytes; 55 | } 56 | 57 | // Function based on AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, Align &MaxAlign) 58 | // Taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp 59 | unsigned getKernArgSegmentSize(const Function& F, Align& MaxAlign) { 60 | uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); 61 | unsigned ExplicitOffset = 0; 62 | // Being able to dereference past the end is useful for emitting scalar loads. 63 | return alignTo(ExplicitOffset + ExplicitArgBytes, 4); 64 | } 65 | } // namespace 66 | 67 | // Largely based on the function AMDGPULowerKernelArguments::runOnFunction(Function &F) 68 | // taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp 69 | // Minor adaptations added to satisfy the AnyDSL PALPlatform requirements. 70 | PreservedAnalyses PalPlatformLowerKernelArgumentsPass::run(Function& F, FunctionAnalysisManager&) { 71 | const auto& funcname = F.getName(); 72 | if (funcname != pal_utils::ComputeShaderMainFnName || F.arg_empty()) { 73 | // Only the entry point function's parameters are kernel arguments that need to be lowered. 74 | return PreservedAnalyses::all(); 75 | } 76 | assert(F.getCallingConv() == CallingConv::AMDGPU_CS); 77 | 78 | LLVMContext& Ctx = F.getParent()->getContext(); 79 | const DataLayout& DL = F.getParent()->getDataLayout(); 80 | BasicBlock& EntryBlock = *F.begin(); 81 | IRBuilder<> Builder(&*getInsertPt(EntryBlock)); 82 | 83 | const Align KernArgBaseAlign(16); // FIXME: Increase if necessary 84 | const uint64_t BaseOffset = 0; // We don't have any data preceding the kernel arguments 85 | 86 | Align MaxAlign; 87 | // TODO: We have to extract that from the Function arguments ourselves! 
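// Worked example with a hypothetical argument list (i32, ptr addrspace(1), i16):
// getExplicitKernArgSize accumulates alignTo(0,4)+4 = 4, alignTo(4,8)+8 = 16,
// alignTo(16,2)+2 = 18 bytes, and getKernArgSegmentSize rounds up to a
// multiple of 4 -> a 20-byte kernarg segment.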
88 | const uint64_t TotalKernArgSize = getKernArgSegmentSize(F, MaxAlign); 89 | if (TotalKernArgSize == 0) 90 | return PreservedAnalyses::all(); 91 | 92 | // Generate Our own ISA to get the pointer to the buffer containing the kernel arguments 93 | // PALPlatform ensures that registers s[2:3] contain this address when the kernel starts execution 94 | std::string asmString = std::string("; def $0 pointer to buffer containing the kernel args is set up in s[2:3]"); 95 | // Constraints reference: https://llvm.org/docs/LangRef.html#inline-asm-constraint-string 96 | // This constraint states that our inline assembly returns ("="-prefix indicates constraint for output) 97 | // its result in sgprs 2-3 98 | StringRef constraints = "={s[2:3]}"; 99 | ArrayRef inline_asm_args = std::nullopt; 100 | 101 | // Value taken from AMDGPU.h (namespace AMDGPUAS) 102 | // global address space pointing to memory that won't change during execution 103 | unsigned CONSTANT_ADDRESS = 4; 104 | InlineAsm* inline_assembly = 105 | InlineAsm::get(FunctionType::get(Type::getInt8PtrTy(Ctx, CONSTANT_ADDRESS), false), asmString.c_str(), 106 | constraints, true, false, InlineAsm::AD_ATT); 107 | CallInst* KernArgSegment = Builder.CreateCall(inline_assembly, inline_asm_args); 108 | 109 | KernArgSegment->addRetAttr(Attribute::NonNull); 110 | KernArgSegment->addRetAttr(Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); 111 | unsigned AS = KernArgSegment->getType()->getPointerAddressSpace(); 112 | 113 | uint64_t ExplicitArgOffset = 0; 114 | 115 | for (Argument& Arg : F.args()) { 116 | const bool IsByRef = Arg.hasByRefAttr(); 117 | Type* ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); 118 | MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt; 119 | Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy); 120 | 121 | uint64_t Size = DL.getTypeSizeInBits(ArgTy); 122 | uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); 123 | 124 | uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; 125 | ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; 126 | 127 | if (Arg.use_empty()) 128 | continue; 129 | 130 | // If this is byval, the loads are already explicit in the function. We just 131 | // need to rewrite the pointer values. 132 | if (IsByRef) { 133 | Value* ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64( 134 | Builder.getInt8Ty(), KernArgSegment, EltOffset, Arg.getName() + ".byval.kernarg.offset"); 135 | 136 | Value* CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(ArgOffsetPtr, Arg.getType()); 137 | Arg.replaceAllUsesWith(CastOffsetPtr); 138 | continue; 139 | } 140 | 141 | if (PointerType* PT = dyn_cast(ArgTy)) { 142 | // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing 143 | // modes on SI to know the high bits are 0 so pointer adds don't wrap. We 144 | // can't represent this with range metadata because it's only allowed for 145 | // integer types. 146 | 147 | // Values taken from AMDGPU.h (namespace AMDGPUAS) 148 | const unsigned REGION_ADDRESS = 2; ///< Address space for region memory. (GDS) 149 | const unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory. 150 | if ((PT->getAddressSpace() == LOCAL_ADDRESS || PT->getAddressSpace() == REGION_ADDRESS)) 151 | continue; 152 | 153 | // FIXME: We can replace this with equivalent alias.scope/noalias 154 | // metadata, but this appears to be a lot of work. 
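// Until then, skipping keeps noalias pointer arguments as plain arguments
// instead of silently dropping the attribute on the replacing load.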
155 | if (Arg.hasNoAliasAttr()) 156 | continue; 157 | } 158 | 159 | auto* VT = dyn_cast(ArgTy); 160 | bool IsV3 = VT && VT->getNumElements() == 3; 161 | bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType(); 162 | 163 | VectorType* V4Ty = nullptr; 164 | 165 | int64_t AlignDownOffset = alignDown(EltOffset, 4); 166 | int64_t OffsetDiff = EltOffset - AlignDownOffset; 167 | Align AdjustedAlign = commonAlignment(KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset); 168 | 169 | Value* ArgPtr; 170 | Type* AdjustedArgTy; 171 | if (DoShiftOpt) { // FIXME: Handle aggregate types 172 | // Since we don't have sub-dword scalar loads, avoid doing an extload by 173 | // loading earlier than the argument address, and extracting the relevant 174 | // bits. 175 | // 176 | // Additionally widen any sub-dword load to i32 even if suitably aligned, 177 | // so that CSE between different argument loads works easily. 178 | ArgPtr = Builder.CreateConstInBoundsGEP1_64(Builder.getInt8Ty(), KernArgSegment, AlignDownOffset, 179 | Arg.getName() + ".kernarg.offset.align.down"); 180 | AdjustedArgTy = Builder.getInt32Ty(); 181 | } else { 182 | ArgPtr = Builder.CreateConstInBoundsGEP1_64( 183 | Builder.getInt8Ty(), KernArgSegment, EltOffset, Arg.getName() + ".kernarg.offset"); 184 | AdjustedArgTy = ArgTy; 185 | } 186 | 187 | if (IsV3 && Size >= 32) { 188 | V4Ty = FixedVectorType::get(VT->getElementType(), 4); 189 | // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads 190 | AdjustedArgTy = V4Ty; 191 | } 192 | 193 | ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS), ArgPtr->getName() + ".cast"); 194 | LoadInst* Load = Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign); 195 | Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); 196 | 197 | MDBuilder MDB(Ctx); 198 | 199 | if (isa(ArgTy)) { 200 | if (Arg.hasNonNullAttr()) 201 | Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {})); 202 | 203 | uint64_t DerefBytes = Arg.getDereferenceableBytes(); 204 | if (DerefBytes != 0) { 205 | Load->setMetadata(LLVMContext::MD_dereferenceable, 206 | MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), DerefBytes)))); 207 | } 208 | 209 | uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes(); 210 | if (DerefOrNullBytes != 0) { 211 | Load->setMetadata(LLVMContext::MD_dereferenceable_or_null, 212 | MDNode::get( 213 | Ctx, MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), DerefOrNullBytes)))); 214 | } 215 | 216 | auto ParamMaybeAlign = Arg.getParamAlign(); 217 | if (ParamMaybeAlign.has_value()) { 218 | Load->setMetadata(LLVMContext::MD_align, 219 | MDNode::get(Ctx, MDB.createConstant(ConstantInt::get( 220 | Builder.getInt64Ty(), ParamMaybeAlign.valueOrOne().value())))); 221 | } 222 | } 223 | 224 | // TODO: Convert noalias arg to !noalias 225 | 226 | if (DoShiftOpt) { 227 | Value* ExtractBits = OffsetDiff == 0 ? 
Load : Builder.CreateLShr(Load, OffsetDiff * 8); 228 | 229 | IntegerType* ArgIntTy = Builder.getIntNTy(Size); 230 | Value* Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy); 231 | Value* NewVal = Builder.CreateBitCast(Trunc, ArgTy, Arg.getName() + ".load"); 232 | Arg.replaceAllUsesWith(NewVal); 233 | } else if (IsV3) { 234 | Value* Shuf = Builder.CreateShuffleVector(Load, ArrayRef{0, 1, 2}, Arg.getName() + ".load"); 235 | Arg.replaceAllUsesWith(Shuf); 236 | } else { 237 | Load->setName(Arg.getName() + ".load"); 238 | Arg.replaceAllUsesWith(Load); 239 | } 240 | } 241 | 242 | KernArgSegment->addRetAttr(Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); 243 | 244 | return PreservedAnalyses::none(); 245 | } -------------------------------------------------------------------------------- /src/pal/pal_lower_kernel_arguments_pass.h: -------------------------------------------------------------------------------- 1 | #ifndef PAL_PLATFORM_LOWER_KERNEL_ARGUMENTS_H 2 | #define PAL_PLATFORM_LOWER_KERNEL_ARGUMENTS_H 3 | 4 | #include 5 | 6 | /// This pass replaces accesses to kernel arguments with loads from offsets from a manually supplied buffer 7 | /// containing these arguments. The pointer to this buffer is expected to be prepopulated into specific sgprs 8 | /// by the PALPlatform. 9 | /// 10 | /// This pass is an almost 1:1 replicate of the AMDGPULowerKernelArguments pass 11 | /// (llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp) 12 | struct PalPlatformLowerKernelArgumentsPass : llvm::PassInfoMixin { 13 | PalPlatformLowerKernelArgumentsPass(){} 14 | 15 | PalPlatformLowerKernelArgumentsPass(const PalPlatformLowerKernelArgumentsPass& other) = default; 16 | PalPlatformLowerKernelArgumentsPass& operator=(const PalPlatformLowerKernelArgumentsPass& other) = default; 17 | PalPlatformLowerKernelArgumentsPass(PalPlatformLowerKernelArgumentsPass&& other) = default; 18 | PalPlatformLowerKernelArgumentsPass& operator=(PalPlatformLowerKernelArgumentsPass&& other) = default; 19 | ~PalPlatformLowerKernelArgumentsPass() = default; 20 | 21 | llvm::PreservedAnalyses run(llvm::Function& F, llvm::FunctionAnalysisManager& FAM); 22 | }; 23 | 24 | #endif // PAL_PLATFORM_LOWER_KERNEL_ARGUMENTS_H -------------------------------------------------------------------------------- /src/pal/pal_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef PAL_UTILS_H 2 | #define PAL_UTILS_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace pal_utils { 15 | 16 | std::string llvm_diagnostic_to_string(const llvm::SMDiagnostic& diagnostic_err); 17 | 18 | struct ShaderSrc { 19 | const std::string kernelname; 20 | const std::string src_code; 21 | const std::string filename; 22 | const llvm::Function* function; 23 | llvm::LLVMContext llvm_context; 24 | std::unique_ptr llvm_module; 25 | llvm::SMDiagnostic diagnostic_err; 26 | 27 | ShaderSrc(const std::string& filename, const std::string& src_code, const std::string& kernelname); 28 | bool rename_entry_point(); 29 | }; 30 | 31 | // Create the metadata that PAL expects to be attached to a kernel/shader binary. 
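// The result is a MessagePack document (PAL code-object metadata) that records,
// among other things, the thread group dimensions and wavefront size passed in below.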
32 | llvm::msgpack::Document build_metadata(const ShaderSrc& shader_src, Pal::GfxIpLevel gfx_level, 33 | const std::array& thread_group_dimensions, uint32_t wavefront_size); 34 | 35 | const char* get_gpu_name(const Pal::AsicRevision asic_revision); 36 | 37 | const char* get_gfx_isa_id(const Pal::GfxIpLevel gfxip_level); 38 | 39 | bool isAMDGPUEntryFunctionCC(llvm::CallingConv::ID CC); 40 | 41 | void write_to_memory( 42 | Pal::IGpuMemory* dst_memory, int64_t dst_memory_offset, const void* src_data, int64_t size); 43 | void read_from_memory(void* dst_buffer, Pal::IGpuMemory* src_memory, int64_t src_memory_offset, int64_t size); 44 | 45 | // Returns a gpu-local memory heap that fits memory_size. 46 | // Order of importance: 1.GpuHeapInvisible, 2.GpuHeapLocal 47 | // Returns Pal::GpuHeap::GpuHeapCount if no appropriate heap can be found. 48 | Pal::GpuHeap find_gpu_local_heap(const Pal::IDevice* device, Pal::gpusize memory_size); 49 | 50 | bool allocation_is_host_visible(Pal::IGpuMemory* gpu_allocation); 51 | 52 | llvm::MDNode* get_metadata_mdnode(const llvm::Function* func, const char* key, int index = 0); 53 | llvm::StringRef get_metadata_string(const llvm::Function* func, const char* key); 54 | uint64_t get_metadata_uint(const llvm::Function* func, const char* key, int index = 0); 55 | 56 | extern const char* ComputeShaderMainFnName; 57 | 58 | } // namespace pal_utils 59 | 60 | #define CHECK_PAL(err, name) { if (err != Pal::Result::Success) { error("PAL API function % [file %, line %]: %", name, __FILE__, __LINE__, static_cast(err)); } } 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /src/pal_platform.h: -------------------------------------------------------------------------------- 1 | #ifndef PAL_PLATFORM_H 2 | #define PAL_PLATFORM_H 3 | 4 | #include "pal/pal_device.h" 5 | #include "pal/pal_utils.h" 6 | #include "platform.h" 7 | #include "runtime.h" 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #ifdef AnyDSL_runtime_HAS_LLVM_SUPPORT 22 | #include 23 | #endif 24 | 25 | class PALPlatform : public Platform { 26 | public: 27 | PALPlatform(Runtime* runtime); 28 | ~PALPlatform(); 29 | 30 | protected: 31 | void* alloc(DeviceId dev, int64_t size) override; 32 | void* alloc_host(DeviceId dev, int64_t size) override; 33 | void* alloc_unified(DeviceId dev, int64_t size) override; 34 | void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); } 35 | void release(DeviceId dev, void* ptr) override; 36 | void release_host(DeviceId dev, void* ptr) override; 37 | 38 | void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override; 39 | 40 | void synchronize(DeviceId dev) override; 41 | 42 | void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, 43 | int64_t size) override; 44 | void copy_from_host( 45 | const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override; 46 | void copy_to_host( 47 | DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override; 48 | 49 | size_t dev_count() const override { return devices_.size(); } 50 | std::string name() const override { return "PAL"; } 51 | const char* device_name(DeviceId dev) const override; 52 | bool device_check_feature_support(DeviceId, const char*) const override { return false; } 53 | 54 | Pal::IPipeline* load_kernel(DeviceId 
dev, const std::string& filename, const std::string& kernelname); 55 | std::string compile_gcn(DeviceId dev, pal_utils::ShaderSrc&& shader_src) const; 56 | std::string emit_gcn(pal_utils::ShaderSrc&& shader_src, const std::string& cpu, 57 | Pal::GfxIpLevel gfx_level, llvm::OptimizationLevel opt) const; 58 | 59 | protected: 60 | Pal::IPlatform* platform_; 61 | std::vector devices_; 62 | }; 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /src/platform.h: -------------------------------------------------------------------------------- 1 | #ifndef PLATFORM_H 2 | #define PLATFORM_H 3 | 4 | #include "anydsl_runtime_config.h" 5 | #include "log.h" 6 | #include "runtime.h" 7 | 8 | #include 9 | #include 10 | 11 | void register_cpu_platform(Runtime*); 12 | void register_cuda_platform(Runtime*); 13 | void register_opencl_platform(Runtime*); 14 | void register_hsa_platform(Runtime*); 15 | void register_pal_platform(Runtime*); 16 | void register_levelzero_platform(Runtime*); 17 | 18 | /// A runtime platform. Exposes a set of devices, a copy function, 19 | /// and functions to allocate and release memory. 20 | class Platform { 21 | public: 22 | Platform(Runtime* runtime) 23 | : runtime_(runtime) 24 | {} 25 | 26 | virtual ~Platform() {} 27 | 28 | /// Allocates memory for a device on this platform. 29 | virtual void* alloc(DeviceId dev, int64_t size) = 0; 30 | /// Allocates page-locked host memory for a platform (and a device). 31 | virtual void* alloc_host(DeviceId dev, int64_t size) = 0; 32 | /// Allocates unified memory for a platform (and a device). 33 | virtual void* alloc_unified(DeviceId dev, int64_t size) = 0; 34 | /// Returns the device memory associated with the page-locked memory. 35 | virtual void* get_device_ptr(DeviceId dev, void* ptr) = 0; 36 | /// Releases memory for a device on this platform. 37 | virtual void release(DeviceId dev, void* ptr) = 0; 38 | /// Releases page-locked host memory for a device on this platform. 39 | virtual void release_host(DeviceId dev, void* ptr) = 0; 40 | 41 | /// Launches a kernel with the given block/grid size and arguments. 42 | virtual void launch_kernel(DeviceId dev, const LaunchParams& launch_params) = 0; 43 | /// Waits for the completion of all the launched kernels on the given device. 44 | virtual void synchronize(DeviceId dev) = 0; 45 | 46 | /// Copies memory. Copy can only be performed devices in the same platform. 47 | virtual void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) = 0; 48 | /// Copies memory from the host (CPU). 49 | virtual void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) = 0; 50 | /// Copies memory to the host (CPU). 51 | virtual void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) = 0; 52 | 53 | /// Returns the platform name. 54 | virtual std::string name() const = 0; 55 | /// Returns the number of devices in this platform. 56 | virtual size_t dev_count() const = 0; 57 | /// Returns the name of the given device. 58 | virtual const char* device_name(DeviceId dev) const = 0; 59 | /// Checks whether the given platform-specific feature is supported on the given device. 
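/// Feature names are platform-specific strings; platforms without queryable
/// features simply return false (see e.g. the PAL platform above).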
60 | virtual bool device_check_feature_support(DeviceId dev, const char* feature) const = 0; 61 | 62 | protected: 63 | [[noreturn]] void platform_error() { 64 | error("The selected '%' platform is not available", name()); 65 | } 66 | 67 | [[noreturn]] void command_unavailable(const std::string& command) { 68 | error("The command '%' is unavailable on platform '%'", command, name()); 69 | } 70 | 71 | Runtime* runtime_; 72 | }; 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /src/runtime.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "anydsl_runtime.h" 5 | 6 | #include "runtime.h" 7 | #include "platform.h" 8 | #include "dummy_platform.h" 9 | #include "cpu_platform.h" 10 | 11 | #ifndef AnyDSL_runtime_HAS_CUDA_SUPPORT 12 | void register_cuda_platform(Runtime* runtime) { runtime->register_platform("CUDA"); } 13 | #endif 14 | #ifndef AnyDSL_runtime_HAS_OPENCL_SUPPORT 15 | void register_opencl_platform(Runtime* runtime) { runtime->register_platform("OpenCL"); } 16 | #endif 17 | #ifndef AnyDSL_runtime_HAS_HSA_SUPPORT 18 | void register_hsa_platform(Runtime* runtime) { runtime->register_platform("HSA"); } 19 | #endif 20 | #ifndef AnyDSL_runtime_HAS_PAL_SUPPORT 21 | void register_pal_platform(Runtime* runtime) { runtime->register_platform("PAL"); } 22 | #endif 23 | #ifndef AnyDSL_runtime_HAS_LEVELZERO_SUPPORT 24 | void register_levelzero_platform(Runtime* runtime) { runtime->register_platform("Level Zero"); } 25 | #endif 26 | 27 | Runtime::Runtime(std::pair profile) 28 | : profile_(profile) 29 | , cache_dir_("") 30 | {} 31 | 32 | void Runtime::display_info() const { 33 | info("Available platforms:"); 34 | for (auto& p: platforms_) { 35 | info(" * %: % device(s)", p->name(), p->dev_count()); 36 | for (size_t d=0; ddev_count(); ++d) 37 | info(" + (%) %", d, p->device_name(DeviceId(d))); 38 | } 39 | } 40 | 41 | const char* Runtime::device_name(PlatformId plat, DeviceId dev) const { 42 | check_device(plat, dev); 43 | return platforms_[plat]->device_name(dev); 44 | } 45 | 46 | bool Runtime::device_check_feature_support(PlatformId plat, DeviceId dev, const char* feature) const { 47 | check_device(plat, dev); 48 | return platforms_[plat]->device_check_feature_support(dev, feature); 49 | } 50 | 51 | void* Runtime::alloc(PlatformId plat, DeviceId dev, int64_t size) { 52 | check_device(plat, dev); 53 | return platforms_[plat]->alloc(dev, size); 54 | } 55 | 56 | void* Runtime::alloc_host(PlatformId plat, DeviceId dev, int64_t size) { 57 | check_device(plat, dev); 58 | return platforms_[plat]->alloc_host(dev, size); 59 | } 60 | 61 | void* Runtime::alloc_unified(PlatformId plat, DeviceId dev, int64_t size) { 62 | check_device(plat, dev); 63 | return platforms_[plat]->alloc_unified(dev, size); 64 | } 65 | 66 | void* Runtime::get_device_ptr(PlatformId plat, DeviceId dev, void* ptr) { 67 | check_device(plat, dev); 68 | return platforms_[plat]->get_device_ptr(dev, ptr); 69 | } 70 | 71 | void Runtime::release(PlatformId plat, DeviceId dev, void* ptr) { 72 | check_device(plat, dev); 73 | platforms_[plat]->release(dev, ptr); 74 | } 75 | 76 | void Runtime::release_host(PlatformId plat, DeviceId dev, void* ptr) { 77 | check_device(plat, dev); 78 | platforms_[plat]->release_host(dev, ptr); 79 | } 80 | 81 | void Runtime::copy( 82 | PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src, 83 | PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t 
size) {
84 | check_device(plat_src, dev_src);
85 | check_device(plat_dst, dev_dst);
86 | if (plat_src == plat_dst) {
87 | // Copy from same platform
88 | platforms_[plat_src]->copy(dev_src, src, offset_src, dev_dst, dst, offset_dst, size);
89 | debug("Copy between devices % and % on platform %", dev_src, dev_dst, plat_src);
90 | } else {
91 | // Copy from another platform
92 | if (plat_src == 0) {
93 | // Source is the CPU platform
94 | platforms_[plat_dst]->copy_from_host(src, offset_src, dev_dst, dst, offset_dst, size);
95 | debug("Copy from host to device % on platform %", dev_dst, plat_dst);
96 | } else if (plat_dst == 0) {
97 | // Destination is the CPU platform
98 | platforms_[plat_src]->copy_to_host(dev_src, src, offset_src, dst, offset_dst, size);
99 | debug("Copy to host from device % on platform %", dev_src, plat_src);
100 | } else {
101 | error("Cannot copy memory between different platforms");
102 | }
103 | }
104 | }
105 |
106 | void Runtime::launch_kernel(PlatformId plat, DeviceId dev, const LaunchParams& launch_params) {
107 | check_device(plat, dev);
108 | assert(launch_params.grid[0] > 0 && launch_params.grid[0] % launch_params.block[0] == 0 &&
109 | launch_params.grid[1] > 0 && launch_params.grid[1] % launch_params.block[1] == 0 &&
110 | launch_params.grid[2] > 0 && launch_params.grid[2] % launch_params.block[2] == 0 &&
111 | "The grid size is not a multiple of the block size");
112 | platforms_[plat]->launch_kernel(dev, launch_params);
113 | }
114 |
115 | void Runtime::synchronize(PlatformId plat, DeviceId dev) {
116 | check_device(plat, dev);
117 | platforms_[plat]->synchronize(dev);
118 | }
119 |
120 | #ifdef _WIN32
121 | #include <direct.h>
122 | #define PATH_DIR_SEPARATOR '\\'
123 | #define create_directory(d) _mkdir(d)
124 | #else
125 | #include <sys/stat.h>
126 | #include <sys/types.h>
127 | #include <unistd.h>
128 | #define PATH_DIR_SEPARATOR '/'
129 | #define create_directory(d) { umask(0); mkdir(d, 0755); }
130 | #endif
131 |
132 | #if _XOPEN_SOURCE >= 500 || _POSIX_C_SOURCE >= 200112L || /* Glibc versions <= 2.19: */ _BSD_SOURCE
133 | static std::string get_self_directory() {
134 | char path[PATH_MAX];
135 | ssize_t len = readlink("/proc/self/exe", path, sizeof(path)-1);
136 | if (len != -1) {
137 | path[len] = '\0';
138 |
139 | for (int i = len-1; i >= 0; --i) {
140 | if (path[i] == PATH_DIR_SEPARATOR)
141 | return std::string(&path[0], i);
142 | }
143 | }
144 | return std::string();
145 | }
146 | #elif defined(__APPLE__)
147 | #include <mach-o/dyld.h>
148 | static std::string get_self_directory() {
149 | char path[PATH_MAX];
150 | uint32_t size = (uint32_t)sizeof(path);
151 | if (_NSGetExecutablePath(path, &size) == 0) {
152 | char resolved[PATH_MAX];
153 | if (realpath(path, resolved)) {
154 | std::string resolved_path = std::string(resolved);
155 | for (int i = resolved_path.size()-1; i >= 0; --i) {
156 | if (resolved_path[i] == PATH_DIR_SEPARATOR)
157 | return std::string(resolved_path, 0, i);
158 | }
159 | }
160 | }
161 | return std::string();
162 | }
163 | #elif defined(_WIN32)
164 | #include <windows.h>
165 | static std::string get_self_directory() {
166 | CHAR path[MAX_PATH];
167 | DWORD nSize = (DWORD)sizeof(path);
168 | DWORD length = GetModuleFileNameA(NULL, path, nSize);
169 | if ((length == 0) || (length == MAX_PATH))
170 | return std::string();
171 |
172 | std::string resolved_path(path);
173 | for (int i = (int)resolved_path.size() - 1; i >= 0; --i) {
174 | if (resolved_path[i] == PATH_DIR_SEPARATOR)
175 | return std::string(resolved_path, 0, i);
176 | }
177 |
178 | return std::string();
179 | }
180 | #else
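// Fallback: no known way to query the executable's path on this system.
// The empty result makes get_cache_directory() resolve relative to the
// current working directory.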
static std::string get_self_directory() { 182 | return std::string(); 183 | } 184 | #endif 185 | 186 | void Runtime::set_cache_directory(const std::string& dir) { 187 | cache_dir_ = dir; 188 | } 189 | 190 | std::string Runtime::get_cache_directory() const { 191 | if (cache_dir_.empty()) { 192 | std::string cache_path = get_self_directory(); 193 | if (!cache_path.empty()) 194 | cache_path += PATH_DIR_SEPARATOR; 195 | return cache_path + "cache"; 196 | } else { 197 | return cache_dir_; 198 | } 199 | } 200 | 201 | std::string Runtime::get_cached_filename(const std::string& str, const std::string& ext) const { 202 | size_t key = std::hash{}(str); 203 | std::stringstream hex_stream; 204 | hex_stream << std::hex << key; 205 | return get_cache_directory() + PATH_DIR_SEPARATOR + hex_stream.str() + ext; 206 | } 207 | 208 | inline std::string read_stream(std::istream& stream) { 209 | return std::string(std::istreambuf_iterator(stream), std::istreambuf_iterator()); 210 | } 211 | 212 | std::string Runtime::load_file(const std::string& filename) const { 213 | auto file_it = files_.find(filename); 214 | if (file_it != files_.end()) 215 | return file_it->second; 216 | 217 | std::ifstream src_file(filename); 218 | if (!src_file) 219 | error("Can't open source file '%'", filename); 220 | return read_stream(src_file); 221 | } 222 | 223 | void Runtime::store_file(const std::string& filename, const std::string& str) const { 224 | store_file(filename, reinterpret_cast(str.data()), str.length()); 225 | } 226 | 227 | void Runtime::store_file(const std::string& filename, const std::byte* data, size_t size) const { 228 | std::ofstream dst_file(filename, std::ofstream::binary); 229 | if (!dst_file) 230 | error("Can't open destination file '%'", filename); 231 | dst_file.write(reinterpret_cast(data), size); 232 | } 233 | 234 | std::string Runtime::load_from_cache(const std::string& key, const std::string& ext) const { 235 | std::string filename = get_cached_filename(key, ext); 236 | std::ifstream src_file(filename, std::ifstream::binary); 237 | if (!src_file.is_open()) 238 | return std::string(); 239 | // prevent collision by storing the key in the cached file 240 | size_t size = 0; 241 | if (!src_file.read(reinterpret_cast(&size), sizeof(size_t))) 242 | return std::string(); 243 | auto buf = std::make_unique(size); 244 | if (!src_file.read(buf.get(), size)) 245 | return std::string(); 246 | if (std::memcmp(key.data(), buf.get(), size)) 247 | return std::string(); 248 | debug("Loading from cache: %", filename); 249 | return read_stream(src_file); 250 | } 251 | 252 | void Runtime::store_to_cache(const std::string& key, const std::string& str, const std::string ext) const { 253 | std::string filename = get_cached_filename(key, ext); 254 | create_directory(get_cache_directory().c_str()); 255 | debug("Storing to cache: %", filename); 256 | std::ofstream dst_file(filename, std::ofstream::binary); 257 | size_t size = key.size(); 258 | dst_file.write(reinterpret_cast(&size), sizeof(size_t)); 259 | dst_file.write(key.data(), size); 260 | dst_file.write(str.data(), str.size()); 261 | } 262 | 263 | #if _POSIX_VERSION >= 200112L || _XOPEN_SOURCE >= 600 264 | void* Runtime::aligned_malloc(size_t size, size_t alignment) { 265 | void* p = nullptr; 266 | posix_memalign(&p, alignment, size); 267 | return p; 268 | } 269 | void Runtime::aligned_free(void* ptr) { 270 | free(ptr); 271 | } 272 | #elif _ISOC11_SOURCE 273 | void* Runtime::aligned_malloc(size_t size, size_t alignment) { 274 | return ::aligned_alloc(alignment, size); 275 | 
} 276 | void Runtime::aligned_free(void* ptr) { 277 | ::free(ptr); 278 | } 279 | #elif defined(_WIN32) || defined(__CYGWIN__) 280 | #include 281 | void* Runtime::aligned_malloc(size_t size, size_t alignment) { 282 | return ::_aligned_malloc(size, alignment); 283 | } 284 | void Runtime::aligned_free(void* ptr) { 285 | ::_aligned_free(ptr); 286 | } 287 | #else 288 | #error "There is no way to allocate aligned memory on this system" 289 | #endif 290 | 291 | void Runtime::check_device(PlatformId plat, DeviceId dev) const { 292 | assert((size_t)dev < platforms_[plat]->dev_count() && "Invalid device"); 293 | unused(plat, dev); 294 | } 295 | -------------------------------------------------------------------------------- /src/runtime.h: -------------------------------------------------------------------------------- 1 | #ifndef RUNTIME_H 2 | #define RUNTIME_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "log.h" 15 | 16 | enum DeviceId : uint32_t {}; 17 | enum PlatformId : uint32_t {}; 18 | enum class ProfileLevel : uint8_t { None = 0, Full, Fpga_dynamic }; 19 | 20 | class Platform; 21 | 22 | enum class KernelArgType : uint8_t { Val = 0, Ptr, Struct }; 23 | 24 | struct ParamsArgs { 25 | void** data; 26 | const uint32_t* sizes; 27 | const uint32_t* aligns; 28 | const uint32_t* alloc_sizes; 29 | const KernelArgType* types; 30 | }; 31 | 32 | /// The parameters to a `anydsl_launch_kernel()` call. 33 | struct LaunchParams { 34 | const char* file_name; 35 | const char* kernel_name; 36 | const uint32_t* grid; 37 | const uint32_t* block; 38 | ParamsArgs args; 39 | uint32_t num_args; 40 | }; 41 | 42 | class Runtime { 43 | public: 44 | Runtime(std::pair); 45 | 46 | /// Registers the given platform into the runtime. 47 | template 48 | void register_platform(Args&&... args) { 49 | platforms_.emplace_back(new T(this, std::forward(args)...)); 50 | } 51 | 52 | /// Displays available platforms. 53 | void display_info() const; 54 | 55 | /// Returns name of device. 56 | const char* device_name(PlatformId, DeviceId) const; 57 | /// Checks whether feature is supported on device. 58 | bool device_check_feature_support(PlatformId, DeviceId, const char*) const; 59 | 60 | /// Allocates memory on the given device. 61 | void* alloc(PlatformId plat, DeviceId dev, int64_t size); 62 | /// Allocates page-locked memory on the given platform and device. 63 | void* alloc_host(PlatformId plat, DeviceId dev, int64_t size); 64 | /// Allocates unified memory on the given platform and device. 65 | void* alloc_unified(PlatformId plat, DeviceId dev, int64_t size); 66 | /// Returns the device memory associated with the page-locked memory. 67 | void* get_device_ptr(PlatformId plat, DeviceId dev, void* ptr); 68 | /// Releases memory. 69 | void release(PlatformId plat, DeviceId dev, void* ptr); 70 | /// Releases previously allocated page-locked memory. 71 | void release_host(PlatformId plat, DeviceId dev, void* ptr); 72 | /// Copies memory between devices. 73 | void copy( 74 | PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src, 75 | PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size); 76 | 77 | /// Launches a kernel on the platform and device. 78 | void launch_kernel(PlatformId plat, DeviceId dev, const LaunchParams& launch_params); 79 | /// Waits for the completion of all kernels on the given platform and device. 
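/// Blocks the calling host thread until all outstanding work on the device has finished.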
80 | void synchronize(PlatformId plat, DeviceId dev);
81 |
82 | /// Associates a program string with a given filename.
83 | void register_file(const std::string& filename, const std::string& program_string) {
84 | files_[filename] = program_string;
85 | }
86 |
87 | std::string load_file(const std::string& filename) const;
88 | void store_file(const std::string& filename, const std::string& str) const;
89 | void store_file(const std::string& filename, const std::byte* data, size_t size) const;
90 |
91 | /// Sets an optional directory for generated cache data. If not specified or empty, an internal directory is used. The user has to make sure the directory exists.
92 | void set_cache_directory(const std::string& dir);
93 | std::string get_cache_directory() const;
94 |
95 | std::string load_from_cache(const std::string& str, const std::string& ext=".bin") const;
96 | void store_to_cache(const std::string& key, const std::string& str, const std::string ext=".bin") const;
97 |
98 | bool profiling_enabled() { return profile_.first == ProfileLevel::Full; }
99 | bool dynamic_profiling_enabled() { return profile_.second == ProfileLevel::Fpga_dynamic; }
100 | std::atomic<uint64_t>& kernel_time() { return kernel_time_; }
101 |
102 | static void* aligned_malloc(size_t, size_t);
103 | static void aligned_free(void*);
104 |
105 | private:
106 | void check_device(PlatformId, DeviceId) const;
107 | std::string get_cached_filename(const std::string& str, const std::string& ext) const;
108 |
109 | std::pair<ProfileLevel, ProfileLevel> profile_;
110 | std::atomic<uint64_t> kernel_time_;
111 | std::vector<std::unique_ptr<Platform>> platforms_;
112 | std::unordered_map<std::string, std::string> files_;
113 | std::string cache_dir_;
114 | };
115 |
116 | #endif
117 |
--------------------------------------------------------------------------------