├── .gitignore
├── CMakeLists.txt
├── CONTRIBUTORS.txt
├── LICENSE.TXT
├── README.md
├── cmake
│   ├── anydsl_runtime-config.cmake.in
│   ├── build_xilinx_hls.cmake.in
│   ├── check_nvvmir.cmake
│   └── modules
│       ├── FindLevelZero.cmake
│       └── FindXHLS.cmake
├── platforms
│   ├── artic
│   │   ├── intrinsics.impala
│   │   ├── intrinsics_amdgpu.impala
│   │   ├── intrinsics_cpu.impala
│   │   ├── intrinsics_cuda.impala
│   │   ├── intrinsics_hls.impala
│   │   ├── intrinsics_levelzero.impala
│   │   ├── intrinsics_math.impala
│   │   ├── intrinsics_nvvm.impala
│   │   ├── intrinsics_opencl.impala
│   │   ├── intrinsics_rv.impala
│   │   ├── intrinsics_spirv.impala
│   │   ├── intrinsics_thorin.impala
│   │   ├── intrinsics_wmma.impala
│   │   └── runtime.impala
│   └── impala
│       ├── intrinsics.impala
│       ├── intrinsics_amdgpu.impala
│       ├── intrinsics_cpu.impala
│       ├── intrinsics_cuda.impala
│       ├── intrinsics_hls.impala
│       ├── intrinsics_nvvm.impala
│       ├── intrinsics_opencl.impala
│       ├── intrinsics_rv.impala
│       ├── intrinsics_thorin.impala
│       └── runtime.impala
├── post-patcher.py
└── src
    ├── CMakeLists.txt
    ├── anydsl_jit.h
    ├── anydsl_runtime.cpp
    ├── anydsl_runtime.h
    ├── anydsl_runtime.hpp
    ├── anydsl_runtime_config.h.in
    ├── cpu_platform.cpp
    ├── cpu_platform.h
    ├── cuda_platform.cpp
    ├── cuda_platform.h
    ├── dummy_platform.h
    ├── extract_runtime_srcs.py
    ├── hsa_platform.cpp
    ├── hsa_platform.h
    ├── jit.cpp
    ├── levelzero_platform.cpp
    ├── levelzero_platform.h
    ├── log.h
    ├── opencl_platform.cpp
    ├── opencl_platform.h
    ├── pal
    │   ├── pal_device.cpp
    │   ├── pal_device.h
    │   ├── pal_fix_calling_convention_pass.cpp
    │   ├── pal_fix_calling_convention_pass.h
    │   ├── pal_insert_halt_pass.cpp
    │   ├── pal_insert_halt_pass.h
    │   ├── pal_lower_builtins_pass.cpp
    │   ├── pal_lower_builtins_pass.h
    │   ├── pal_lower_kernel_arguments_pass.cpp
    │   ├── pal_lower_kernel_arguments_pass.h
    │   ├── pal_utils.cpp
    │   └── pal_utils.h
    ├── pal_platform.cpp
    ├── pal_platform.h
    ├── platform.h
    ├── runtime.cpp
    └── runtime.h
/.gitignore:
--------------------------------------------------------------------------------
1 | build*
2 | 
3 | .vscode
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR)
2 | 
3 | project(AnyDSL-runtime)
4 | 
5 | set(PACKAGE_VERSION "0.3.9")
6 | #set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "limited config" FORCE)
7 | set(CMAKE_CXX_STANDARD 17)
8 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
9 | 
10 | option(BUILD_SHARED_LIBS "Build shared libraries" ON)
11 | option(RUNTIME_JIT "enable jit support in the runtime" OFF)
12 | option(DEBUG_OUTPUT "enable debug output" OFF)
13 | 
14 | if(CMAKE_BUILD_TYPE STREQUAL "")
15 | set(CMAKE_BUILD_TYPE Debug CACHE STRING "Debug or Release" FORCE)
16 | endif()
17 | 
18 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules)
19 | 
20 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
21 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
22 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
23 | 
24 | find_path(Artic_DIR NAMES artic-config.cmake PATHS ${Artic_DIR} $ENV{Artic_DIR} ${CMAKE_BINARY_DIR}/share/anydsl/cmake)
25 | find_path(Impala_DIR NAMES impala-config.cmake PATHS ${Impala_DIR} $ENV{Impala_DIR} ${CMAKE_BINARY_DIR}/share/anydsl/cmake)
26 | 
27 | set(AnyDSL_runtime_ENABLE_DEBUG_OUTPUT ${DEBUG_OUTPUT})
28 | set(AnyDSL_runtime_TARGET_NAME runtime CACHE STRING "Name of the cmake target for the AnyDSL runtime")
29 | mark_as_advanced(AnyDSL_runtime_TARGET_NAME)
30 | 
31 | add_subdirectory(src)
32 | 
33 | 
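# Example configure invocation (a sketch only, not part of the build logic;
# the path below is a placeholder -- RUNTIME_JIT, Artic_DIR, and
# CMAKE_BUILD_TYPE are the options and cache variables handled above):
#   cmake -DRUNTIME_JIT=ON \
#         -DArtic_DIR=/path/to/artic/build/share/anydsl/cmake \
#         -DCMAKE_BUILD_TYPE=Release ..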
message(STATUS "Using Debug flags: ${CMAKE_CXX_FLAGS_DEBUG}") 34 | message(STATUS "Using Release flags: ${CMAKE_CXX_FLAGS_RELEASE}") 35 | if(DEFINED CMAKE_BUILD_TYPE) 36 | message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") 37 | endif() 38 | 39 | export(TARGETS ${RUNTIME_LIBRARIES} FILE ${CMAKE_BINARY_DIR}/share/anydsl/cmake/anydsl_runtime-exports.cmake) 40 | configure_file(cmake/anydsl_runtime-config.cmake.in ${CMAKE_BINARY_DIR}/share/anydsl/cmake/anydsl_runtime-config.cmake @ONLY) 41 | configure_file(cmake/build_xilinx_hls.cmake.in ${CMAKE_BINARY_DIR}/share/anydsl/cmake/build_xilinx_hls.cmake @ONLY) 42 | -------------------------------------------------------------------------------- /CONTRIBUTORS.txt: -------------------------------------------------------------------------------- 1 | # This is the official list of contributing authors in the AnyDSL runtime project for copyright purposes. 2 | 3 | # Name (GitHub Handle), Affiliation(s) 4 | Puya Amiri (pooyaww), DFKI 5 | Hugo Devillers (Hugobros3), Saarland University 6 | Pascal Grittmann (pgrit), Saarland University 7 | Ralf Jung (RalfJung), Saarland University 8 | Michael Kenzel (michael-kenzel), DFKI 9 | Marcel Köster (m4rs-mt), Saarland University 10 | Matthis Kruse (DasNaCl), Saarland University 11 | Matthias Kurtenacker (m-kurtenacker), DFKI 12 | Roland Leißa (leissa), Saarland University 13 | Stefan Lemme (stlemme), Saarland University / DFKI 14 | Richard Membarth (richardmembarth), Saarland University / DFKI / Technische Hochschule Ingolstadt 15 | Simon Moll (simoll), Saarland University 16 | Arsène Pérard-Gayot (madmann91), Saarland University 17 | Akif Özkan (akifoezkan), Friedrich-Alexander-University Erlangen-Nuremberg 18 | Alexander Rath (iRath96), DFKI 19 | Till Speicher (tillspeicher), Saarland University 20 | Fabian Wildgrube (FabianWildgrube), Advanced Micro Devices Inc. 21 | Ömercan Yazici (PearCoding), Saarland University 22 | -------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 
31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AnyDSL Runtime Library
2 | The runtime for the AnyDSL framework and its two frontends [artic](https://github.com/AnyDSL/artic) and [impala](https://github.com/AnyDSL/impala).
3 | 
4 | The runtime provides the following components:
5 | - CMake logic to build programs using artic or impala
6 | - include files for basic runtime abstractions and math functions
7 | - runtime library implementation to schedule and execute AnyDSL programs on different platforms
8 |   + Host (CPU): standard platform for code
9 |   + TBB / C++11 threads: code emitted by `parallel`
10 |   + LLVM w/ RV support: code emitted by `vectorize`
11 |   + CUDA: code emitted by `cuda` or `nvvm`
12 |   + OpenCL: code emitted by `opencl`
13 |   + HSA: code emitted by `amdgpu`
14 | 
15 | CMake automatically searches for available components on the current system.
16 | To prevent CMake from building a particular runtime component, disable it using CMake's `CMAKE_DISABLE_FIND_PACKAGE_` variable.
17 | For example, pass `-DCMAKE_DISABLE_FIND_PACKAGE_OpenCL=TRUE` to cmake to disable the OpenCL runtime component.
18 | 
19 | Although not required, feel free to specify `Artic_DIR` or `Impala_DIR` so that the correct paths are found automatically later when building AnyDSL programs with the `anydsl_runtime_wrap()` function.
20 | 
21 | To enable JIT support, please pass `-DRUNTIME_JIT=ON` to cmake.
22 | This requires at least one of artic or impala as a dependency and will thereby also locate LLVM as well as [thorin](https://github.com/AnyDSL/thorin).
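For illustration, a downstream project's `CMakeLists.txt` might look roughly like the following sketch. The package name matches the generated `anydsl_runtime-config.cmake`; the exact `anydsl_runtime_wrap()` arguments and the linked target name are assumptions that depend on your setup:

```cmake
# Hypothetical consumer project; paths, wrap arguments, and target name are assumptions.
find_package(AnyDSL_runtime REQUIRED
    PATHS /path/to/runtime/build/share/anydsl/cmake)

# Assumed anydsl_runtime_wrap() usage: compile artic sources into linkable objects.
anydsl_runtime_wrap(HELLO_OBJS FILES hello.art)

add_executable(hello ${HELLO_OBJS})
target_link_libraries(hello runtime) # "runtime" is the default AnyDSL_runtime_TARGET_NAME
```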
23 | 
--------------------------------------------------------------------------------
/cmake/check_nvvmir.cmake:
--------------------------------------------------------------------------------
1 | if(EXISTS ${_basename}.nvvm)
2 | execute_process(COMMAND ${LLVM_AS_BIN} ${_basename}.nvvm)
3 | endif()
4 | 
--------------------------------------------------------------------------------
/cmake/modules/FindLevelZero.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2019 Intel Corporation
2 | # SPDX-License-Identifier: MIT
3 | include(FindPackageHandleStandardArgs)
4 | 
5 | find_path(LevelZero_INCLUDE_DIR
6 | NAMES level_zero/ze_api.h
7 | )
8 | 
9 | find_library(LevelZero_LIBRARY
10 | NAMES ze_loader ze_loader32 ze_loader64
11 | )
12 | 
13 | find_package_handle_standard_args(LevelZero
14 | REQUIRED_VARS
15 | LevelZero_INCLUDE_DIR
16 | LevelZero_LIBRARY
17 | HANDLE_COMPONENTS
18 | )
19 | mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIR)
20 | 
21 | if(LevelZero_FOUND)
22 | list(APPEND LevelZero_LIBRARIES ${LevelZero_LIBRARY} ${CMAKE_DL_LIBS})
23 | list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
24 | if(OpenCL_FOUND)
25 | list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
26 | endif()
27 | MESSAGE(STATUS "Found Level Zero in " ${LevelZero_INCLUDE_DIR})
28 | endif()
29 | 
30 | if(LevelZero_FOUND AND NOT TARGET LevelZero::LevelZero)
31 | add_library(LevelZero::LevelZero INTERFACE IMPORTED)
32 | set_target_properties(LevelZero::LevelZero
33 | PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
34 | )
35 | set_target_properties(LevelZero::LevelZero
36 | PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
37 | )
38 | endif()
39 | 
40 | # MESSAGE(STATUS "LevelZero_LIBRARIES: " ${LevelZero_LIBRARIES})
41 | # MESSAGE(STATUS "LevelZero_INCLUDE_DIRS: " ${LevelZero_INCLUDE_DIRS})
42 | 
--------------------------------------------------------------------------------
/cmake/modules/FindXHLS.cmake:
--------------------------------------------------------------------------------
1 | # Xilinx Runtime library (XRT) and HLS tools for scripting mode
2 | 
3 | find_path(XILINX_SEARCH_PATH v++ PATHS ENV XILINX_OPENCL ENV XILINX_VITIS PATH_SUFFIXES bin)
4 | get_filename_component(VITIS_ROOT_DIR ${XILINX_SEARCH_PATH} DIRECTORY)
5 | 
6 | find_program(Xilinx_VPP v++ PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
7 | find_program(Xilinx_PLATFORM_INFO platforminfo PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
8 | find_program(Xilinx_KERNEL_INFO kernelinfo PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
9 | find_program(Xilinx_EMU_CONFIG emconfigutil PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
10 | 
11 | 
12 | get_filename_component(VITIS_VERSION "${VITIS_ROOT_DIR}" NAME)
13 | string(REGEX REPLACE "([0-9]+)\\.[0-9]+" "\\1" VITIS_MAJOR_VERSION "${VITIS_VERSION}")
14 | string(REGEX REPLACE "[0-9]+\\.([0-9]+)" "\\1" VITIS_MINOR_VERSION "${VITIS_VERSION}")
15 | set(Vitis_VERSION ${VITIS_VERSION})
16 | set(Vitis_MAJOR_VERSION ${VITIS_MAJOR_VERSION})
17 | set(Vitis_MINOR_VERSION ${VITIS_MINOR_VERSION})
18 | 
19 | find_program(Xilinx_HLS NAMES vitis_hls PATHS ${VITIS_ROOT_DIR}/bin ${VITIS_ROOT_DIR}/../../Vitis_HLS/${Vitis_VERSION}/bin NO_DEFAULT_PATH)
20 | 
21 | find_path(Xilinx_HLS_INCLUDE_DIR hls_stream.h PATHS ${VITIS_ROOT_DIR}/include NO_DEFAULT_PATH)
22 | 
23 | find_path(XRT_SEARCH_PATH libxilinxopencl.so PATHS /opt/xilinx/xrt ENV XILINX_XRT PATH_SUFFIXES lib)
24 | get_filename_component(XILINX_RUNTIME_DIR ${XRT_SEARCH_PATH} DIRECTORY)
25 
| file(GLOB Xilinx_LIBRARIES ${XILINX_RUNTIME_DIR}/lib/libxilinxopencl.so) 26 | 27 | find_path(Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR cl_ext.h PATHS ${XILINX_RUNTIME_DIR}/include PATH_SUFFIXES CL NO_DEFAULT_PATH) 28 | get_filename_component(Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR ${Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR} DIRECTORY) 29 | set(Xilinx_INCLUDE_DIRS ${Xilinx_HLS_INCLUDE_DIR} ${Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR}) 30 | 31 | mark_as_advanced( 32 | XILINX_RUNTIME_DIR 33 | XRT_SEARCH_PATH 34 | XILINX_SEARCH_PATH 35 | Xilinx_HLS 36 | Xilinx_VPP 37 | Xilinx_HLS_INCLUDE_DIR 38 | Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR 39 | Xilinx_PLATFORM_INFO 40 | Xilinx_KERNEL_INFO 41 | Xilinx_EMU_CONFIG 42 | Xilinx_LIBRARIES 43 | Xilinx_INCLUDE_DIRS) 44 | 45 | include(FindPackageHandleStandardArgs) 46 | find_package_handle_standard_args(XHLS DEFAULT_MSG 47 | Xilinx_HLS 48 | Xilinx_VPP 49 | Xilinx_LIBRARIES 50 | Xilinx_INCLUDE_DIRS 51 | Xilinx_PLATFORM_INFO 52 | Xilinx_KERNEL_INFO 53 | Xilinx_EMU_CONFIG 54 | ) 55 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics.impala: -------------------------------------------------------------------------------- 1 | struct WorkItem { 2 | tidx : fn() -> i32, 3 | tidy : fn() -> i32, 4 | tidz : fn() -> i32, 5 | bidx : fn() -> i32, 6 | bidy : fn() -> i32, 7 | bidz : fn() -> i32, 8 | gidx : fn() -> i32, 9 | gidy : fn() -> i32, 10 | gidz : fn() -> i32, 11 | bdimx : fn() -> i32, 12 | bdimy : fn() -> i32, 13 | bdimz : fn() -> i32, 14 | gdimx : fn() -> i32, 15 | gdimy : fn() -> i32, 16 | gdimz : fn() -> i32, 17 | nblkx : fn() -> i32, 18 | nblky : fn() -> i32, 19 | nblkz : fn() -> i32 20 | } 21 | 22 | struct Accelerator { 23 | exec : fn(fn(WorkItem) -> ()) -> fn((i32, i32, i32), (i32, i32, i32)) -> (), // fn(grid, block)->() 24 | sync : fn() -> (), 25 | alloc : fn(i64) -> Buffer, 26 | alloc_unified : fn(i64) -> Buffer, 27 | barrier : fn() -> () 28 | } 29 | 30 | struct Intrinsics { 31 | expf : fn(f32) -> f32, 32 | exp2f : fn(f32) -> f32, 33 | logf : fn(f32) -> f32, 34 | log2f : fn(f32) -> f32, 35 | powf : fn(f32, f32) -> f32, 36 | rsqrtf : fn(f32) -> f32, 37 | sqrtf : fn(f32) -> f32, 38 | fabsf : fn(f32) -> f32, 39 | sinf : fn(f32) -> f32, 40 | cosf : fn(f32) -> f32, 41 | tanf : fn(f32) -> f32, 42 | asinf : fn(f32) -> f32, 43 | acosf : fn(f32) -> f32, 44 | atanf : fn(f32) -> f32, 45 | erff : fn(f32) -> f32, 46 | atan2f : fn(f32, f32) -> f32, 47 | copysignf : fn(f32, f32) -> f32, 48 | fmaf : fn(f32, f32, f32) -> f32, 49 | fmaxf : fn(f32, f32) -> f32, 50 | fminf : fn(f32, f32) -> f32, 51 | fmodf : fn(f32, f32) -> f32, 52 | floorf : fn(f32) -> f32, 53 | isinff : fn(f32) -> i32, 54 | isnanf : fn(f32) -> i32, 55 | isfinitef : fn(f32) -> i32, 56 | exp : fn(f64) -> f64, 57 | exp2 : fn(f64) -> f64, 58 | log : fn(f64) -> f64, 59 | log2 : fn(f64) -> f64, 60 | pow : fn(f64, f64) -> f64, 61 | rsqrt : fn(f64) -> f64, 62 | sqrt : fn(f64) -> f64, 63 | fabs : fn(f64) -> f64, 64 | sin : fn(f64) -> f64, 65 | cos : fn(f64) -> f64, 66 | tan : fn(f64) -> f64, 67 | asin : fn(f64) -> f64, 68 | acos : fn(f64) -> f64, 69 | atan : fn(f64) -> f64, 70 | erf : fn(f64) -> f64, 71 | atan2 : fn(f64, f64) -> f64, 72 | copysign : fn(f64, f64) -> f64, 73 | fma : fn(f64, f64, f64) -> f64, 74 | fmax : fn(f64, f64) -> f64, 75 | fmin : fn(f64, f64) -> f64, 76 | fmod : fn(f64, f64) -> f64, 77 | floor : fn(f64) -> f64, 78 | isinf : fn(f64) -> i32, 79 | isnan : fn(f64) -> i32, 80 | isfinite : fn(f64) -> i32, 81 | min : fn(i32, i32) -> i32, 82 | max : fn(i32, i32) 
-> i32, 83 | } 84 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_cpu.impala: -------------------------------------------------------------------------------- 1 | //#[import(cc = "C", name = "sinf")] fn cpu_sinf(f32) -> f32; 2 | //#[import(cc = "C", name = "cosf")] fn cpu_cosf(f32) -> f32; 3 | #[import(cc = "C", name = "tanf")] fn cpu_tanf(_: f32) -> f32; 4 | #[import(cc = "C", name = "asinf")] fn cpu_asinf(_: f32) -> f32; 5 | #[import(cc = "C", name = "acosf")] fn cpu_acosf(_: f32) -> f32; 6 | #[import(cc = "C", name = "atanf")] fn cpu_atanf(_: f32) -> f32; 7 | #[import(cc = "C", name = "erff")] fn cpu_erff(_: f32) -> f32; 8 | #[import(cc = "C", name = "fmodf")] fn cpu_fmodf(_: f32, _: f32) -> f32; 9 | #[import(cc = "C", name = "atan2f")] fn cpu_atan2f(_: f32, _: f32) -> f32; 10 | #[import(cc = "C", name = "anydsl_isinff")] fn cpu_isinff(_: f32) -> i32; 11 | #[import(cc = "C", name = "anydsl_isnanf")] fn cpu_isnanf(_: f32) -> i32; 12 | #[import(cc = "C", name = "anydsl_isfinitef")] fn cpu_isfinitef(_: f32) -> i32; 13 | //#[import(cc = "C", name = "sin")] fn cpu_sin(f64) -> f64; 14 | //#[import(cc = "C", name = "cos")] fn cpu_cos(f64) -> f64; 15 | #[import(cc = "C", name = "tan")] fn cpu_tan(_: f64) -> f64; 16 | #[import(cc = "C", name = "asin")] fn cpu_asin(_: f64) -> f64; 17 | #[import(cc = "C", name = "acos")] fn cpu_acos(_: f64) -> f64; 18 | #[import(cc = "C", name = "atan")] fn cpu_atan(_: f64) -> f64; 19 | #[import(cc = "C", name = "erf")] fn cpu_erf(_: f64) -> f64; 20 | #[import(cc = "C", name = "fmod")] fn cpu_fmod(_: f64, _: f64) -> f64; 21 | #[import(cc = "C", name = "atan2")] fn cpu_atan2(_: f64, _: f64) -> f64; 22 | #[import(cc = "C", name = "anydsl_isinf")] fn cpu_isinf(_: f64) -> i32; 23 | #[import(cc = "C", name = "anydsl_isnan")] fn cpu_isnan(_: f64) -> i32; 24 | #[import(cc = "C", name = "anydsl_isfinite")] fn cpu_isfinite(_: f64) -> i32; 25 | 26 | #[import(cc = "device", name = "llvm.exp.f32")] fn cpu_expf(_: f32) -> f32; 27 | #[import(cc = "device", name = "llvm.exp2.f32")] fn cpu_exp2f(_: f32) -> f32; 28 | #[import(cc = "device", name = "llvm.log.f32")] fn cpu_logf(_: f32) -> f32; 29 | #[import(cc = "device", name = "llvm.log2.f32")] fn cpu_log2f(_: f32) -> f32; 30 | #[import(cc = "device", name = "llvm.pow.f32")] fn cpu_powf(_: f32, _: f32) -> f32; 31 | #[import(cc = "device", name = "llvm.sqrt.f32")] fn cpu_sqrtf(_: f32) -> f32; 32 | #[import(cc = "device", name = "llvm.fabs.f32")] fn cpu_fabsf(_: f32) -> f32; 33 | #[import(cc = "device", name = "llvm.sin.f32")] fn cpu_sinf(_: f32) -> f32; 34 | #[import(cc = "device", name = "llvm.cos.f32")] fn cpu_cosf(_: f32) -> f32; 35 | #[import(cc = "device", name = "llvm.floor.f32")] fn cpu_floorf(_: f32) -> f32; 36 | #[import(cc = "device", name = "llvm.fma.f32")] fn cpu_fmaf(_: f32, _: f32, _: f32) -> f32; 37 | #[import(cc = "device", name = "llvm.fmuladd.f32")] fn cpu_madf(_: f32, _: f32, _: f32) -> f32; 38 | #[import(cc = "device", name = "llvm.copysign.f32")] fn cpu_copysignf(_: f32, _: f32) -> f32; 39 | #[import(cc = "device", name = "llvm.minnum.f32")] fn cpu_fminf(_: f32, _: f32) -> f32; 40 | #[import(cc = "device", name = "llvm.maxnum.f32")] fn cpu_fmaxf(_: f32, _: f32) -> f32; 41 | #[import(cc = "device", name = "llvm.exp.f64")] fn cpu_exp(_: f64) -> f64; 42 | #[import(cc = "device", name = "llvm.exp2.f64")] fn cpu_exp2(_: f64) -> f64; 43 | #[import(cc = "device", name = "llvm.log.f64")] fn cpu_log(_: f64) -> f64; 44 | #[import(cc = "device", name = 
"llvm.log2.f64")] fn cpu_log2(_: f64) -> f64; 45 | #[import(cc = "device", name = "llvm.pow.f64")] fn cpu_pow(_: f64, _: f64) -> f64; 46 | #[import(cc = "device", name = "llvm.sqrt.f64")] fn cpu_sqrt(_: f64) -> f64; 47 | #[import(cc = "device", name = "llvm.fabs.f64")] fn cpu_fabs(_: f64) -> f64; 48 | #[import(cc = "device", name = "llvm.sin.f64")] fn cpu_sin(_: f64) -> f64; 49 | #[import(cc = "device", name = "llvm.cos.f64")] fn cpu_cos(_: f64) -> f64; 50 | #[import(cc = "device", name = "llvm.floor.f64")] fn cpu_floor(_: f64) -> f64; 51 | #[import(cc = "device", name = "llvm.fma.f64")] fn cpu_fma(_: f64, _: f64, _: f64) -> f64; 52 | #[import(cc = "device", name = "llvm.fmuladd.f64")] fn cpu_mad(_: f64, _: f64, _: f64) -> f64; 53 | #[import(cc = "device", name = "llvm.copysign.f64")] fn cpu_copysign(_: f64, _: f64) -> f64; 54 | #[import(cc = "device", name = "llvm.minnum.f64")] fn cpu_fmin(_: f64, _: f64) -> f64; 55 | #[import(cc = "device", name = "llvm.maxnum.f64")] fn cpu_fmax(_: f64, _: f64) -> f64; 56 | #[import(cc = "device", name = "llvm.ctpop.i32")] fn cpu_popcount32(_: i32) -> i32; 57 | #[import(cc = "device", name = "llvm.ctpop.i64")] fn cpu_popcount64(_: i64) -> i64; 58 | #[import(cc = "device", name = "llvm.ctlz.i32")] fn cpu_clz32(_: i32, _: bool) -> i32; 59 | #[import(cc = "device", name = "llvm.ctlz.i64")] fn cpu_clz64(_: i64, _: bool) -> i64; 60 | #[import(cc = "device", name = "llvm.cttz.i32")] fn cpu_ctz32(_: i32, _: bool) -> i32; 61 | #[import(cc = "device", name = "llvm.cttz.i64")] fn cpu_ctz64(_: i64, _: bool) -> i64; 62 | #[import(cc = "device", name = "llvm.x86.bmi.pext.32")] fn cpu_pext32(_: i32, _: i32) -> i32; 63 | #[import(cc = "device", name = "llvm.x86.bmi.pext.64")] fn cpu_pext64(_: i64, _: i64) -> i64; 64 | #[import(cc = "device", name = "llvm.x86.bmi.pdep.32")] fn cpu_pdep32(_: i32, _: i32) -> i32; 65 | #[import(cc = "device", name = "llvm.x86.bmi.pdep.64")] fn cpu_pdep64(_: i64, _: i64) -> i64; 66 | #[import(cc = "device", name = "llvm.prefetch.p0")] fn cpu_prefetch(&u8, i32, i32, i32) -> (); 67 | 68 | // 69 | // atomics 70 | // 0 1 2 3 4 5 6 7 8 9 10 11 12 71 | // operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub 72 | // 0 1 2 4 5 6 7 73 | // ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent 74 | // syncscope: singlethread "" (system) 75 | // 76 | 77 | fn @cpu_atomic_xchg(a: &mut i32, b: i32) -> i32 = atomic[i32](0, a, b, 7, ""); 78 | fn @cpu_atomic_add(a: &mut i32, b: i32) -> i32 = atomic[i32](1, a, b, 7, ""); 79 | fn @cpu_atomic_sub(a: &mut i32, b: i32) -> i32 = atomic[i32](2, a, b, 7, ""); 80 | fn @cpu_atomic_max(a: &mut i32, b: i32) -> i32 = atomic[i32](7, a, b, 7, ""); 81 | fn @cpu_atomic_min(a: &mut i32, b: i32) -> i32 = atomic[i32](8, a, b, 7, ""); 82 | 83 | static cpu_intrinsics = Intrinsics { 84 | expf = cpu_expf, 85 | exp2f = cpu_exp2f, 86 | logf = cpu_logf, 87 | log2f = cpu_log2f, 88 | powf = cpu_powf, 89 | rsqrtf = @|a| { 1:f32 / cpu_sqrtf(a) }, 90 | sqrtf = cpu_sqrtf, 91 | fabsf = cpu_fabsf, 92 | sinf = cpu_sinf, 93 | cosf = cpu_cosf, 94 | tanf = cpu_tanf, 95 | asinf = cpu_asinf, 96 | acosf = cpu_acosf, 97 | atanf = cpu_atanf, 98 | erff = cpu_erff, 99 | atan2f = cpu_atan2f, 100 | copysignf = cpu_copysignf, 101 | fmaf = cpu_fmaf, 102 | fmaxf = cpu_fmaxf, 103 | fminf = cpu_fminf, 104 | fmodf = cpu_fmodf, 105 | floorf = cpu_floorf, 106 | isinff = cpu_isinff, 107 | isnanf = cpu_isnanf, 108 | isfinitef = cpu_isfinitef, 109 | exp = cpu_exp, 110 | exp2 = cpu_exp2, 111 | log = cpu_log, 112 
| log2 = cpu_log2, 113 | pow = cpu_pow, 114 | rsqrt = @|a| { 1.0 / cpu_sqrt(a) }, 115 | sqrt = cpu_sqrt, 116 | fabs = cpu_fabs, 117 | sin = cpu_sin, 118 | cos = cpu_cos, 119 | tan = cpu_tan, 120 | asin = cpu_asin, 121 | acos = cpu_acos, 122 | atan = cpu_atan, 123 | erf = cpu_erf, 124 | atan2 = cpu_atan2, 125 | copysign = cpu_copysign, 126 | fma = cpu_fma, 127 | fmax = cpu_fmax, 128 | fmin = cpu_fmin, 129 | fmod = cpu_fmod, 130 | floor = cpu_floor, 131 | isinf = cpu_isinf, 132 | isnan = cpu_isnan, 133 | isfinite = cpu_isfinite, 134 | min = @|a, b| { if a < b { a } else { b } }, 135 | max = @|a, b| { if a > b { a } else { b } }, 136 | }; 137 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_hls.impala: -------------------------------------------------------------------------------- 1 | // no declarations are emitted for "device" functions 2 | #[import(cc = "C", name = "exp")] fn hls_expf(f32) -> f32; 3 | #[import(cc = "C", name = "exp2")] fn hls_exp2f(f32) -> f32; 4 | #[import(cc = "C", name = "log")] fn hls_logf(f32) -> f32; 5 | #[import(cc = "C", name = "log2")] fn hls_log2f(f32) -> f32; 6 | #[import(cc = "C", name = "pow")] fn hls_powf(f32, f32) -> f32; 7 | #[import(cc = "C", name = "rsqrt")] fn hls_rsqrtf(f32) -> f32; 8 | #[import(cc = "C", name = "sqrt")] fn hls_sqrtf(f32) -> f32; 9 | #[import(cc = "C", name = "fabs")] fn hls_fabsf(f32) -> f32; 10 | #[import(cc = "C", name = "sin")] fn hls_sinf(f32) -> f32; 11 | #[import(cc = "C", name = "cos")] fn hls_cosf(f32) -> f32; 12 | #[import(cc = "C", name = "tan")] fn hls_tanf(f32) -> f32; 13 | #[import(cc = "C", name = "asin")] fn hls_asinf(f32) -> f32; 14 | #[import(cc = "C", name = "acos")] fn hls_acosf(f32) -> f32; 15 | #[import(cc = "C", name = "atan")] fn hls_atanf(f32) -> f32; 16 | #[import(cc = "C", name = "erf")] fn hls_erff(f32) -> f32; 17 | #[import(cc = "C", name = "atan2")] fn hls_atan2f(f32, f32) -> f32; 18 | #[import(cc = "C", name = "fmod")] fn hls_fmodf(f32, f32) -> f32; 19 | #[import(cc = "C", name = "floor")] fn hls_floorf(f32) -> f32; 20 | #[import(cc = "C", name = "isinf")] fn hls_isinff(f32) -> i32; 21 | #[import(cc = "C", name = "isnan")] fn hls_isnanf(f32) -> i32; 22 | #[import(cc = "C", name = "isfinite")] fn hls_isfinitef(f32) -> i32; 23 | #[import(cc = "C", name = "fma")] fn hls_fmaf(f32, f32, f32) -> f32; 24 | #[import(cc = "C", name = "mad")] fn hls_madf(f32, f32, f32) -> f32; 25 | #[import(cc = "C", name = "copysign")] fn hls_copysignf(f32, f32) -> f32; 26 | #[import(cc = "C", name = "exp")] fn hls_exp(f64) -> f64; 27 | #[import(cc = "C", name = "exp2")] fn hls_exp2(f64) -> f64; 28 | #[import(cc = "C", name = "log")] fn hls_log(f64) -> f64; 29 | #[import(cc = "C", name = "log2")] fn hls_log2(f64) -> f64; 30 | #[import(cc = "C", name = "pow")] fn hls_pow(f64, f64) -> f64; 31 | #[import(cc = "C", name = "rsqrt")] fn hls_rsqrt(f64) -> f64; 32 | #[import(cc = "C", name = "sqrt")] fn hls_sqrt(f64) -> f64; 33 | #[import(cc = "C", name = "fabs")] fn hls_fabs(f64) -> f64; 34 | #[import(cc = "C", name = "sin")] fn hls_sin(f64) -> f64; 35 | #[import(cc = "C", name = "cos")] fn hls_cos(f64) -> f64; 36 | #[import(cc = "C", name = "tan")] fn hls_tan(f64) -> f64; 37 | #[import(cc = "C", name = "asin")] fn hls_asin(f64) -> f64; 38 | #[import(cc = "C", name = "acos")] fn hls_acos(f64) -> f64; 39 | #[import(cc = "C", name = "atan")] fn hls_atan(f64) -> f64; 40 | #[import(cc = "C", name = "erf")] fn hls_erf(f64) -> f64; 41 | #[import(cc = "C", name = "atan2")] fn 
hls_atan2(f64, f64) -> f64; 42 | #[import(cc = "C", name = "fmod")] fn hls_fmod(f64, f64) -> f64; 43 | #[import(cc = "C", name = "floor")] fn hls_floor(f64) -> f64; 44 | #[import(cc = "C", name = "isinf")] fn hls_isinf(f64) -> i32; 45 | #[import(cc = "C", name = "isnan")] fn hls_isnan(f64) -> i32; 46 | #[import(cc = "C", name = "isfinite")] fn hls_isfinite(f64) -> i32; 47 | #[import(cc = "C", name = "fma")] fn hls_fma(f64, f64, f64) -> f64; 48 | #[import(cc = "C", name = "mad")] fn hls_mad(f64, f64, f64) -> f64; 49 | #[import(cc = "C", name = "copysign")] fn hls_copysign(f64, f64) -> f64; 50 | #[import(cc = "C", name = "fmin")] fn hls_fminf(f32, f32) -> f32; 51 | #[import(cc = "C", name = "fmax")] fn hls_fmaxf(f32, f32) -> f32; 52 | #[import(cc = "C", name = "fmin")] fn hls_fmin(f64, f64) -> f64; 53 | #[import(cc = "C", name = "fmax")] fn hls_fmax(f64, f64) -> f64; 54 | #[import(cc = "C", name = "min")] fn hls_min(i32, i32) -> i32; 55 | #[import(cc = "C", name = "max")] fn hls_max(i32, i32) -> i32; 56 | 57 | #[import(cc = "device")] fn print_pragma(&[u8]) -> (); 58 | 59 | // channel scalar types 60 | struct channel[T] { data : T } 61 | // channel array types 62 | struct channel1[T] { data : [T * 1 ] } 63 | struct channel2[T] { data : [T * 2 ] } 64 | struct channel4[T] { data : [T * 4 ] } 65 | struct channel8[T] { data : [T * 8 ] } 66 | struct channel16[T] { data : [T * 16 ] } 67 | struct channel32[T] { data : [T * 32 ] } 68 | struct channel64[T] { data : [T * 64 ] } 69 | struct channel128[T] { data : [T * 128] } 70 | 71 | // read and write on scalar channels 72 | #[import(cc = "device", name = "read_channel")] fn read_channel[T] (&mut channel[T]) -> T; 73 | #[import(cc = "device", name = "write_channel")] fn write_channel[T] (&mut channel[T], T ) -> (); 74 | 75 | // read and write on array channels 76 | #[import(cc = "device", name = "read_channel")] fn read_channel1[T] ( &mut channel1[T] ) -> [T * 1 ]; 77 | #[import(cc = "device", name = "read_channel")] fn read_channel2[T] ( &mut channel2[T] ) -> [T * 2 ]; 78 | #[import(cc = "device", name = "read_channel")] fn read_channel4[T] ( &mut channel4[T] ) -> [T * 4 ]; 79 | #[import(cc = "device", name = "read_channel")] fn read_channel8[T] ( &mut channel8[T] ) -> [T * 8 ]; 80 | #[import(cc = "device", name = "read_channel")] fn read_channel16[T]( &mut channel16[T]) -> [T * 16]; 81 | #[import(cc = "device", name = "read_channel")] fn read_channel32[T]( &mut channel32[T]) -> [T * 32]; 82 | 83 | #[import(cc = "device", name = "write_channel")] fn write_channel1[T] ( &mut channel1[T], [T * 1 ]) -> (); 84 | #[import(cc = "device", name = "write_channel")] fn write_channel2[T] ( &mut channel2[T], [T * 2 ]) -> (); 85 | #[import(cc = "device", name = "write_channel")] fn write_channel4[T] ( &mut channel4[T], [T * 4 ]) -> (); 86 | #[import(cc = "device", name = "write_channel")] fn write_channel8[T] ( &mut channel8[T], [T * 8 ]) -> (); 87 | #[import(cc = "device", name = "write_channel")] fn write_channel16[T]( &mut channel16[T], [T * 16]) -> (); 88 | #[import(cc = "device", name = "write_channel")] fn write_channel32[T]( &mut channel32[T], [T * 32]) -> (); 89 | #[import(cc = "device", name = " ")] fn bitcast_channel[T]( &mut channel1[T]) -> [T * 2]; 90 | 91 | fn @hls_accelerator(dev: i32) = Accelerator { 92 | exec = @|body| |_grid, _block| { 93 | let work_item = WorkItem { 94 | tidx = @|| 0, tidy = @|| 0, tidz = @|| 0, 95 | bidx = @|| 0, bidy = @|| 0, bidz = @|| 0, 96 | gidx = @|| 0, gidy = @|| 0, gidz = @|| 0, 97 | bdimx = @|| 1, bdimy = @|| 1, bdimz 
= @|| 1, 98 | gdimx = @|| 1, gdimy = @|| 1, gdimz = @|| 1, 99 | nblkx = @|| 1, nblky = @|| 1, nblkz = @|| 1 100 | }; 101 | hls(dev, || @body(work_item)); 102 | }, 103 | sync = @|| synchronize_hls(dev), 104 | alloc = @|size| alloc_hls(dev, size), 105 | alloc_unified = @|size| alloc_hls_unified(dev, size), 106 | barrier = @|| () 107 | }; 108 | 109 | static hls_intrinsics = Intrinsics { 110 | expf = hls_expf, 111 | exp2f = hls_exp2f, 112 | logf = hls_logf, 113 | log2f = hls_log2f, 114 | powf = hls_powf, 115 | rsqrtf = hls_rsqrtf, 116 | sqrtf = hls_sqrtf, 117 | fabsf = hls_fabsf, 118 | sinf = hls_sinf, 119 | cosf = hls_cosf, 120 | tanf = hls_tanf, 121 | asinf = hls_asinf, 122 | acosf = hls_acosf, 123 | atanf = hls_atanf, 124 | erff = hls_erff, 125 | atan2f = hls_atan2f, 126 | copysignf = hls_copysignf, 127 | fmaf = hls_fmaf, 128 | fmaxf = hls_fmaxf, 129 | fminf = hls_fminf, 130 | fmodf = hls_fmodf, 131 | floorf = hls_floorf, 132 | isinff = hls_isinff, 133 | isnanf = hls_isnanf, 134 | isfinitef = hls_isfinitef, 135 | exp = hls_exp, 136 | exp2 = hls_exp2, 137 | log = hls_log, 138 | log2 = hls_log2, 139 | pow = hls_pow, 140 | rsqrt = hls_rsqrt, 141 | sqrt = hls_sqrt, 142 | fabs = hls_fabs, 143 | sin = hls_sin, 144 | cos = hls_cos, 145 | tan = hls_tan, 146 | asin = hls_asin, 147 | acos = hls_acos, 148 | atan = hls_atan, 149 | erf = hls_erf, 150 | atan2 = hls_atan2, 151 | copysign = hls_copysign, 152 | fma = hls_fma, 153 | fmax = hls_fmax, 154 | fmin = hls_fmin, 155 | fmod = hls_fmod, 156 | floor = hls_floor, 157 | isinf = hls_isinf, 158 | isnan = hls_isnan, 159 | isfinite = hls_isfinite, 160 | min = hls_min, 161 | max = hls_max, 162 | }; 163 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_levelzero.impala: -------------------------------------------------------------------------------- 1 | // most device intrinsics are the same as OpenCL and don't need to be duplicated 2 | fn spv_levelzero_get_num_groups() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](24 /* BuiltInNumWorkgroups */); 3 | fn spv_levelzero_get_local_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](25 /* BuiltInWorkgroupSize */); 4 | fn spv_levelzero_get_group_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](26 /* BuiltInWorkgroupId */); 5 | fn spv_levelzero_get_local_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](27 /* BuiltInLocalInvocationId */); 6 | fn spv_levelzero_get_global_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](28 /* BuiltInGlobalInvocationId */); 7 | fn spv_levelzero_get_global_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](31 /* BuiltInGlobalSize */); 8 | 9 | fn @levelzero_accelerator(dev: i32) = Accelerator { 10 | exec = @|body| |grid, block| { 11 | let work_item = WorkItem { 12 | tidx = @|| spv_levelzero_get_local_id()(0) as i32, 13 | tidy = @|| spv_levelzero_get_local_id()(1) as i32, 14 | tidz = @|| spv_levelzero_get_local_id()(2) as i32, 15 | bidx = @|| spv_levelzero_get_group_id()(0) as i32, 16 | bidy = @|| spv_levelzero_get_group_id()(1) as i32, 17 | bidz = @|| spv_levelzero_get_group_id()(2) as i32, 18 | gidx = @|| spv_levelzero_get_global_id()(0) as i32, 19 | gidy = @|| spv_levelzero_get_global_id()(1) as i32, 20 | gidz = @|| spv_levelzero_get_global_id()(2) as i32, 21 | bdimx = @|| spv_levelzero_get_local_size()(0) as i32, 22 | bdimy = @|| spv_levelzero_get_local_size()(1) as i32, 23 | bdimz = @|| spv_levelzero_get_local_size()(2) as i32, 24 | gdimx = @|| 
spv_levelzero_get_global_size()(0) as i32, 25 | gdimy = @|| spv_levelzero_get_global_size()(1) as i32, 26 | gdimz = @|| spv_levelzero_get_global_size()(2) as i32, 27 | nblkx = @|| spv_levelzero_get_num_groups()(0) as i32, 28 | nblky = @|| spv_levelzero_get_num_groups()(1) as i32, 29 | nblkz = @|| spv_levelzero_get_num_groups()(2) as i32 30 | }; 31 | levelzero(dev, grid, block, || @body(work_item)) 32 | }, 33 | sync = @|| synchronize_levelzero(dev), 34 | alloc = @|size| alloc_levelzero(dev, size), 35 | alloc_unified = @|size| alloc_levelzero_unified(dev, size), 36 | barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE), 37 | }; 38 | 39 | static levelzero_intrinsics = Intrinsics { 40 | expf = opencl_expf, 41 | exp2f = opencl_exp2f, 42 | logf = opencl_logf, 43 | log2f = opencl_log2f, 44 | powf = opencl_powf, 45 | rsqrtf = opencl_rsqrtf, 46 | sqrtf = opencl_sqrtf, 47 | fabsf = opencl_fabsf, 48 | sinf = opencl_sinf, 49 | cosf = opencl_cosf, 50 | tanf = opencl_tanf, 51 | asinf = opencl_asinf, 52 | acosf = opencl_acosf, 53 | atanf = opencl_atanf, 54 | erff = opencl_erff, 55 | atan2f = opencl_atan2f, 56 | copysignf = opencl_copysignf, 57 | fmaf = opencl_fmaf, 58 | fmaxf = opencl_fmaxf, 59 | fminf = opencl_fminf, 60 | fmodf = opencl_fmodf, 61 | floorf = opencl_floorf, 62 | isinff = opencl_isinff, 63 | isnanf = opencl_isnanf, 64 | isfinitef = opencl_isfinitef, 65 | exp = opencl_exp, 66 | exp2 = opencl_exp2, 67 | log = opencl_log, 68 | log2 = opencl_log2, 69 | pow = opencl_pow, 70 | rsqrt = opencl_rsqrt, 71 | sqrt = opencl_sqrt, 72 | fabs = opencl_fabs, 73 | sin = opencl_sin, 74 | cos = opencl_cos, 75 | tan = opencl_tan, 76 | asin = opencl_asin, 77 | acos = opencl_acos, 78 | atan = opencl_atan, 79 | erf = opencl_erf, 80 | atan2 = opencl_atan2, 81 | copysign = opencl_copysign, 82 | fma = opencl_fma, 83 | fmax = opencl_fmax, 84 | fmin = opencl_fmin, 85 | fmod = opencl_fmod, 86 | floor = opencl_floor, 87 | isinf = opencl_isinf, 88 | isnan = opencl_isnan, 89 | isfinite = opencl_isfinite, 90 | min = opencl_min, 91 | max = opencl_max, 92 | }; 93 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_math.impala: -------------------------------------------------------------------------------- 1 | mod math_builtins { 2 | 3 | #[import(cc = "builtin")] fn fabs[T](T) -> T; 4 | #[import(cc = "builtin")] fn copysign[T](T, T) -> T; 5 | #[import(cc = "builtin")] fn signbit[T](T) -> bool; 6 | #[import(cc = "builtin")] fn round[T](T) -> T; 7 | #[import(cc = "builtin")] fn ceil[T](T) -> T; 8 | #[import(cc = "builtin")] fn floor[T](T) -> T; 9 | #[import(cc = "builtin")] fn fmin[T](T, T) -> T; 10 | #[import(cc = "builtin")] fn fmax[T](T, T) -> T; 11 | #[import(cc = "builtin")] fn cos[T](T) -> T; 12 | #[import(cc = "builtin")] fn sin[T](T) -> T; 13 | #[import(cc = "builtin")] fn tan[T](T) -> T; 14 | #[import(cc = "builtin")] fn acos[T](T) -> T; 15 | #[import(cc = "builtin")] fn asin[T](T) -> T; 16 | #[import(cc = "builtin")] fn atan[T](T) -> T; 17 | #[import(cc = "builtin")] fn atan2[T](T, T) -> T; 18 | #[import(cc = "builtin")] fn sqrt[T](T) -> T; 19 | #[import(cc = "builtin")] fn cbrt[T](T) -> T; 20 | #[import(cc = "builtin")] fn pow[T](T, T) -> T; 21 | #[import(cc = "builtin")] fn exp[T](T) -> T; 22 | #[import(cc = "builtin")] fn exp2[T](T) -> T; 23 | #[import(cc = "builtin")] fn log[T](T) -> T; 24 | #[import(cc = "builtin")] fn log2[T](T) -> T; 25 | #[import(cc = "builtin")] fn log10[T](T) -> T; 26 | #[import(cc = "builtin")] fn isnan[T](T) -> bool; 27 | #[import(cc 
= "builtin")] fn isfinite[T](T) -> bool; 28 | 29 | } 30 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_opencl.impala: -------------------------------------------------------------------------------- 1 | // no declarations are emitted for "device" functions 2 | #[import(cc = "device", name = "barrier")] fn opencl_barrier(u32) -> (); 3 | #[import(cc = "device", name = "exp")] fn opencl_expf(f32) -> f32; 4 | #[import(cc = "device", name = "exp2")] fn opencl_exp2f(f32) -> f32; 5 | #[import(cc = "device", name = "log")] fn opencl_logf(f32) -> f32; 6 | #[import(cc = "device", name = "log2")] fn opencl_log2f(f32) -> f32; 7 | #[import(cc = "device", name = "pow")] fn opencl_powf(f32, f32) -> f32; 8 | #[import(cc = "device", name = "rsqrt")] fn opencl_rsqrtf(f32) -> f32; 9 | #[import(cc = "device", name = "sqrt")] fn opencl_sqrtf(f32) -> f32; 10 | #[import(cc = "device", name = "fabs")] fn opencl_fabsf(f32) -> f32; 11 | #[import(cc = "device", name = "sin")] fn opencl_sinf(f32) -> f32; 12 | #[import(cc = "device", name = "cos")] fn opencl_cosf(f32) -> f32; 13 | #[import(cc = "device", name = "tan")] fn opencl_tanf(f32) -> f32; 14 | #[import(cc = "device", name = "asin")] fn opencl_asinf(f32) -> f32; 15 | #[import(cc = "device", name = "acos")] fn opencl_acosf(f32) -> f32; 16 | #[import(cc = "device", name = "atan")] fn opencl_atanf(f32) -> f32; 17 | #[import(cc = "device", name = "erf")] fn opencl_erff(f32) -> f32; 18 | #[import(cc = "device", name = "atan2")] fn opencl_atan2f(f32, f32) -> f32; 19 | #[import(cc = "device", name = "fmod")] fn opencl_fmodf(f32, f32) -> f32; 20 | #[import(cc = "device", name = "floor")] fn opencl_floorf(f32) -> f32; 21 | #[import(cc = "device", name = "isinf")] fn opencl_isinff(f32) -> i32; 22 | #[import(cc = "device", name = "isnan")] fn opencl_isnanf(f32) -> i32; 23 | #[import(cc = "device", name = "isfinite")] fn opencl_isfinitef(f32) -> i32; 24 | #[import(cc = "device", name = "fma")] fn opencl_fmaf(f32, f32, f32) -> f32; 25 | #[import(cc = "device", name = "mad")] fn opencl_madf(f32, f32, f32) -> f32; 26 | #[import(cc = "device", name = "copysign")] fn opencl_copysignf(f32, f32) -> f32; 27 | #[import(cc = "device", name = "exp")] fn opencl_exp(f64) -> f64; 28 | #[import(cc = "device", name = "exp2")] fn opencl_exp2(f64) -> f64; 29 | #[import(cc = "device", name = "log")] fn opencl_log(f64) -> f64; 30 | #[import(cc = "device", name = "log2")] fn opencl_log2(f64) -> f64; 31 | #[import(cc = "device", name = "pow")] fn opencl_pow(f64, f64) -> f64; 32 | #[import(cc = "device", name = "rsqrt")] fn opencl_rsqrt(f64) -> f64; 33 | #[import(cc = "device", name = "sqrt")] fn opencl_sqrt(f64) -> f64; 34 | #[import(cc = "device", name = "fabs")] fn opencl_fabs(f64) -> f64; 35 | #[import(cc = "device", name = "sin")] fn opencl_sin(f64) -> f64; 36 | #[import(cc = "device", name = "cos")] fn opencl_cos(f64) -> f64; 37 | #[import(cc = "device", name = "tan")] fn opencl_tan(f64) -> f64; 38 | #[import(cc = "device", name = "asin")] fn opencl_asin(f64) -> f64; 39 | #[import(cc = "device", name = "acos")] fn opencl_acos(f64) -> f64; 40 | #[import(cc = "device", name = "atan")] fn opencl_atan(f64) -> f64; 41 | #[import(cc = "device", name = "erf")] fn opencl_erf(f64) -> f64; 42 | #[import(cc = "device", name = "atan2")] fn opencl_atan2(f64, f64) -> f64; 43 | #[import(cc = "device", name = "fmod")] fn opencl_fmod(f64, f64) -> f64; 44 | #[import(cc = "device", name = "floor")] fn opencl_floor(f64) -> f64; 45 | #[import(cc = 
"device", name = "isinf")] fn opencl_isinf(f64) -> i32; 46 | #[import(cc = "device", name = "isnan")] fn opencl_isnan(f64) -> i32; 47 | #[import(cc = "device", name = "isfinite")] fn opencl_isfinite(f64) -> i32; 48 | #[import(cc = "device", name = "fma")] fn opencl_fma(f64, f64, f64) -> f64; 49 | #[import(cc = "device", name = "mad")] fn opencl_mad(f64, f64, f64) -> f64; 50 | #[import(cc = "device", name = "copysign")] fn opencl_copysign(f64, f64) -> f64; 51 | #[import(cc = "device", name = "fmin")] fn opencl_fminf(f32, f32) -> f32; 52 | #[import(cc = "device", name = "fmax")] fn opencl_fmaxf(f32, f32) -> f32; 53 | #[import(cc = "device", name = "fmin")] fn opencl_fmin(f64, f64) -> f64; 54 | #[import(cc = "device", name = "fmax")] fn opencl_fmax(f64, f64) -> f64; 55 | #[import(cc = "device", name = "min")] fn opencl_min(i32, i32) -> i32; 56 | #[import(cc = "device", name = "max")] fn opencl_max(i32, i32) -> i32; 57 | #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global(&mut addrspace(1)i32, i32) -> i32; 58 | #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global_f32(&mut addrspace(1)f32, f32) -> f32; 59 | #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_shared(&mut addrspace(3)i32, i32) -> i32; 60 | #[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_global(&mut addrspace(1)i32, i32) -> i32; 61 | #[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_shared(&mut addrspace(3)i32, i32) -> i32; 62 | #[import(cc = "device", name = "get_work_dim")] fn opencl_get_work_dim() -> u32; 63 | #[import(cc = "device", name = "get_global_size")] fn opencl_get_global_size(u32) -> u64; 64 | #[import(cc = "device", name = "get_global_id")] fn opencl_get_global_id(u32) -> u64; 65 | #[import(cc = "device", name = "get_local_size")] fn opencl_get_local_size(u32) -> u64; 66 | #[import(cc = "device", name = "get_local_id")] fn opencl_get_local_id(u32) -> u64; 67 | #[import(cc = "device", name = "get_num_groups")] fn opencl_get_num_groups(u32) -> u64; 68 | #[import(cc = "device", name = "get_group_id")] fn opencl_get_group_id(u32) -> u64; 69 | #[import(cc = "device", name = "get_global_offset")] fn opencl_get_global_offset(u32) -> u64; 70 | 71 | static CLK_LOCAL_MEM_FENCE = 1:u32; 72 | static CLK_GLOBAL_MEM_FENCE = 2:u32; 73 | 74 | fn @opencl_accelerator(dev: i32) = Accelerator { 75 | exec = @|body| |grid, block| { 76 | let work_item = WorkItem { 77 | tidx = @|| opencl_get_local_id(0) as i32, 78 | tidy = @|| opencl_get_local_id(1) as i32, 79 | tidz = @|| opencl_get_local_id(2) as i32, 80 | bidx = @|| opencl_get_group_id(0) as i32, 81 | bidy = @|| opencl_get_group_id(1) as i32, 82 | bidz = @|| opencl_get_group_id(2) as i32, 83 | gidx = @|| opencl_get_global_id(0) as i32, 84 | gidy = @|| opencl_get_global_id(1) as i32, 85 | gidz = @|| opencl_get_global_id(2) as i32, 86 | bdimx = @|| opencl_get_local_size(0) as i32, 87 | bdimy = @|| opencl_get_local_size(1) as i32, 88 | bdimz = @|| opencl_get_local_size(2) as i32, 89 | gdimx = @|| opencl_get_global_size(0) as i32, 90 | gdimy = @|| opencl_get_global_size(1) as i32, 91 | gdimz = @|| opencl_get_global_size(2) as i32, 92 | nblkx = @|| opencl_get_num_groups(0) as i32, 93 | nblky = @|| opencl_get_num_groups(1) as i32, 94 | nblkz = @|| opencl_get_num_groups(2) as i32 95 | }; 96 | opencl(dev, grid, block, || @body(work_item)) 97 | }, 98 | sync = @|| synchronize_opencl(dev), 99 | alloc = @|size| alloc_opencl(dev, size), 100 | alloc_unified = @|size| alloc_opencl_unified(dev, size), 101 | 
barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE), 102 | }; 103 | 104 | fn spv_cl_get_num_groups() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](24 /* BuiltInNumWorkgroups */); 105 | fn spv_cl_get_local_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](25 /* BuiltInWorkgroupSize */); 106 | fn spv_cl_get_group_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](26 /* BuiltInWorkgroupId */); 107 | fn spv_cl_get_local_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](27 /* BuiltInLocalInvocationId */); 108 | fn spv_cl_get_global_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](28 /* BuiltInGlobalInvocationId */); 109 | fn spv_cl_get_global_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](31 /* BuiltInGlobalSize */); 110 | 111 | fn @opencl_spirv_accelerator(dev: i32) = Accelerator { 112 | exec = @|body| |grid, block| { 113 | let work_item = WorkItem { 114 | tidx = @|| spv_cl_get_local_id()(0) as i32, 115 | tidy = @|| spv_cl_get_local_id()(1) as i32, 116 | tidz = @|| spv_cl_get_local_id()(2) as i32, 117 | bidx = @|| spv_cl_get_group_id()(0) as i32, 118 | bidy = @|| spv_cl_get_group_id()(1) as i32, 119 | bidz = @|| spv_cl_get_group_id()(2) as i32, 120 | gidx = @|| spv_cl_get_global_id()(0) as i32, 121 | gidy = @|| spv_cl_get_global_id()(1) as i32, 122 | gidz = @|| spv_cl_get_global_id()(2) as i32, 123 | bdimx = @|| spv_cl_get_local_size()(0) as i32, 124 | bdimy = @|| spv_cl_get_local_size()(1) as i32, 125 | bdimz = @|| spv_cl_get_local_size()(2) as i32, 126 | gdimx = @|| spv_cl_get_global_size()(0) as i32, 127 | gdimy = @|| spv_cl_get_global_size()(1) as i32, 128 | gdimz = @|| spv_cl_get_global_size()(2) as i32, 129 | nblkx = @|| spv_cl_get_num_groups()(0) as i32, 130 | nblky = @|| spv_cl_get_num_groups()(1) as i32, 131 | nblkz = @|| spv_cl_get_num_groups()(2) as i32 132 | }; 133 | opencl_spirv(dev, grid, block, || @body(work_item)) 134 | }, 135 | sync = @|| synchronize_opencl(dev), 136 | alloc = @|size| alloc_opencl(dev, size), 137 | alloc_unified = @|size| alloc_opencl_unified(dev, size), 138 | barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE), 139 | }; 140 | 141 | static opencl_intrinsics = Intrinsics { 142 | expf = opencl_expf, 143 | exp2f = opencl_exp2f, 144 | logf = opencl_logf, 145 | log2f = opencl_log2f, 146 | powf = opencl_powf, 147 | rsqrtf = opencl_rsqrtf, 148 | sqrtf = opencl_sqrtf, 149 | fabsf = opencl_fabsf, 150 | sinf = opencl_sinf, 151 | cosf = opencl_cosf, 152 | tanf = opencl_tanf, 153 | asinf = opencl_asinf, 154 | acosf = opencl_acosf, 155 | atanf = opencl_atanf, 156 | erff = opencl_erff, 157 | atan2f = opencl_atan2f, 158 | copysignf = opencl_copysignf, 159 | fmaf = opencl_fmaf, 160 | fmaxf = opencl_fmaxf, 161 | fminf = opencl_fminf, 162 | fmodf = opencl_fmodf, 163 | floorf = opencl_floorf, 164 | isinff = opencl_isinff, 165 | isnanf = opencl_isnanf, 166 | isfinitef = opencl_isfinitef, 167 | exp = opencl_exp, 168 | exp2 = opencl_exp2, 169 | log = opencl_log, 170 | log2 = opencl_log2, 171 | pow = opencl_pow, 172 | rsqrt = opencl_rsqrt, 173 | sqrt = opencl_sqrt, 174 | fabs = opencl_fabs, 175 | sin = opencl_sin, 176 | cos = opencl_cos, 177 | tan = opencl_tan, 178 | asin = opencl_asin, 179 | acos = opencl_acos, 180 | atan = opencl_atan, 181 | erf = opencl_erf, 182 | atan2 = opencl_atan2, 183 | copysign = opencl_copysign, 184 | fma = opencl_fma, 185 | fmax = opencl_fmax, 186 | fmin = opencl_fmin, 187 | fmod = opencl_fmod, 188 | floor = opencl_floor, 189 | isinf = opencl_isinf, 190 | isnan = opencl_isnan, 191 | isfinite = 
opencl_isfinite, 192 | min = opencl_min, 193 | max = opencl_max, 194 | }; 195 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_rv.impala: -------------------------------------------------------------------------------- 1 | #[import(cc = "C")] fn rv_mask() -> bool; 2 | #[import(cc = "C")] fn rv_any(_: bool) -> bool; 3 | #[import(cc = "C")] fn rv_all(_: bool) -> bool; 4 | #[import(cc = "C")] fn rv_ballot(_: bool) -> i32; 5 | #[import(cc = "C")] fn rv_extract(_: f32, _: i32) -> f32; 6 | #[import(cc = "C")] fn rv_insert(_: f32, _: i32, _: f32) -> f32; 7 | #[import(cc = "C")] fn rv_load(_: &f32, _: i32) -> f32; 8 | #[import(cc = "C")] fn rv_store(_: &mut f32, _: i32, _: f32) -> (); 9 | #[import(cc = "C")] fn rv_shuffle(_: f32, _: i32) -> f32; 10 | #[import(cc = "C")] fn rv_align(_: &i8, _: i32)-> &i8; 11 | #[import(cc = "C")] fn rv_compact(_: f32, _: bool) -> f32; 12 | #[import(cc = "C")] fn rv_lane_id() -> i32; 13 | #[import(cc = "C")] fn rv_num_lanes() -> i32; 14 | -------------------------------------------------------------------------------- /platforms/artic/intrinsics_spirv.impala: -------------------------------------------------------------------------------- 1 | #[import(cc = "device", name = "spirv.builtin")] fn spirv_get_builtin[T](i32) -> T; -------------------------------------------------------------------------------- /platforms/artic/intrinsics_thorin.impala: -------------------------------------------------------------------------------- 1 | #[import(cc = "builtin")] fn undef[T]() -> T; 2 | #[import(cc = "builtin")] fn sizeof[_]() -> i64; 3 | #[import(cc = "builtin")] fn alignof[_]() -> i64; 4 | #[import(cc = "builtin")] fn bitcast[T, U](_src: U) -> T; 5 | #[import(cc = "builtin")] fn select[T, U](_cond: T, _true: U, _false: U) -> U; 6 | #[import(cc = "builtin")] fn insert[T, U](_tuple: T, _index: i32, _value: U) -> T; 7 | 8 | #[import(cc = "thorin")] fn atomic[T](_binop: u32, _addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> T; // Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub 9 | #[import(cc = "thorin")] fn atomic_load[T](_addr: &T, _order: u32, _scope: &[u8]) -> T; 10 | #[import(cc = "thorin")] fn atomic_store[T](_addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> (); 11 | #[import(cc = "thorin")] fn cmpxchg[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types 12 | #[import(cc = "thorin")] fn cmpxchg_weak[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types 13 | #[import(cc = "thorin")] fn fence(_order: u32, _scope: &[u8]) -> (); 14 | #[import(cc = "thorin")] fn pe_info[T](_src: &[u8], _val: T) -> (); 15 | #[import(cc = "thorin")] fn cuda(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 16 | #[import(cc = "thorin")] fn nvvm(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 17 | #[import(cc = "thorin")] fn opencl(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 18 | #[import(cc = "thorin")] fn opencl_spirv(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 19 | #[import(cc = "thorin")] fn amdgpu_hsa(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 20 | #[import(cc = "thorin")] fn amdgpu_pal(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), 
_body: fn() -> ()) -> (); 21 | #[import(cc = "thorin")] fn levelzero(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); 22 | #[import(cc = "thorin")] fn reserve_shared[T](_size: i32) -> &mut addrspace(3)[T]; 23 | #[import(cc = "thorin")] fn hls(_dev: i32, _body: fn() -> ()) -> (); 24 | #[import(cc = "thorin", name = "pipeline")] fn thorin_pipeline(_initiation_interval: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> (); // only for HLS/OpenCL backend 25 | #[import(cc = "thorin", name = "parallel")] fn thorin_parallel(_num_threads: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> (); 26 | #[import(cc = "thorin", name = "spawn")] fn thorin_spawn(_body: fn() -> ()) -> i32; 27 | #[import(cc = "thorin")] fn sync(_id: i32) -> (); 28 | #[import(cc = "thorin")] fn vectorize(_vector_length: i32, _body: fn(i32) -> ()) -> (); 29 | 30 | #[import(cc = "thorin", name = "atomic")] fn atomic_p1[T](_binop: u32, _addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> T; 31 | #[import(cc = "thorin", name = "atomic")] fn atomic_p3[T](_binop: u32, _addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> T; 32 | #[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p1[T](_addr: &addrspace(1)T, _order: u32, _scope: &[u8]) -> T; 33 | #[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p3[T](_addr: &addrspace(3)T, _order: u32, _scope: &[u8]) -> T; 34 | #[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p1[T](_addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> (); 35 | #[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p3[T](_addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> (); 36 | #[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); 37 | #[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); 38 | #[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); 39 | #[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); 40 | 41 | fn @pipeline(body: fn(i32) -> ()) = @|initiation_interval: i32, lower: i32, upper: i32| thorin_pipeline(initiation_interval, lower, upper, body); 42 | fn @parallel(body: fn(i32) -> ()) = @|num_threads: i32, lower: i32, upper: i32| thorin_parallel(num_threads, lower, upper, body); 43 | fn @spawn(body: fn() -> ()) = @|| thorin_spawn(body); 44 | -------------------------------------------------------------------------------- /platforms/artic/runtime.impala: -------------------------------------------------------------------------------- 1 | #[import(cc = "C", name = "anydsl_info")] fn runtime_info() -> (); 2 | #[import(cc = "C", name = "anydsl_device_name")] fn runtime_device_name(_device: i32) -> &[u8]; 3 | #[import(cc = "C", name = "anydsl_device_check_feature_support")] fn runtime_device_check_feature_support(_device: i32, _feature: &[u8]) -> bool; 4 | 5 | #[import(cc = "C", name = "anydsl_alloc")] fn runtime_alloc(_device: i32, _size: i64) -> &mut [i8]; 6 | #[import(cc = "C", name = "anydsl_alloc_host")] fn runtime_alloc_host(_device: i32, _size: i64) -> 
&mut [i8]; 7 | #[import(cc = "C", name = "anydsl_alloc_unified")] fn runtime_alloc_unified(_device: i32, _size: i64) -> &mut [i8]; 8 | #[import(cc = "C", name = "anydsl_copy")] fn runtime_copy(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> (); 9 | #[import(cc = "C", name = "anydsl_get_device_ptr")] fn runtime_get_device_ptr(_device: i32, _ptr: &[i8]) -> &[i8]; 10 | #[import(cc = "C", name = "anydsl_synchronize")] fn runtime_synchronize(_device: i32) -> (); 11 | #[import(cc = "C", name = "anydsl_release")] fn runtime_release(_device: i32, _ptr: &[i8]) -> (); 12 | #[import(cc = "C", name = "anydsl_release_host")] fn runtime_release_host(_device: i32, _ptr: &[i8]) -> (); 13 | 14 | #[import(cc = "C", name = "anydsl_random_seed")] fn random_seed(_: u32) -> (); 15 | #[import(cc = "C", name = "anydsl_random_val_f32")] fn random_val_f32() -> f32; 16 | #[import(cc = "C", name = "anydsl_random_val_u64")] fn random_val_u64() -> u64; 17 | 18 | #[import(cc = "C", name = "anydsl_get_micro_time")] fn get_micro_time() -> i64; 19 | #[import(cc = "C", name = "anydsl_get_nano_time")] fn get_nano_time() -> i64; 20 | #[import(cc = "C", name = "anydsl_get_kernel_time")] fn get_kernel_time() -> i64; 21 | 22 | #[import(cc = "C", name = "anydsl_print_i16")] fn print_i16(_: i16) -> (); 23 | #[import(cc = "C", name = "anydsl_print_i32")] fn print_i32(_: i32) -> (); 24 | #[import(cc = "C", name = "anydsl_print_i64")] fn print_i64(_: i64) -> (); 25 | #[import(cc = "C", name = "anydsl_print_u16")] fn print_u16(_: u16) -> (); 26 | #[import(cc = "C", name = "anydsl_print_u32")] fn print_u32(_: u32) -> (); 27 | #[import(cc = "C", name = "anydsl_print_u64")] fn print_u64(_: u64) -> (); 28 | #[import(cc = "C", name = "anydsl_print_f32")] fn print_f32(_: f32) -> (); 29 | #[import(cc = "C", name = "anydsl_print_f64")] fn print_f64(_: f64) -> (); 30 | #[import(cc = "C", name = "anydsl_print_char")] fn print_char(_: u8) -> (); 31 | #[import(cc = "C", name = "anydsl_print_string")] fn print_string(_: &[u8]) -> (); 32 | #[import(cc = "C", name = "anydsl_print_flush")] fn print_flush() -> (); 33 | 34 | // TODO 35 | //struct Buffer[T] { 36 | // data : &mut [T], 37 | // size : i64, 38 | // device : i32 39 | //} 40 | // 41 | //fn @alloc[T](device: i32, size: i64) = Buffer[T] { 42 | // data = runtime_alloc(device, size * sizeof[T]()) as &mut [T], 43 | // size = size, 44 | // device = device 45 | //}; 46 | //fn @alloc_host[T](device: i32, size: i64) = Buffer[T] { 47 | // data = runtime_alloc_host(device, size * sizeof[T]()) as &mut [T], 48 | // size = size, 49 | // device = device 50 | //}; 51 | //fn @alloc_unified[T](device: i32, size: i64) = Buffer[T] { 52 | // data = runtime_alloc_unified(device, size * sizeof[T]()) as &mut [T], 53 | // size = size, 54 | // device = device 55 | //}; 56 | // 57 | //fn @release[T](buf: Buffer[T]) = runtime_release(buf.device, buf.data as &[i8]); 58 | //fn @alloc_cpu[T](size: i64) = alloc[T](0, size); 59 | //fn @alloc_cuda[T](dev: i32, size: i64) = alloc[T](runtime_device(1, dev), size); 60 | //fn @alloc_cuda_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(1, dev), size); 61 | //fn @alloc_cuda_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(1, dev), size); 62 | //fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev)); 63 | //fn @alloc_opencl[T](dev: i32, size: i64) = alloc[T](runtime_device(2, dev), size); 64 | //fn @alloc_opencl_unified[T](dev: i32, size: i64) = 
alloc_unified[T](runtime_device(2, dev), size); 65 | //fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev)); 66 | //fn @alloc_hsa[T](dev: i32, size: i64) = alloc[T](runtime_device(3, dev), size); 67 | //fn @alloc_hsa_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(3, dev), size); 68 | //fn @alloc_hsa_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(3, dev), size); 69 | //fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev)); 70 | //fn @alloc_pal[T](dev: i32, size: i64) = alloc[T](runtime_device(4, dev), size); 71 | //fn @alloc_pal_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(4, dev), size); 72 | //fn @alloc_pal_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(4, dev), size); 73 | //fn @synchronize_pal(dev: i32) = runtime_synchronize(runtime_device(4, dev)); 74 | // 75 | //fn @copy[T](src: Buffer[T], dst: Buffer[T]) = runtime_copy(src.device, src.data as &[i8], 0, dst.device, dst.data as &mut [i8], 0, src.size); 76 | //fn @copy_offset[T](src: Buffer[T], off_src: i64, dst: Buffer[T], off_dst: i64, size: i64) = runtime_copy(src.device, src.data as &[i8], off_src, dst.device, dst.data as &mut [i8], off_dst, size); 77 | 78 | struct Buffer { 79 | data : &mut [i8], 80 | size : i64, 81 | device : i32 82 | } 83 | 84 | fn @alloc(device: i32, size: i64) = Buffer { 85 | data = runtime_alloc(device, size), 86 | size = size, 87 | device = device 88 | }; 89 | fn @alloc_host(device: i32, size: i64) = Buffer { 90 | data = runtime_alloc_host(device, size), 91 | size = size, 92 | device = device 93 | }; 94 | fn @alloc_unified(device: i32, size: i64) = Buffer { 95 | data = runtime_alloc_unified(device, size), 96 | size = size, 97 | device = device 98 | }; 99 | fn @release(buf: Buffer) = runtime_release(buf.device, buf.data); 100 | 101 | fn @runtime_device(platform: i32, device: i32) -> i32 { platform | (device << 4) } 102 | 103 | fn @alloc_cpu(size: i64) = alloc(0, size); 104 | fn @alloc_cuda(dev: i32, size: i64) = alloc(runtime_device(1, dev), size); 105 | fn @alloc_cuda_host(dev: i32, size: i64) = alloc_host(runtime_device(1, dev), size); 106 | fn @alloc_cuda_unified(dev: i32, size: i64) = alloc_unified(runtime_device(1, dev), size); 107 | fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev)); 108 | fn @alloc_opencl(dev: i32, size: i64) = alloc(runtime_device(2, dev), size); 109 | fn @alloc_opencl_unified(dev: i32, size: i64) = alloc_unified(runtime_device(2, dev), size); 110 | fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev)); 111 | fn @alloc_hls(dev: i32, size: i64) -> Buffer { alloc(runtime_device(2, dev), size) } 112 | fn @alloc_hls_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(2, dev), size) } 113 | fn @synchronize_hls(dev: i32) -> () { runtime_synchronize(runtime_device(2, dev)) } 114 | fn @alloc_hsa(dev: i32, size: i64) = alloc(runtime_device(3, dev), size); 115 | fn @alloc_hsa_host(dev: i32, size: i64) = alloc_host(runtime_device(3, dev), size); 116 | fn @alloc_hsa_unified(dev: i32, size: i64) = alloc_unified(runtime_device(3, dev), size); 117 | fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev)); 118 | fn @alloc_pal(dev: i32, size: i64) = alloc(runtime_device(4, dev), size); 119 | fn @alloc_pal_host(dev: i32, size: i64) = alloc_host(runtime_device(4, dev), size); 120 | fn @alloc_pal_unified(dev: i32, size: i64) = alloc_unified(runtime_device(4, dev), size); 121 | fn @synchronize_pal(dev: i32) 
= runtime_synchronize(runtime_device(4, dev)); 122 | fn @alloc_levelzero(dev: i32, size: i64) = alloc(runtime_device(5, dev), size); 123 | fn @alloc_levelzero_host(dev: i32, size: i64) = alloc_host(runtime_device(5, dev), size); 124 | fn @alloc_levelzero_unified(dev: i32, size: i64) = alloc_unified(runtime_device(5, dev), size); 125 | fn @synchronize_levelzero(dev: i32) = runtime_synchronize(runtime_device(5, dev)); 126 | 127 | fn @copy(src: Buffer, dst: Buffer) = runtime_copy(src.device, src.data, 0, dst.device, dst.data, 0, src.size); 128 | fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) = runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size); 129 | 130 | 131 | // range, range_step, unroll, unroll_step, etc. 132 | fn @unroll_step(body: fn(i32) -> ()) { 133 | fn @(?beg & ?end & ?step) loop(beg: i32, end: i32, step: i32) -> () { 134 | if beg < end { 135 | @body(beg); 136 | loop(beg + step, end, step) 137 | } 138 | } 139 | loop 140 | } 141 | 142 | fn @unroll_step_rev(body: fn(i32) -> ()) { 143 | fn @(?beg & ?end & ?step) loop(end: i32, beg: i32, step: i32) -> () { 144 | if end > beg { 145 | @body(end); 146 | loop(end - step, beg, step) 147 | } 148 | } 149 | loop 150 | } 151 | 152 | fn @range(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)($lower, $upper, 1); 153 | fn @range_step(body: fn(i32) -> ()) = @|lower: i32, upper: i32, step: i32| unroll_step(body)($lower, $upper, step); 154 | fn @range_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)(upper, lower, 1); 155 | 156 | fn @unroll(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)(lower, upper, 1); 157 | fn @unroll_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)(upper, lower, 1); 158 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics.impala: -------------------------------------------------------------------------------- 1 | struct WorkItem { 2 | tidx : fn() -> i32, 3 | tidy : fn() -> i32, 4 | tidz : fn() -> i32, 5 | bidx : fn() -> i32, 6 | bidy : fn() -> i32, 7 | bidz : fn() -> i32, 8 | gidx : fn() -> i32, 9 | gidy : fn() -> i32, 10 | gidz : fn() -> i32, 11 | bdimx : fn() -> i32, 12 | bdimy : fn() -> i32, 13 | bdimz : fn() -> i32, 14 | gdimx : fn() -> i32, 15 | gdimy : fn() -> i32, 16 | gdimz : fn() -> i32, 17 | nblkx : fn() -> i32, 18 | nblky : fn() -> i32, 19 | nblkz : fn() -> i32 20 | } 21 | 22 | struct Accelerator { 23 | exec : fn((i32, i32, i32), // grid 24 | (i32, i32, i32), // block 25 | fn(WorkItem) -> ()) -> (), 26 | sync : fn() -> (), 27 | alloc : fn(i64) -> Buffer, 28 | alloc_unified : fn(i64) -> Buffer, 29 | barrier : fn() -> () 30 | } 31 | 32 | struct Intrinsics { 33 | expf : fn(f32) -> f32, 34 | exp2f : fn(f32) -> f32, 35 | logf : fn(f32) -> f32, 36 | log2f : fn(f32) -> f32, 37 | powf : fn(f32, f32) -> f32, 38 | rsqrtf : fn(f32) -> f32, 39 | sqrtf : fn(f32) -> f32, 40 | fabsf : fn(f32) -> f32, 41 | sinf : fn(f32) -> f32, 42 | cosf : fn(f32) -> f32, 43 | tanf : fn(f32) -> f32, 44 | asinf : fn(f32) -> f32, 45 | acosf : fn(f32) -> f32, 46 | atanf : fn(f32) -> f32, 47 | erff : fn(f32) -> f32, 48 | atan2f : fn(f32, f32) -> f32, 49 | copysignf : fn(f32, f32) -> f32, 50 | fmaf : fn(f32, f32, f32) -> f32, 51 | fmaxf : fn(f32, f32) -> f32, 52 | fminf : fn(f32, f32) -> f32, 53 | fmodf : fn(f32, f32) -> f32, 54 | floorf : fn(f32) -> f32, 55 | isinff : fn(f32) -> i32, 56 | isnanf : fn(f32) -> i32, 57 | isfinitef : fn(f32) -> i32, 
58 | exp : fn(f64) -> f64, 59 | exp2 : fn(f64) -> f64, 60 | log : fn(f64) -> f64, 61 | log2 : fn(f64) -> f64, 62 | pow : fn(f64, f64) -> f64, 63 | rsqrt : fn(f64) -> f64, 64 | sqrt : fn(f64) -> f64, 65 | fabs : fn(f64) -> f64, 66 | sin : fn(f64) -> f64, 67 | cos : fn(f64) -> f64, 68 | tan : fn(f64) -> f64, 69 | asin : fn(f64) -> f64, 70 | acos : fn(f64) -> f64, 71 | atan : fn(f64) -> f64, 72 | erf : fn(f64) -> f64, 73 | atan2 : fn(f64, f64) -> f64, 74 | copysign : fn(f64, f64) -> f64, 75 | fma : fn(f64, f64, f64) -> f64, 76 | fmax : fn(f64, f64) -> f64, 77 | fmin : fn(f64, f64) -> f64, 78 | fmod : fn(f64, f64) -> f64, 79 | floor : fn(f64) -> f64, 80 | isinf : fn(f64) -> i32, 81 | isnan : fn(f64) -> i32, 82 | isfinite : fn(f64) -> i32, 83 | min : fn(i32, i32) -> i32, 84 | max : fn(i32, i32) -> i32, 85 | } 86 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_cpu.impala: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | //fn "sinf" cpu_sinf(f32) -> f32; 3 | //fn "cosf" cpu_cosf(f32) -> f32; 4 | fn "tanf" cpu_tanf(f32) -> f32; 5 | fn "asinf" cpu_asinf(f32) -> f32; 6 | fn "acosf" cpu_acosf(f32) -> f32; 7 | fn "atanf" cpu_atanf(f32) -> f32; 8 | fn "erff" cpu_erff(f32) -> f32; 9 | fn "fmodf" cpu_fmodf(f32, f32) -> f32; 10 | fn "atan2f" cpu_atan2f(f32, f32) -> f32; 11 | fn "anydsl_isinff" cpu_isinff(f32) -> i32; 12 | fn "anydsl_isnanf" cpu_isnanf(f32) -> i32; 13 | fn "anydsl_isfinitef" cpu_isfinitef(f32) -> i32; 14 | //fn "sin" cpu_sin(f64) -> f64; 15 | //fn "cos" cpu_cos(f64) -> f64; 16 | fn "tan" cpu_tan(f64) -> f64; 17 | fn "asin" cpu_asin(f64) -> f64; 18 | fn "acos" cpu_acos(f64) -> f64; 19 | fn "atan" cpu_atan(f64) -> f64; 20 | fn "erf" cpu_erf(f64) -> f64; 21 | fn "fmod" cpu_fmod(f64, f64) -> f64; 22 | fn "atan2" cpu_atan2(f64, f64) -> f64; 23 | fn "anydsl_isinf" cpu_isinf(f64) -> i32; 24 | fn "anydsl_isnan" cpu_isnan(f64) -> i32; 25 | fn "anydsl_isfinite" cpu_isfinite(f64) -> i32; 26 | } 27 | 28 | extern "device" { 29 | fn "llvm.exp.f32" cpu_expf(f32) -> f32; 30 | fn "llvm.exp2.f32" cpu_exp2f(f32) -> f32; 31 | fn "llvm.log.f32" cpu_logf(f32) -> f32; 32 | fn "llvm.log2.f32" cpu_log2f(f32) -> f32; 33 | fn "llvm.pow.f32" cpu_powf(f32, f32) -> f32; 34 | fn "llvm.sqrt.f32" cpu_sqrtf(f32) -> f32; 35 | fn "llvm.fabs.f32" cpu_fabsf(f32) -> f32; 36 | fn "llvm.sin.f32" cpu_sinf(f32) -> f32; 37 | fn "llvm.cos.f32" cpu_cosf(f32) -> f32; 38 | fn "llvm.floor.f32" cpu_floorf(f32) -> f32; 39 | fn "llvm.fma.f32" cpu_fmaf(f32, f32, f32) -> f32; 40 | fn "llvm.fmuladd.f32" cpu_madf(f32, f32, f32) -> f32; 41 | fn "llvm.copysign.f32" cpu_copysignf(f32, f32) -> f32; 42 | fn "llvm.minnum.f32" cpu_fminf(f32, f32) -> f32; 43 | fn "llvm.maxnum.f32" cpu_fmaxf(f32, f32) -> f32; 44 | fn "llvm.exp.f64" cpu_exp(f64) -> f64; 45 | fn "llvm.exp2.f64" cpu_exp2(f64) -> f64; 46 | fn "llvm.log.f64" cpu_log(f64) -> f64; 47 | fn "llvm.log2.f64" cpu_log2(f64) -> f64; 48 | fn "llvm.pow.f64" cpu_pow(f64, f64) -> f64; 49 | fn "llvm.sqrt.f64" cpu_sqrt(f64) -> f64; 50 | fn "llvm.fabs.f64" cpu_fabs(f64) -> f64; 51 | fn "llvm.sin.f64" cpu_sin(f64) -> f64; 52 | fn "llvm.cos.f64" cpu_cos(f64) -> f64; 53 | fn "llvm.floor.f64" cpu_floor(f64) -> f64; 54 | fn "llvm.fma.f64" cpu_fma(f64, f64, f64) -> f64; 55 | fn "llvm.fmuladd.f64" cpu_mad(f64, f64, f64) -> f64; 56 | fn "llvm.copysign.f64" cpu_copysign(f64, f64) -> f64; 57 | fn "llvm.minnum.f64" cpu_fmin(f64, f64) -> f64; 58 | fn "llvm.maxnum.f64" cpu_fmax(f64, f64) -> f64; 59 | fn 
"llvm.ctpop.i32" cpu_popcount32(i32) -> i32; 60 | fn "llvm.ctpop.i64" cpu_popcount64(i64) -> i64; 61 | fn "llvm.ctlz.i32" cpu_clz32(i32, bool) -> i32; 62 | fn "llvm.ctlz.i64" cpu_clz64(i64, bool) -> i64; 63 | fn "llvm.cttz.i32" cpu_ctz32(i32, bool) -> i32; 64 | fn "llvm.cttz.i64" cpu_ctz64(i64, bool) -> i64; 65 | fn "llvm.x86.bmi.pext.32" cpu_pext32(i32, i32) -> i32; 66 | fn "llvm.x86.bmi.pext.64" cpu_pext64(i64, i64) -> i64; 67 | fn "llvm.x86.bmi.pdep.32" cpu_pdep32(i32, i32) -> i32; 68 | fn "llvm.x86.bmi.pdep.64" cpu_pdep64(i64, i64) -> i64; 69 | fn "llvm.prefetch.p0" cpu_prefetch(&u8, i32, i32, i32) -> (); 70 | } 71 | 72 | // 73 | // atomics 74 | // 0 1 2 3 4 5 6 7 8 9 10 11 12 75 | // operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub 76 | // 0 1 2 4 5 6 7 77 | // ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent 78 | // syncscope: singlethread "" (system) 79 | // 80 | 81 | fn @cpu_atomic_xchg(a: &mut i32, b: i32) -> i32 { atomic(0u32, a, b, 7u32, "") } 82 | fn @cpu_atomic_add(a: &mut i32, b: i32) -> i32 { atomic(1u32, a, b, 7u32, "") } 83 | fn @cpu_atomic_sub(a: &mut i32, b: i32) -> i32 { atomic(2u32, a, b, 7u32, "") } 84 | fn @cpu_atomic_max(a: &mut i32, b: i32) -> i32 { atomic(7u32, a, b, 7u32, "") } 85 | fn @cpu_atomic_min(a: &mut i32, b: i32) -> i32 { atomic(8u32, a, b, 7u32, "") } 86 | 87 | static cpu_intrinsics = Intrinsics { 88 | expf : cpu_expf, 89 | exp2f : cpu_exp2f, 90 | logf : cpu_logf, 91 | log2f : cpu_log2f, 92 | powf : cpu_powf, 93 | rsqrtf : @|a| { 1.0f / cpu_sqrtf(a) }, 94 | sqrtf : cpu_sqrtf, 95 | fabsf : cpu_fabsf, 96 | sinf : cpu_sinf, 97 | cosf : cpu_cosf, 98 | tanf : cpu_tanf, 99 | asinf : cpu_asinf, 100 | acosf : cpu_acosf, 101 | atanf : cpu_atanf, 102 | erff : cpu_erff, 103 | atan2f : cpu_atan2f, 104 | copysignf : cpu_copysignf, 105 | fmaf : cpu_fmaf, 106 | fmaxf : cpu_fmaxf, 107 | fminf : cpu_fminf, 108 | fmodf : cpu_fmodf, 109 | floorf : cpu_floorf, 110 | isinff : cpu_isinff, 111 | isnanf : cpu_isnanf, 112 | isfinitef : cpu_isfinitef, 113 | exp : cpu_exp, 114 | exp2 : cpu_exp2, 115 | log : cpu_log, 116 | log2 : cpu_log2, 117 | pow : cpu_pow, 118 | rsqrt : @|a| { 1.0 / cpu_sqrt(a) }, 119 | sqrt : cpu_sqrt, 120 | fabs : cpu_fabs, 121 | sin : cpu_sin, 122 | cos : cpu_cos, 123 | tan : cpu_tan, 124 | asin : cpu_asin, 125 | acos : cpu_acos, 126 | atan : cpu_atan, 127 | erf : cpu_erf, 128 | atan2 : cpu_atan2, 129 | copysign : cpu_copysign, 130 | fma : cpu_fma, 131 | fmax : cpu_fmax, 132 | fmin : cpu_fmin, 133 | fmod : cpu_fmod, 134 | floor : cpu_floor, 135 | isinf : cpu_isinf, 136 | isnan : cpu_isnan, 137 | isfinite : cpu_isfinite, 138 | min : @|a, b| { if a < b { a } else { b } }, 139 | max : @|a, b| { if a > b { a } else { b } }, 140 | }; 141 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_hls.impala: -------------------------------------------------------------------------------- 1 | extern "device" { 2 | // no declarations are emitted for "device" functions 3 | fn "exp" hls_expf(f32) -> f32; 4 | fn "exp2" hls_exp2f(f32) -> f32; 5 | fn "log" hls_logf(f32) -> f32; 6 | fn "log2" hls_log2f(f32) -> f32; 7 | fn "pow" hls_powf(f32, f32) -> f32; 8 | fn "rsqrt" hls_rsqrtf(f32) -> f32; 9 | fn "sqrt" hls_sqrtf(f32) -> f32; 10 | fn "fabs" hls_fabsf(f32) -> f32; 11 | fn "sin" hls_sinf(f32) -> f32; 12 | fn "cos" hls_cosf(f32) -> f32; 13 | fn "tan" hls_tanf(f32) -> f32; 14 | fn "asin" hls_asinf(f32) -> f32; 15 | fn "acos" hls_acosf(f32) -> f32; 16 | fn 
"atan" hls_atanf(f32) -> f32; 17 | fn "erf" hls_erff(f32) -> f32; 18 | fn "atan2" hls_atan2f(f32, f32) -> f32; 19 | fn "fmod" hls_fmodf(f32, f32) -> f32; 20 | fn "floor" hls_floorf(f32) -> f32; 21 | fn "isinf" hls_isinff(f32) -> i32; 22 | fn "isnan" hls_isnanf(f32) -> i32; 23 | fn "isfinite" hls_isfinitef(f32) -> i32; 24 | fn "fma" hls_fmaf(f32, f32, f32) -> f32; 25 | fn "mad" hls_madf(f32, f32, f32) -> f32; 26 | fn "copysign" hls_copysignf(f32, f32) -> f32; 27 | fn "exp" hls_exp(f64) -> f64; 28 | fn "exp2" hls_exp2(f64) -> f64; 29 | fn "log" hls_log(f64) -> f64; 30 | fn "log2" hls_log2(f64) -> f64; 31 | fn "pow" hls_pow(f64, f64) -> f64; 32 | fn "rsqrt" hls_rsqrt(f64) -> f64; 33 | fn "sqrt" hls_sqrt(f64) -> f64; 34 | fn "fabs" hls_fabs(f64) -> f64; 35 | fn "sin" hls_sin(f64) -> f64; 36 | fn "cos" hls_cos(f64) -> f64; 37 | fn "tan" hls_tan(f64) -> f64; 38 | fn "asin" hls_asin(f64) -> f64; 39 | fn "acos" hls_acos(f64) -> f64; 40 | fn "atan" hls_atan(f64) -> f64; 41 | fn "erf" hls_erf(f64) -> f64; 42 | fn "atan2" hls_atan2(f64, f64) -> f64; 43 | fn "fmod" hls_fmod(f64, f64) -> f64; 44 | fn "floor" hls_floor(f64) -> f64; 45 | fn "isinf" hls_isinf(f64) -> i32; 46 | fn "isnan" hls_isnan(f64) -> i32; 47 | fn "isfinite" hls_isfinite(f64) -> i32; 48 | fn "fma" hls_fma(f64, f64, f64) -> f64; 49 | fn "mad" hls_mad(f64, f64, f64) -> f64; 50 | fn "copysign" hls_copysign(f64, f64) -> f64; 51 | fn "fmin" hls_fminf(f32, f32) -> f32; 52 | fn "fmax" hls_fmaxf(f32, f32) -> f32; 53 | fn "fmin" hls_fmin(f64, f64) -> f64; 54 | fn "fmax" hls_fmax(f64, f64) -> f64; 55 | fn "min" hls_min(i32, i32) -> i32; 56 | fn "max" hls_max(i32, i32) -> i32; 57 | } 58 | 59 | // channel scalar types 60 | struct channel_u8 { data : u8 }; 61 | struct channel_i32 { data : i32 }; 62 | struct channel_f32 { data : f32 }; 63 | 64 | // channel array types 65 | struct channel1_u8 { data : [u8 * 1 ] }; 66 | struct channel2_u8 { data : [u8 * 2 ] }; 67 | struct channel4_u8 { data : [u8 * 4 ] }; 68 | struct channel8_u8 { data : [u8 * 8 ] }; 69 | struct channel16_u8 { data : [u8 * 16 ] }; 70 | struct channel32_u8 { data : [u8 * 32 ] }; 71 | struct channel64_u8 { data : [u8 * 64 ] }; 72 | struct channel128_u8 { data : [u8 * 128] }; 73 | 74 | struct channel1_i32 { data : [i32 * 1 ] }; 75 | struct channel2_i32 { data : [i32 * 2 ] }; 76 | struct channel4_i32 { data : [i32 * 4 ] }; 77 | struct channel8_i32 { data : [i32 * 8 ] }; 78 | struct channel16_i32 { data : [i32 * 16 ] }; 79 | struct channel32_i32 { data : [i32 * 32 ] }; 80 | struct channel64_i32 { data : [i32 * 64 ] }; 81 | struct channel128_i32 { data : [i32 * 128] }; 82 | 83 | struct channel1_f32 { data : [f32 * 1 ] }; 84 | struct channel2_f32 { data : [f32 * 2 ] }; 85 | struct channel4_f32 { data : [f32 * 4 ] }; 86 | struct channel8_f32 { data : [f32 * 8 ] }; 87 | struct channel16_f32 { data : [f32 * 16 ] }; 88 | struct channel32_f32 { data : [f32 * 32 ] }; 89 | struct channel64_f32 { data : [f32 * 64 ] }; 90 | struct channel128_f32 { data : [f32 * 128] }; 91 | 92 | extern "device" { 93 | fn print_pragma(&[u8]) -> (); 94 | // u8 scalar 95 | fn "read_channel" read_channel_u8 ( &mut channel_u8 ) -> u8 ; 96 | fn "write_channel" write_channel_u8 ( &mut channel_u8, u8) -> (); 97 | 98 | // u8 array 99 | fn "read_channel" read_channel1_u8 ( &mut channel1_u8 ) -> [u8 * 1 ]; 100 | fn "read_channel" read_channel2_u8 ( &mut channel2_u8 ) -> [u8 * 2 ]; 101 | fn "read_channel" read_channel4_u8 ( &mut channel4_u8 ) -> [u8 * 4 ]; 102 | fn "read_channel" read_channel8_u8 ( &mut channel8_u8 ) -> [u8 * 8 
]; 103 | fn "read_channel" read_channel16_u8 ( &mut channel16_u8 ) -> [u8 * 16 ]; 104 | fn "read_channel" read_channel32_u8 ( &mut channel32_u8 ) -> [u8 * 32 ]; 105 | fn "read_channel" read_channel64_u8 ( &mut channel64_u8 ) -> [u8 * 64 ]; 106 | fn "read_channel" read_channel128_u8( &mut channel128_u8) -> [u8 * 128]; 107 | 108 | fn "write_channel" write_channel1_u8 ( &mut channel1_u8, [u8 * 1 ] ) -> (); 109 | fn "write_channel" write_channel2_u8 ( &mut channel2_u8, [u8 * 2 ] ) -> (); 110 | fn "write_channel" write_channel4_u8 ( &mut channel4_u8, [u8 * 4 ] ) -> (); 111 | fn "write_channel" write_channel8_u8 ( &mut channel8_u8, [u8 * 8 ] ) -> (); 112 | fn "write_channel" write_channel16_u8 ( &mut channel16_u8, [u8 * 16 ] ) -> (); 113 | fn "write_channel" write_channel32_u8 ( &mut channel32_u8, [u8 * 32 ] ) -> (); 114 | fn "write_channel" write_channel64_u8 ( &mut channel64_u8, [u8 * 64 ] ) -> (); 115 | fn "write_channel" write_channel128_u8( &mut channel128_u8, [u8 * 128] ) -> (); 116 | fn " " bitcast_channel_u8 ( &mut channel1_u8) -> [u8 * 2 ]; 117 | 118 | // i32 scalar 119 | fn "read_channel" read_channel_i32 ( &mut channel_i32 ) -> i32; 120 | fn "write_channel" write_channel_i32 ( &mut channel_i32, i32 ) -> (); 121 | 122 | // i32 array 123 | fn "read_channel" read_channel1_i32 ( &mut channel1_i32 ) -> [i32 * 1 ]; 124 | fn "read_channel" read_channel2_i32 ( &mut channel2_i32 ) -> [i32 * 2 ]; 125 | fn "read_channel" read_channel4_i32 ( &mut channel4_i32 ) -> [i32 * 4 ]; 126 | fn "read_channel" read_channel8_i32 ( &mut channel8_i32 ) -> [i32 * 8 ]; 127 | fn "read_channel" read_channel16_i32 ( &mut channel16_i32 ) -> [i32 * 16 ]; 128 | fn "read_channel" read_channel32_i32 ( &mut channel32_i32 ) -> [i32 * 32 ]; 129 | fn "read_channel" read_channel64_i32 ( &mut channel64_i32 ) -> [i32 * 64 ]; 130 | fn "read_channel" read_channel128_i32( &mut channel128_i32) -> [i32 * 128]; 131 | 132 | fn "write_channel" write_channel1_i32 ( &mut channel1_i32, [i32 * 1 ] )-> (); 133 | fn "write_channel" write_channel2_i32 ( &mut channel2_i32, [i32 * 2 ] ) -> (); 134 | fn "write_channel" write_channel4_i32 ( &mut channel4_i32, [i32 * 4 ] ) -> (); 135 | fn "write_channel" write_channel8_i32 ( &mut channel8_i32, [i32 * 8 ] ) -> (); 136 | fn "write_channel" write_channel16_i32 ( &mut channel16_i32, [i32 * 16 ] ) -> (); 137 | fn "write_channel" write_channel32_i32 ( &mut channel32_i32, [i32 * 32 ] ) -> (); 138 | fn "write_channel" write_channel64_i32 ( &mut channel64_i32, [i32 * 64 ] ) -> (); 139 | fn "write_channel" write_channel128_i32( &mut channel128_i32, [i32 * 128]) -> (); 140 | fn " " bitcast_channel_i32 ( &mut channel1_i32) -> [i32 * 2 ]; 141 | 142 | // f32 scalar 143 | fn "read_channel" read_channel_f32 ( &mut channel_f32 ) -> f32; 144 | fn "write_channel" write_channel_f32 ( &mut channel_f32, f32 ) -> (); 145 | 146 | // f32 array 147 | fn "read_channel" read_channel1_f32 ( &mut channel1_f32 ) -> [f32 * 1 ]; 148 | fn "read_channel" read_channel2_f32 ( &mut channel2_f32 ) -> [f32 * 2 ]; 149 | fn "read_channel" read_channel4_f32 ( &mut channel4_f32 ) -> [f32 * 4 ]; 150 | fn "read_channel" read_channel8_f32 ( &mut channel8_f32 ) -> [f32 * 8 ]; 151 | fn "read_channel" read_channel16_f32 ( &mut channel16_f32 ) -> [f32 * 16 ]; 152 | fn "read_channel" read_channel32_f32 ( &mut channel32_f32 ) -> [f32 * 32 ]; 153 | fn "read_channel" read_channel64_f32 ( &mut channel64_f32 ) -> [f32 * 64 ]; 154 | fn "read_channel" read_channel128_f32( &mut channel128_f32) -> [f32 * 128]; 155 | 156 | fn "write_channel" 
write_channel1_f32 ( &mut channel1_f32, [f32 * 1 ]) -> (); 157 | fn "write_channel" write_channel2_f32 ( &mut channel2_f32, [f32 * 2 ]) -> (); 158 | fn "write_channel" write_channel4_f32 ( &mut channel4_f32, [f32 * 4 ]) -> (); 159 | fn "write_channel" write_channel8_f32 ( &mut channel8_f32, [f32 * 8 ]) -> (); 160 | fn "write_channel" write_channel16_f32 ( &mut channel16_f32, [f32 * 16 ]) -> (); 161 | fn "write_channel" write_channel32_f32 ( &mut channel32_f32, [f32 * 32 ]) -> (); 162 | fn "write_channel" write_channel64_f32 ( &mut channel64_f32, [f32 * 64 ]) -> (); 163 | fn "write_channel" write_channel128_f32( &mut channel128_f32, [f32 * 128]) -> (); 164 | fn " " bitcast_channel_f32 ( &mut channel1_f32) -> [f32 * 2 ]; 165 | } 166 | 167 | fn @hls_accelerator(dev: i32) -> Accelerator { 168 | Accelerator { 169 | exec : @|grid, block, body| { 170 | let work_item = WorkItem { 171 | tidx : @|| 0, tidy : @|| 0, tidz : @|| 0, 172 | bidx : @|| 0, bidy : @|| 0, bidz : @|| 0, 173 | gidx : @|| 0, gidy : @|| 0, gidz : @|| 0, 174 | bdimx : @|| 1, bdimy : @|| 1, bdimz : @|| 1, 175 | gdimx : @|| 1, gdimy : @|| 1, gdimz : @|| 1, 176 | nblkx : @|| 1, nblky : @|| 1, nblkz : @|| 1 177 | }; 178 | hls(dev, || @@body(work_item)); 179 | }, 180 | sync : @|| synchronize_hls(dev), 181 | alloc : @|size| alloc_hls(dev, size), 182 | alloc_unified : @|size| alloc_hls_unified(dev, size), 183 | barrier : @|| () 184 | } 185 | }; 186 | 187 | static hls_intrinsics = Intrinsics { 188 | expf : hls_expf, 189 | exp2f : hls_exp2f, 190 | logf : hls_logf, 191 | log2f : hls_log2f, 192 | powf : hls_powf, 193 | rsqrtf : hls_rsqrtf, 194 | sqrtf : hls_sqrtf, 195 | fabsf : hls_fabsf, 196 | sinf : hls_sinf, 197 | cosf : hls_cosf, 198 | tanf : hls_tanf, 199 | asinf : hls_asinf, 200 | acosf : hls_acosf, 201 | atanf : hls_atanf, 202 | erff : hls_erff, 203 | atan2f : hls_atan2f, 204 | copysignf : hls_copysignf, 205 | fmaf : hls_fmaf, 206 | fmaxf : hls_fmaxf, 207 | fminf : hls_fminf, 208 | fmodf : hls_fmodf, 209 | floorf : hls_floorf, 210 | isinff : hls_isinff, 211 | isnanf : hls_isnanf, 212 | isfinitef : hls_isfinitef, 213 | exp : hls_exp, 214 | exp2 : hls_exp2, 215 | log : hls_log, 216 | log2 : hls_log2, 217 | pow : hls_pow, 218 | rsqrt : hls_rsqrt, 219 | sqrt : hls_sqrt, 220 | fabs : hls_fabs, 221 | sin : hls_sin, 222 | cos : hls_cos, 223 | tan : hls_tan, 224 | asin : hls_asin, 225 | acos : hls_acos, 226 | atan : hls_atan, 227 | erf : hls_erf, 228 | atan2 : hls_atan2, 229 | copysign : hls_copysign, 230 | fma : hls_fma, 231 | fmax : hls_fmax, 232 | fmin : hls_fmin, 233 | fmod : hls_fmod, 234 | floor : hls_floor, 235 | isinf : hls_isinf, 236 | isnan : hls_isnan, 237 | isfinite : hls_isfinite, 238 | min : hls_min, 239 | max : hls_max, 240 | }; 241 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_opencl.impala: -------------------------------------------------------------------------------- 1 | extern "device" { 2 | // no declarations are emitted for "device" functions 3 | fn "barrier" opencl_barrier(u32) -> (); 4 | fn "exp" opencl_expf(f32) -> f32; 5 | fn "exp2" opencl_exp2f(f32) -> f32; 6 | fn "log" opencl_logf(f32) -> f32; 7 | fn "log2" opencl_log2f(f32) -> f32; 8 | fn "pow" opencl_powf(f32, f32) -> f32; 9 | fn "rsqrt" opencl_rsqrtf(f32) -> f32; 10 | fn "sqrt" opencl_sqrtf(f32) -> f32; 11 | fn "fabs" opencl_fabsf(f32) -> f32; 12 | fn "sin" opencl_sinf(f32) -> f32; 13 | fn "cos" opencl_cosf(f32) -> f32; 14 | fn "tan" opencl_tanf(f32) -> f32; 15 | fn "asin" opencl_asinf(f32) -> 
f32; 16 | fn "acos" opencl_acosf(f32) -> f32; 17 | fn "atan" opencl_atanf(f32) -> f32; 18 | fn "erf" opencl_erff(f32) -> f32; 19 | fn "atan2" opencl_atan2f(f32, f32) -> f32; 20 | fn "fmod" opencl_fmodf(f32, f32) -> f32; 21 | fn "floor" opencl_floorf(f32) -> f32; 22 | fn "isinf" opencl_isinff(f32) -> i32; 23 | fn "isnan" opencl_isnanf(f32) -> i32; 24 | fn "isfinite" opencl_isfinitef(f32) -> i32; 25 | fn "fma" opencl_fmaf(f32, f32, f32) -> f32; 26 | fn "mad" opencl_madf(f32, f32, f32) -> f32; 27 | fn "copysign" opencl_copysignf(f32, f32) -> f32; 28 | fn "exp" opencl_exp(f64) -> f64; 29 | fn "exp2" opencl_exp2(f64) -> f64; 30 | fn "log" opencl_log(f64) -> f64; 31 | fn "log2" opencl_log2(f64) -> f64; 32 | fn "pow" opencl_pow(f64, f64) -> f64; 33 | fn "rsqrt" opencl_rsqrt(f64) -> f64; 34 | fn "sqrt" opencl_sqrt(f64) -> f64; 35 | fn "fabs" opencl_fabs(f64) -> f64; 36 | fn "sin" opencl_sin(f64) -> f64; 37 | fn "cos" opencl_cos(f64) -> f64; 38 | fn "tan" opencl_tan(f64) -> f64; 39 | fn "asin" opencl_asin(f64) -> f64; 40 | fn "acos" opencl_acos(f64) -> f64; 41 | fn "atan" opencl_atan(f64) -> f64; 42 | fn "erf" opencl_erf(f64) -> f64; 43 | fn "atan2" opencl_atan2(f64, f64) -> f64; 44 | fn "fmod" opencl_fmod(f64, f64) -> f64; 45 | fn "floor" opencl_floor(f64) -> f64; 46 | fn "isinf" opencl_isinf(f64) -> i32; 47 | fn "isnan" opencl_isnan(f64) -> i32; 48 | fn "isfinite" opencl_isfinite(f64) -> i32; 49 | fn "fma" opencl_fma(f64, f64, f64) -> f64; 50 | fn "mad" opencl_mad(f64, f64, f64) -> f64; 51 | fn "copysign" opencl_copysign(f64, f64) -> f64; 52 | fn "fmin" opencl_fminf(f32, f32) -> f32; 53 | fn "fmax" opencl_fmaxf(f32, f32) -> f32; 54 | fn "fmin" opencl_fmin(f64, f64) -> f64; 55 | fn "fmax" opencl_fmax(f64, f64) -> f64; 56 | fn "min" opencl_min(i32, i32) -> i32; 57 | fn "max" opencl_max(i32, i32) -> i32; 58 | fn "atomic_add" opencl_atomic_add_global(&mut[1]i32, i32) -> i32; 59 | fn "atomic_add" opencl_atomic_add_shared(&mut[3]i32, i32) -> i32; 60 | fn "atomic_min" opencl_atomic_min_global(&mut[1]i32, i32) -> i32; 61 | fn "atomic_min" opencl_atomic_min_shared(&mut[3]i32, i32) -> i32; 62 | fn "get_work_dim" opencl_get_work_dim() -> u32; 63 | fn "get_global_size" opencl_get_global_size(u32) -> u64; 64 | fn "get_global_id" opencl_get_global_id(u32) -> u64; 65 | fn "get_local_size" opencl_get_local_size(u32) -> u64; 66 | fn "get_local_id" opencl_get_local_id(u32) -> u64; 67 | fn "get_num_groups" opencl_get_num_groups(u32) -> u64; 68 | fn "get_group_id" opencl_get_group_id(u32) -> u64; 69 | fn "get_global_offset" opencl_get_global_offset(u32) -> u64; 70 | } 71 | 72 | fn @opencl_accelerator(dev: i32) -> Accelerator { 73 | Accelerator { 74 | exec : @|grid, block, body| { 75 | let work_item = WorkItem { 76 | tidx : @|| opencl_get_local_id(0u32) as i32, 77 | tidy : @|| opencl_get_local_id(1u32) as i32, 78 | tidz : @|| opencl_get_local_id(2u32) as i32, 79 | bidx : @|| opencl_get_group_id(0u32) as i32, 80 | bidy : @|| opencl_get_group_id(1u32) as i32, 81 | bidz : @|| opencl_get_group_id(2u32) as i32, 82 | gidx : @|| opencl_get_global_id(0u32) as i32, 83 | gidy : @|| opencl_get_global_id(1u32) as i32, 84 | gidz : @|| opencl_get_global_id(2u32) as i32, 85 | bdimx : @|| opencl_get_local_size(0u32) as i32, 86 | bdimy : @|| opencl_get_local_size(1u32) as i32, 87 | bdimz : @|| opencl_get_local_size(2u32) as i32, 88 | gdimx : @|| opencl_get_global_size(0u32) as i32, 89 | gdimy : @|| opencl_get_global_size(1u32) as i32, 90 | gdimz : @|| opencl_get_global_size(2u32) as i32, 91 | nblkx : @|| opencl_get_num_groups(0u32) 
as i32, 92 | nblky : @|| opencl_get_num_groups(1u32) as i32, 93 | nblkz : @|| opencl_get_num_groups(2u32) as i32 94 | }; 95 | opencl(dev, grid, block, || @@body(work_item)) 96 | }, 97 | sync : @|| synchronize_opencl(dev), 98 | alloc : @|size| alloc_opencl(dev, size), 99 | alloc_unified : @|size| alloc_opencl_unified(dev, size), 100 | barrier : @|| opencl_barrier(1u32), // CLK_LOCAL_MEM_FENCE -> 1 // CLK_GLOBAL_MEM_FENCE -> 2 101 | } 102 | } 103 | 104 | static opencl_intrinsics = Intrinsics { 105 | expf : opencl_expf, 106 | exp2f : opencl_exp2f, 107 | logf : opencl_logf, 108 | log2f : opencl_log2f, 109 | powf : opencl_powf, 110 | rsqrtf : opencl_rsqrtf, 111 | sqrtf : opencl_sqrtf, 112 | fabsf : opencl_fabsf, 113 | sinf : opencl_sinf, 114 | cosf : opencl_cosf, 115 | tanf : opencl_tanf, 116 | asinf : opencl_asinf, 117 | acosf : opencl_acosf, 118 | atanf : opencl_atanf, 119 | erff : opencl_erff, 120 | atan2f : opencl_atan2f, 121 | copysignf : opencl_copysignf, 122 | fmaf : opencl_fmaf, 123 | fmaxf : opencl_fmaxf, 124 | fminf : opencl_fminf, 125 | fmodf : opencl_fmodf, 126 | floorf : opencl_floorf, 127 | isinff : opencl_isinff, 128 | isnanf : opencl_isnanf, 129 | isfinitef : opencl_isfinitef, 130 | exp : opencl_exp, 131 | exp2 : opencl_exp2, 132 | log : opencl_log, 133 | log2 : opencl_log2, 134 | pow : opencl_pow, 135 | rsqrt : opencl_rsqrt, 136 | sqrt : opencl_sqrt, 137 | fabs : opencl_fabs, 138 | sin : opencl_sin, 139 | cos : opencl_cos, 140 | tan : opencl_tan, 141 | asin : opencl_asin, 142 | acos : opencl_acos, 143 | atan : opencl_atan, 144 | erf : opencl_erf, 145 | atan2 : opencl_atan2, 146 | copysign : opencl_copysign, 147 | fma : opencl_fma, 148 | fmax : opencl_fmax, 149 | fmin : opencl_fmin, 150 | fmod : opencl_fmod, 151 | floor : opencl_floor, 152 | isinf : opencl_isinf, 153 | isnan : opencl_isnan, 154 | isfinite : opencl_isfinite, 155 | min : opencl_min, 156 | max : opencl_max, 157 | }; 158 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_rv.impala: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | fn rv_mask() -> bool; 3 | fn rv_any(bool) -> bool; 4 | fn rv_all(bool) -> bool; 5 | fn rv_ballot(bool) -> i32; 6 | fn rv_extract(f32, i32) -> f32; 7 | fn rv_insert(f32, i32, f32) -> f32; 8 | fn rv_load(&f32, i32) -> f32; 9 | fn rv_store(&mut f32, i32, f32) -> (); 10 | fn rv_shuffle(f32, i32) -> f32; 11 | fn rv_align(&i8, i32)-> &i8; 12 | fn rv_compact(f32, bool) -> f32; 13 | fn rv_lane_id() -> i32; 14 | fn rv_num_lanes() -> i32; 15 | } 16 | -------------------------------------------------------------------------------- /platforms/impala/intrinsics_thorin.impala: -------------------------------------------------------------------------------- 1 | extern "thorin" { 2 | fn pe_info[T](&[u8], T) -> (); 3 | 4 | fn alignof[T]() -> i64; 5 | fn sizeof[T]() -> i64; 6 | fn undef[T]() -> T; 7 | 8 | fn bitcast[D, S](S) -> D; 9 | fn select[T, U](T, U, U) -> U; 10 | fn insert[T, U](T, i32, U) -> T; 11 | //fn shuffle[T](T, T, T) -> T; 12 | 13 | fn cuda(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 14 | fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 15 | fn opencl(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 16 | fn amdgpu_hsa(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 17 | fn amdgpu_pal(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 18 | fn reserve_shared[T](i32) -> &mut[3][T]; 19 | 20 | fn hls(dev: i32, body: fn() -> ()) -> 
(); 21 | fn pipeline(i32, i32, i32, fn(i32) -> ()) -> (); // only for HLS/OpenCL backend 22 | fn parallel(num_threads: i32, lower: i32, upper: i32, body: fn(i32) -> ()) -> (); 23 | fn spawn(body: fn() -> ()) -> i32; 24 | fn sync(id: i32) -> (); 25 | 26 | fn atomic[T](binop: u32, addr: &mut T, val: T, order: u32, scope: &[u8]) -> T; // Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub 27 | fn atomic_load[T](addr: &T, order: u32, scope: &[u8]) -> T; 28 | fn atomic_store[T](addr: &mut T, val: T, order: u32, scope: &[u8]) -> (); 29 | fn cmpxchg[T](addr: &mut T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); // only for integer data types 30 | fn cmpxchg_weak[T](addr: &mut T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); // only for integer data types 31 | fn fence(order: u32, scope: &[u8]) -> (); 32 | 33 | fn "atomic" atomic_p1[T](binop: u32, addr: &mut [1]T, val: T, order: u32, scope: &[u8]) -> T; 34 | fn "atomic" atomic_p3[T](binop: u32, addr: &mut [3]T, val: T, order: u32, scope: &[u8]) -> T; 35 | fn "atomic_load" atomic_load_p1[T](addr: &[1]T, order: u32, scope: &[u8]) -> T; 36 | fn "atomic_load" atomic_load_p3[T](addr: &[3]T, order: u32, scope: &[u8]) -> T; 37 | fn "atomic_store" atomic_store_p1[T](addr: &mut [1]T, val: T, order: u32, scope: &[u8]) -> (); 38 | fn "atomic_store" atomic_store_p3[T](addr: &mut [3]T, val: T, order: u32, scope: &[u8]) -> (); 39 | fn "cmpxchg" cmpxchg_p1[T](addr: &mut [1]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); 40 | fn "cmpxchg" cmpxchg_p3[T](addr: &mut [3]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); 41 | fn "cmpxchg_weak" cmpxchg_weak_p1[T](addr: &mut [1]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); 42 | fn "cmpxchg_weak" cmpxchg_weak_p3[T](addr: &mut [3]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); 43 | 44 | fn vectorize(vector_length: i32, body: fn(i32) -> ()) -> (); 45 | } 46 | -------------------------------------------------------------------------------- /platforms/impala/runtime.impala: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | fn "anydsl_info" runtime_info() -> (); 3 | fn "anydsl_device_name" runtime_device_name(_device: i32) -> &[u8]; 4 | fn "anydsl_device_check_feature_support" runtime_device_check_feature_support(_device: i32, _feature: &[u8]) -> bool; 5 | 6 | fn "anydsl_alloc" runtime_alloc(i32, i64) -> &[i8]; 7 | fn "anydsl_alloc_host" runtime_alloc_host(i32, i64) -> &[i8]; 8 | fn "anydsl_alloc_unified" runtime_alloc_unified(i32, i64) -> &[i8]; 9 | fn "anydsl_copy" runtime_copy(i32, &[i8], i64, i32, &[i8], i64, i64) -> (); 10 | fn "anydsl_get_device_ptr" runtime_get_device_ptr(i32, &[i8]) -> &[i8]; 11 | fn "anydsl_release" runtime_release(i32, &[i8]) -> (); 12 | fn "anydsl_release_host" runtime_release_host(i32, &[i8]) -> (); 13 | fn "anydsl_synchronize" runtime_synchronize(i32) -> (); 14 | 15 | fn "anydsl_random_seed" random_seed(u32) -> (); 16 | fn "anydsl_random_val_f32" random_val_f32() -> f32; 17 | fn "anydsl_random_val_u64" random_val_u64() -> u64; 18 | 19 | fn "anydsl_get_micro_time" get_micro_time() -> i64; 20 | fn "anydsl_get_nano_time" get_nano_time() -> i64; 21 | fn "anydsl_get_kernel_time" get_kernel_time() -> i64; 22 | 23 | fn "anydsl_print_i16" print_i16(i16) -> (); 24 | fn "anydsl_print_i32" print_i32(i32) -> (); 25 | fn 
"anydsl_print_i64" print_i64(i64) -> (); 26 | fn "anydsl_print_u16" print_u16(u16) -> (); 27 | fn "anydsl_print_u32" print_u32(u32) -> (); 28 | fn "anydsl_print_u64" print_u64(u64) -> (); 29 | fn "anydsl_print_f32" print_f32(f32) -> (); 30 | fn "anydsl_print_f64" print_f64(f64) -> (); 31 | fn "anydsl_print_char" print_char(u8) -> (); 32 | fn "anydsl_print_string" print_string(&[u8]) -> (); 33 | fn "anydsl_print_flush" print_flush() -> (); 34 | } 35 | 36 | struct Buffer { 37 | data : &[i8], 38 | size : i64, 39 | device : i32 40 | } 41 | 42 | fn @alloc(dev: i32, size: i64) -> Buffer { 43 | Buffer { 44 | device : dev, 45 | data : runtime_alloc(dev, size), 46 | size : size 47 | } 48 | } 49 | fn @alloc_host(dev: i32, size: i64) -> Buffer { 50 | Buffer { 51 | device : dev, 52 | data : runtime_alloc_host(dev, size), 53 | size : size 54 | } 55 | } 56 | fn @alloc_unified(dev: i32, size: i64) -> Buffer { 57 | Buffer { 58 | device : dev, 59 | data : runtime_alloc_unified(dev, size), 60 | size : size 61 | } 62 | } 63 | fn @release(buf: Buffer) -> () { runtime_release(buf.device, buf.data) } 64 | 65 | fn @runtime_device(platform: i32, device: i32) -> i32 { platform | (device << 4) } 66 | 67 | fn @alloc_cpu(size: i64) -> Buffer { alloc(0, size) } 68 | fn @alloc_cuda(dev: i32, size: i64) -> Buffer { alloc(runtime_device(1, dev), size) } 69 | fn @alloc_cuda_host(dev: i32, size: i64) -> Buffer { alloc_host(runtime_device(1, dev), size) } 70 | fn @alloc_cuda_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(1, dev), size) } 71 | fn @synchronize_cuda(dev: i32) -> () { runtime_synchronize(runtime_device(1, dev)) } 72 | fn @alloc_opencl(dev: i32, size: i64) -> Buffer { alloc(runtime_device(2, dev), size) } 73 | fn @alloc_opencl_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(2, dev), size) } 74 | fn @synchronize_opencl(dev: i32) -> () { runtime_synchronize(runtime_device(2, dev)) } 75 | fn @alloc_hls(dev: i32, size: i64) -> Buffer { alloc(runtime_device(2, dev), size) } 76 | fn @alloc_hls_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(2, dev), size) } 77 | fn @synchronize_hls(dev: i32) -> () { runtime_synchronize(runtime_device(2, dev)) } 78 | fn @alloc_hsa(dev: i32, size: i64) -> Buffer { alloc(runtime_device(3, dev), size) } 79 | fn @alloc_hsa_host(dev: i32, size: i64) -> Buffer { alloc_host(runtime_device(3, dev), size) } 80 | fn @alloc_hsa_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(3, dev), size) } 81 | fn @synchronize_hsa(dev: i32) -> () { runtime_synchronize(runtime_device(3, dev)) } 82 | fn @alloc_pal(dev: i32, size: i64) -> Buffer { alloc(runtime_device(4, dev), size) } 83 | fn @alloc_pal_host(dev: i32, size: i64) -> Buffer { alloc_host(runtime_device(4, dev), size) } 84 | fn @alloc_pal_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(4, dev), size) } 85 | fn @synchronize_pal(dev: i32) -> () { runtime_synchronize(runtime_device(4, dev)) } 86 | 87 | fn @copy(src: Buffer, dst: Buffer) -> () { 88 | runtime_copy(src.device, src.data, 0i64, dst.device, dst.data, 0i64, src.size) 89 | } 90 | 91 | fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) -> () { 92 | runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size) 93 | } 94 | 95 | 96 | // range, range_step, unroll, unroll_step, etc. 
97 | fn @(?lower & ?upper & ?step) unroll_step(lower: i32, upper: i32, @step: i32, body: fn(i32) -> ()) -> () { 98 | if lower < upper { 99 | @@body(lower); 100 | unroll_step(lower+step, upper, step, body) 101 | } 102 | } 103 | 104 | fn @(?upper & ?lower & ?step) unroll_step_rev(upper: i32, lower: i32, @step: i32, body: fn(i32) -> ()) -> () { 105 | if upper > lower { 106 | @@body(upper); 107 | unroll_step_rev(upper-step, lower, step, body) 108 | } 109 | } 110 | 111 | fn @range(lower: i32, upper: i32, body: fn(i32) -> ()) -> () { unroll_step($lower, $upper, 1, body) } 112 | fn @range_step(lower: i32, upper: i32, step: i32, body: fn(i32) -> ()) -> () { unroll_step($lower, $upper, step, body) } 113 | fn @range_rev(upper: i32, lower: i32, body: fn(i32) -> ()) -> () { unroll_step_rev($upper, $lower, 1, body) } 114 | 115 | fn @unroll(lower: i32, upper: i32, body: fn(i32) -> ()) -> () { unroll_step(lower, upper, 1, body) } 116 | fn @unroll_rev(upper: i32, lower: i32, body: fn(i32) -> ()) -> () { unroll_step_rev(upper, lower, 1, body) } 117 | -------------------------------------------------------------------------------- /post-patcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys, re, os 3 | basename = sys.argv[1] 4 | def patch_llvmir(rttype): 5 | # we need to patch 6 | result = [] 7 | filename = basename+"."+rttype 8 | if os.path.isfile(filename): 9 | with open(filename) as f: 10 | for line in f: 11 | if rttype=="amdgpu" or rttype=="nvvm" or rttype=="ll": 12 | # patch to opaque identity functions 13 | m = re.match(r'^declare (.*) @(magic_.*_id)\((.*)\) (?:local_)?unnamed_addr(?: #[0-9]+)?\n$', line) 14 | if m is not None: 15 | ty1, fname, ty2 = m.groups() 16 | assert ty1 == ty2, "Argument and return types of magic IDs must match" 17 | print("Patching magic ID {0} in {1}".format(fname, filename)) 18 | # emit definition instead 19 | result.append('define {0} @{1}({0} %name) {{\n'.format(ty1, fname)) 20 | result.append(' ret {0} %name\n'.format(ty1)) 21 | result.append('}\n') 22 | continue 23 | 24 | result.append(line) 25 | # we have the patched thing, write it 26 | with open(filename, "w") as f: 27 | for line in result: 28 | f.write(line) 29 | return 30 | 31 | def patch_cfiles(rttype): 32 | # we need to patch 33 | channel_line = {} 34 | channel_type = {} 35 | result = [] 36 | channel_decl_name = None 37 | channel_decl_type = None 38 | channel_decl_line = 0 39 | if rttype == "cuda": 40 | filename = basename+"."+"cu" 41 | elif rttype == "opencl": 42 | filename = basename+"."+"cl" 43 | elif rttype == "hls": 44 | filename = basename+"."+"hls" 45 | 46 | if os.path.isfile(filename): 47 | with open(filename) as f: 48 | for line in f: 49 | # patch to opaque identity functions 50 | m = re.match(r'^(.*) = (magic_.*_id)\((.*)\);\n$', line) 51 | if m is not None: 52 | lhs, fname, arg = m.groups() 53 | print("Patching magic ID {0} in {1}".format(fname, filename)) 54 | # emit definition instead 55 | result.append('{0} = {1};\n'.format(lhs, arg)) 56 | else: 57 | result.append(line) 58 | 59 | # we have the patched thing, write it 60 | with open(filename, "w") as f: 61 | for line in result: 62 | f.write(line) 63 | return 64 | 65 | def patch_defs(rttype): 66 | nvvm_defs = { 67 | } 68 | 69 | if rttype == "nvvm": 70 | result = [] 71 | filename = basename+".nvvm" 72 | if os.path.isfile(filename): 73 | with open(filename) as f: 74 | for line in f: 75 | matched = False 76 | 77 | for (func, code) in iter(nvvm_defs.items()): 78 | m = 
re.match(r'^declare (.*) (@' + func + r')\((.*)\)\n$', line)
79 |                         if m is not None:
80 |                             result.append(code)
81 |                             matched = True
82 |                             break
83 | 
84 |                     if not matched:
85 |                         result.append(line)
86 | 
87 |         with open(filename, "w") as f:
88 |             for line in result:
89 |                 f.write(line)
90 |     return
91 | 
92 | patch_llvmir("ll")
93 | patch_llvmir("amdgpu")
94 | patch_llvmir("nvvm")
95 | patch_cfiles("cuda")
96 | patch_cfiles("opencl")
97 | patch_cfiles("hls")
98 | patch_defs("nvvm")
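# For reference, the magic-ID patching performed by patch_llvmir above turns
# an opaque declaration in the generated IR into an identity definition.
# A sketch of the rewrite (illustrative IR, not taken from a real build):
#
#   declare float @magic_f32_id(float)           ; before
#
#   define float @magic_f32_id(float %name) {    ; after
#     ret float %name
#   }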
99 | -------------------------------------------------------------------------------- /src/anydsl_jit.h: --------------------------------------------------------------------------------
1 | #ifndef ANYDSL_JIT_H
2 | #define ANYDSL_JIT_H
3 | 
4 | #include <stdint.h>
5 | 
6 | #include "anydsl_runtime_config.h"
7 | 
8 | class Runtime;
9 | 
10 | AnyDSL_runtime_API Runtime& runtime();
11 | 
12 | #ifdef AnyDSL_runtime_HAS_JIT_SUPPORT
13 | AnyDSL_runtime_jit_API void anydsl_set_cache_directory(const char*);
14 | AnyDSL_runtime_jit_API const char* anydsl_get_cache_directory();
15 | AnyDSL_runtime_jit_API void anydsl_link(const char*);
16 | AnyDSL_runtime_jit_API int32_t anydsl_compile(const char*, uint32_t, uint32_t);
17 | AnyDSL_runtime_jit_API void *anydsl_lookup_function(int32_t, const char*);
18 | AnyDSL_runtime_jit_API void anydsl_set_log_level(uint32_t /* log level (4=error only, 3=warn, 2=info, 1=verbose, 0=debug) */);
19 | #endif
20 | 
21 | #endif
22 | -------------------------------------------------------------------------------- /src/anydsl_runtime.cpp: --------------------------------------------------------------------------------
1 | #include <cmath>
2 | #include <cstdlib>
3 | #include <iostream>
4 | #include <random>
5 | #include <sstream>
6 | 
7 | #include "anydsl_runtime.h"
8 | // Make sure the definition for runtime() matches
9 | // the declaration in anydsl_jit.h
10 | #include "anydsl_jit.h"
11 | 
12 | #include "runtime.h"
13 | #include "platform.h"
14 | #include "dummy_platform.h"
15 | #include "cpu_platform.h"
16 | 
17 | #ifdef AnyDSL_runtime_HAS_TBB_SUPPORT
18 | #define NOMINMAX
19 | #include <tbb/concurrent_queue.h>
20 | #include <tbb/concurrent_unordered_map.h>
21 | #include <tbb/parallel_for.h>
22 | #include <tbb/task_arena.h>
23 | #include <tbb/task_group.h>
24 | #else
25 | #include <thread>
26 | #endif
27 | 
28 | struct RuntimeSingleton {
29 |     Runtime runtime;
30 | 
31 |     RuntimeSingleton()
32 |         : runtime(detect_profile_level())
33 |     {
34 |         runtime.register_platform<CpuPlatform>();
35 |         register_cuda_platform(&runtime);
36 |         register_opencl_platform(&runtime);
37 |         register_hsa_platform(&runtime);
38 |         register_pal_platform(&runtime);
39 |         register_levelzero_platform(&runtime);
40 |     }
41 | 
42 |     static std::pair<ProfileLevel, ProfileLevel> detect_profile_level() {
43 |         auto profile = std::make_pair(ProfileLevel::None, ProfileLevel::None);
44 |         const char* env_var = std::getenv("ANYDSL_PROFILE");
45 |         if (env_var) {
46 |             std::string env_str = env_var;
47 |             for (auto& c: env_str)
48 |                 c = std::toupper(c, std::locale());
49 |             std::stringstream profile_levels(env_str);
50 |             std::string level;
51 |             while (profile_levels >> level) {
52 |                 if (level == "FULL")
53 |                     profile.first = ProfileLevel::Full;
54 |                 else if (level == "FPGA_DYNAMIC")
55 |                     profile.second = ProfileLevel::Fpga_dynamic;
56 |             }
57 |         }
58 |         return profile;
59 |     }
60 | };
61 | 
62 | Runtime& runtime() {
63 |     static RuntimeSingleton singleton;
64 |     return singleton.runtime;
65 | }
66 | 
67 | inline PlatformId to_platform(int32_t m) {
68 |     return PlatformId(m & 0x0F);
69 | }
70 | 
71 | inline DeviceId to_device(int32_t m) {
72 |     return DeviceId(m >> 4);
73 | }
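// Worked example of the device-mask encoding (it mirrors the ANYDSL_DEVICE
// macro in anydsl_runtime.h): the low 4 bits select the platform, the
// remaining bits the device, so CUDA (platform 1) device 2 is encoded as
// mask = 1 | (2 << 4) == 33, and decoding gives
// to_platform(33) == PlatformId(1) and to_device(33) == DeviceId(2).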
74 | 
75 | void anydsl_info(void) {
76 |     runtime().display_info();
77 | }
78 | 
79 | const char* anydsl_device_name(int32_t mask) {
80 |     return runtime().device_name(to_platform(mask), to_device(mask));
81 | }
82 | 
83 | bool anydsl_device_check_feature_support(int32_t mask, const char* feature) {
84 |     return runtime().device_check_feature_support(to_platform(mask), to_device(mask), feature);
85 | }
86 | 
87 | void* anydsl_alloc(int32_t mask, int64_t size) {
88 |     return runtime().alloc(to_platform(mask), to_device(mask), size);
89 | }
90 | 
91 | void* anydsl_alloc_host(int32_t mask, int64_t size) {
92 |     return runtime().alloc_host(to_platform(mask), to_device(mask), size);
93 | }
94 | 
95 | void* anydsl_alloc_unified(int32_t mask, int64_t size) {
96 |     return runtime().alloc_unified(to_platform(mask), to_device(mask), size);
97 | }
98 | 
99 | void* anydsl_get_device_ptr(int32_t mask, void* ptr) {
100 |     return runtime().get_device_ptr(to_platform(mask), to_device(mask), ptr);
101 | }
102 | 
103 | void anydsl_release(int32_t mask, void* ptr) {
104 |     runtime().release(to_platform(mask), to_device(mask), ptr);
105 | }
106 | 
107 | void anydsl_release_host(int32_t mask, void* ptr) {
108 |     runtime().release_host(to_platform(mask), to_device(mask), ptr);
109 | }
110 | 
111 | void anydsl_copy(
112 |     int32_t mask_src, const void* src, int64_t offset_src,
113 |     int32_t mask_dst, void* dst, int64_t offset_dst, int64_t size) {
114 |     runtime().copy(
115 |         to_platform(mask_src), to_device(mask_src), src, offset_src,
116 |         to_platform(mask_dst), to_device(mask_dst), dst, offset_dst, size);
117 | }
118 | 
119 | void anydsl_launch_kernel(
120 |     int32_t mask, const char* file_name, const char* kernel_name,
121 |     const uint32_t* grid, const uint32_t* block,
122 |     void** arg_data,
123 |     const uint32_t* arg_sizes,
124 |     const uint32_t* arg_aligns,
125 |     const uint32_t* arg_alloc_sizes,
126 |     const uint8_t* arg_types,
127 |     uint32_t num_args) {
128 |     LaunchParams launch_params = {
129 |         file_name,
130 |         kernel_name,
131 |         grid,
132 |         block,
133 |         {
134 |             arg_data,
135 |             arg_sizes,
136 |             arg_aligns,
137 |             arg_alloc_sizes,
138 |             reinterpret_cast<const KernelArgType*>(arg_types),
139 |         },
140 |         num_args
141 |     };
142 |     runtime().launch_kernel(to_platform(mask), to_device(mask), launch_params);
143 | }
144 | 
145 | void anydsl_synchronize(int32_t mask) {
146 |     runtime().synchronize(to_platform(mask), to_device(mask));
147 | }
148 | 
149 | uint64_t anydsl_get_micro_time() {
150 |     using namespace std::chrono;
151 |     return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
152 | }
153 | 
154 | uint64_t anydsl_get_nano_time() {
155 |     using namespace std::chrono;
156 |     return duration_cast<nanoseconds>(steady_clock::now().time_since_epoch()).count();
157 | }
158 | 
159 | uint64_t anydsl_get_kernel_time() {
160 |     return runtime().kernel_time().load();
161 | }
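// Minimal host-side timing sketch using the hooks above (illustrative only;
// launch_my_kernel and mask are hypothetical):
//
//   uint64_t t0 = anydsl_get_micro_time();
//   launch_my_kernel();
//   anydsl_synchronize(mask);
//   uint64_t host_us = anydsl_get_micro_time() - t0;
//   uint64_t kernel_time = anydsl_get_kernel_time();  // accumulated across launches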
247 | 
248 | int32_t anydsl_spawn_thread(void* args, void* fun) {
249 |     std::lock_guard<std::mutex> lock(thread_lock);
250 | 
251 |     int32_t (*fun_ptr) (void*) = reinterpret_cast<int32_t (*)(void*)>(fun);
252 | 
253 |     int32_t id;
254 |     if (free_ids.size()) {
255 |         id = free_ids.back();
256 |         free_ids.pop_back();
257 |     } else {
258 |         id = static_cast<int32_t>(thread_pool.size());
259 |     }
260 | 
261 |     auto spawned = std::make_pair(id, std::thread([=](){ fun_ptr(args); }));
262 |     thread_pool.emplace(std::move(spawned));
263 |     return id;
264 | }
265 | 
266 | void anydsl_sync_thread(int32_t id) {
267 |     auto thread = thread_pool.end();
268 |     {
269 |         std::lock_guard<std::mutex> lock(thread_lock);
270 |         thread = thread_pool.find(id);
271 |     }
272 |     if (thread != thread_pool.end()) {
273 |         thread->second.join();
274 |         {
275 |             std::lock_guard<std::mutex> lock(thread_lock);
276 |             free_ids.push_back(thread->first);
277 |             thread_pool.erase(thread);
278 |         }
279 |     } else {
280 |         assert(0 && "Trying to synchronize on invalid thread id");
281 |     }
282 | }
283 | #else // TBB version
284 | void anydsl_parallel_for(int32_t num_threads, int32_t lower, int32_t upper, void* args, void* fun) {
285 |     tbb::task_arena limited((num_threads == 0) ? tbb::task_arena::automatic : num_threads);
286 |     tbb::task_group tg;
287 | 
288 |     void (*fun_ptr) (void*, int32_t, int32_t) = reinterpret_cast<void (*)(void*, int32_t, int32_t)>(fun);
289 | 
290 |     limited.execute([&] {
291 |         tg.run([&] {
292 |             tbb::parallel_for(tbb::blocked_range<int32_t>(lower, upper),
293 |                 [=] (const tbb::blocked_range<int32_t>& range) {
294 |                     fun_ptr(args, range.begin(), range.end());
295 |                 });
296 |         });
297 |     });
298 | 
299 |     limited.execute([&] { tg.wait(); });
300 | }
301 | 
302 | typedef tbb::concurrent_unordered_map<int32_t, tbb::task_group> task_group_map;
303 | typedef std::pair<task_group_map::iterator, bool> task_group_node_ref;
304 | static task_group_map task_pool;
305 | static tbb::concurrent_queue<int32_t> free_ids;
306 | static std::mutex thread_lock;
307 | 
308 | int32_t anydsl_spawn_thread(void* args, void* fun) {
309 |     std::lock_guard<std::mutex> lock(thread_lock);
310 |     int32_t id = -1;
311 |     if (!free_ids.try_pop(id)) {
312 |         id = int32_t(task_pool.size());
313 |     }
314 | 
315 |     int32_t (*fun_ptr) (void*) = reinterpret_cast<int32_t (*)(void*)>(fun);
316 | 
317 |     assert(id >= 0);
318 | 
319 |     task_group_node_ref p = task_pool.emplace(std::piecewise_construct, std::forward_as_tuple(id), std::forward_as_tuple());
320 |     tbb::task_group& tg = p.first->second;
321 | 
322 |     tg.run([=] { fun_ptr(args); });
323 | 
324 |     return id;
325 | }
326 | 
327 | void anydsl_sync_thread(int32_t id) {
328 |     auto task = task_pool.end();
329 |     {
330 |         std::lock_guard<std::mutex> lock(thread_lock);
331 |         task = task_pool.find(id);
332 |     }
333 |     if (task != task_pool.end()) {
334 |         task->second.wait();
335 |         {
336 |             std::lock_guard<std::mutex> lock(thread_lock);
337 |             free_ids.push(task->first);
338 |         }
339 |     } else {
340 |         assert(0 && "Trying to synchronize on invalid task id");
341 |     }
342 | }
343 | #endif
344 | 
--------------------------------------------------------------------------------
/src/anydsl_runtime.h:
--------------------------------------------------------------------------------
1 | #ifndef ANYDSL_RUNTIME_H
2 | #define ANYDSL_RUNTIME_H
3 | 
4 | #include <stdint.h>
5 | #include <stdlib.h>
6 | 
7 | #include "anydsl_runtime_config.h"
8 | 
9 | #ifdef __cplusplus
10 | extern "C" {
11 | #endif
12 | 
13 | #define ANYDSL_DEVICE(p, d) ((p) | ((d) << 4))
14 | 
15 | enum {
16 |     ANYDSL_HOST = 0,
17 |     ANYDSL_CUDA = 1,
18 |     ANYDSL_OPENCL = 2,
19 |     ANYDSL_HSA = 3,
20 |     ANYDSL_PAL = 4,
21 |     ANYDSL_LEVELZERO = 5
22 | };
23 | 
24 | AnyDSL_runtime_API void anydsl_info(void);
25 | 
26 | AnyDSL_runtime_API const char* anydsl_device_name(int32_t);
27 | AnyDSL_runtime_API bool anydsl_device_check_feature_support(int32_t, const char*);
28 | 
29 | AnyDSL_runtime_API void* anydsl_alloc(int32_t, int64_t);
30 | AnyDSL_runtime_API void* anydsl_alloc_host(int32_t, int64_t);
31 | AnyDSL_runtime_API void* anydsl_alloc_unified(int32_t, int64_t);
32 | AnyDSL_runtime_API void* anydsl_get_device_ptr(int32_t, void*);
33 | AnyDSL_runtime_API void anydsl_release(int32_t, void*);
34 | AnyDSL_runtime_API void anydsl_release_host(int32_t, void*);
35 | 
36 | AnyDSL_runtime_API void anydsl_copy(int32_t, const void*, int64_t, int32_t, void*, int64_t, int64_t);
37 | 
38 | AnyDSL_runtime_API void anydsl_launch_kernel(
39 |     int32_t, const char*, const char*,
40 |     const uint32_t*, const uint32_t*,
41 |     void**, const uint32_t*, const uint32_t*, const uint32_t*, const uint8_t*,
42 |     uint32_t);
43 | AnyDSL_runtime_API void anydsl_synchronize(int32_t);
44 | 
45 | AnyDSL_runtime_API void anydsl_random_seed(uint32_t);
46 | AnyDSL_runtime_API float anydsl_random_val_f32();
47 | AnyDSL_runtime_API uint64_t anydsl_random_val_u64();
48 | 
49 | AnyDSL_runtime_API uint64_t anydsl_get_micro_time();
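/* [Editor's sketch, not part of the original header] The device-mask
 * convention consumed by the declarations above: the platform id from the
 * enum at the top of this file occupies bits 0-3, the device id starts at
 * bit 4, as defined by ANYDSL_DEVICE. */
static inline int32_t anydsl_example_mask(void) {
    return ANYDSL_DEVICE(ANYDSL_CUDA, 2); /* == 1 | (2 << 4) == 0x21 */
}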
50 | AnyDSL_runtime_API uint64_t anydsl_get_nano_time();
51 | AnyDSL_runtime_API uint64_t anydsl_get_kernel_time();
52 | 
53 | AnyDSL_runtime_API int32_t anydsl_isinff(float);
54 | AnyDSL_runtime_API int32_t anydsl_isnanf(float);
55 | AnyDSL_runtime_API int32_t anydsl_isfinitef(float);
56 | AnyDSL_runtime_API int32_t anydsl_isinf(double);
57 | AnyDSL_runtime_API int32_t anydsl_isnan(double);
58 | AnyDSL_runtime_API int32_t anydsl_isfinite(double);
59 | 
60 | AnyDSL_runtime_API void anydsl_print_i16(int16_t);
61 | AnyDSL_runtime_API void anydsl_print_i32(int32_t);
62 | AnyDSL_runtime_API void anydsl_print_i64(int64_t);
63 | AnyDSL_runtime_API void anydsl_print_u16(uint16_t);
64 | AnyDSL_runtime_API void anydsl_print_u32(uint32_t);
65 | AnyDSL_runtime_API void anydsl_print_u64(uint64_t);
66 | AnyDSL_runtime_API void anydsl_print_f32(float);
67 | AnyDSL_runtime_API void anydsl_print_f64(double);
68 | AnyDSL_runtime_API void anydsl_print_char(char);
69 | AnyDSL_runtime_API void anydsl_print_string(char*);
70 | AnyDSL_runtime_API void anydsl_print_flush();
71 | 
72 | AnyDSL_runtime_API void* anydsl_aligned_malloc(size_t, size_t);
73 | AnyDSL_runtime_API void anydsl_aligned_free(void*);
74 | 
75 | AnyDSL_runtime_API void anydsl_parallel_for(int32_t, int32_t, int32_t, void*, void*);
76 | AnyDSL_runtime_API int32_t anydsl_spawn_thread(void*, void*);
77 | AnyDSL_runtime_API void anydsl_sync_thread(int32_t);
78 | 
79 | struct AnyDSL_runtime_API Closure {
80 |     void (*fn)(uint64_t);
81 |     uint64_t payload;
82 | };
83 | 
84 | AnyDSL_runtime_API int32_t anydsl_create_graph();
85 | AnyDSL_runtime_API int32_t anydsl_create_task(int32_t, Closure);
86 | AnyDSL_runtime_API void anydsl_create_edge(int32_t, int32_t);
87 | AnyDSL_runtime_API void anydsl_execute_graph(int32_t, int32_t);
88 | 
89 | #ifdef __cplusplus
90 | }
91 | #include "anydsl_runtime.hpp"
92 | #endif
93 | 
94 | #endif
95 | 
--------------------------------------------------------------------------------
/src/anydsl_runtime.hpp:
--------------------------------------------------------------------------------
1 | #ifndef ANYDSL_RUNTIME_HPP
2 | #define ANYDSL_RUNTIME_HPP
3 | 
4 | #ifndef ANYDSL_RUNTIME_H
5 | #include "anydsl_runtime.h"
6 | #endif
7 | 
8 | namespace anydsl {
9 | 
10 | enum class Platform : int32_t {
11 |     Host = ANYDSL_HOST,
12 |     Cuda = ANYDSL_CUDA,
13 |     OpenCL = ANYDSL_OPENCL,
14 |     HSA = ANYDSL_HSA,
15 |     PAL = ANYDSL_PAL,
16 |     LevelZero = ANYDSL_LEVELZERO
17 | };
18 | 
19 | struct Device {
20 |     Device(int32_t id) : id(id) {}
21 |     int32_t id;
22 | };
23 | 
24 | inline int32_t make_device(Platform p, Device d) {
25 |     return ANYDSL_DEVICE((int32_t)p, d.id);
26 | }
27 | 
28 | template <typename T>
29 | class Array {
30 | public:
31 |     Array()
32 |         : data_(nullptr), size_(0), dev_(0)
33 |     {}
34 | 
35 |     Array(int64_t size)
36 |         : Array(Platform::Host, Device(0), size)
37 |     {}
38 | 
39 |     Array(int32_t dev, T* ptr, int64_t size)
40 |         : data_(ptr), size_(size), dev_(dev)
41 |     {}
42 | 
43 |     Array(Platform p, Device d, int64_t size)
44 |         : dev_(make_device(p, d)) {
45 |         allocate(size);
46 |     }
47 | 
48 |     Array(Array&& other)
49 |         : data_(other.data_),
50 |           size_(other.size_),
51 |           dev_(other.dev_) {
52 |         other.data_ = nullptr;
53 |     }
54 | 
55 |     Array& operator = (Array&& other) {
56 |         deallocate();
57 |         dev_ = other.dev_;
58 |         size_ = other.size_;
59 |         data_ = other.data_;
60 |         other.data_ = nullptr;
61 |         return *this;
62 |     }
63 | 
64 |     Array(const Array&) = delete;
65 |     Array& operator = (const Array&) = delete;
66 | 
67 |     ~Array() { deallocate(); }
68 | 
69 |     T* begin() { return data_; }
70 |     const T* begin() const { return data_; }
71 | 
72 |     T* end() { return data_ + size_; }
73 |     const T* end() const { return data_ + size_; }
74 | 
75 |     T* data() { return data_; }
76 |     const T* data() const { return data_; }
77 | 
78 |     int64_t size() const { return size_; }
79 |     int32_t device() const { return dev_; }
80 | 
81 |     const T& operator [] (int i) const { return data_[i]; }
82 |     T& operator [] (int i) { return data_[i]; }
83 | 
84 |     T* release() {
85 |         T* ptr = data_;
86 |         data_ = nullptr;
87 |         size_ = 0;
88 |         dev_ = 0;
89 |         return ptr;
90 |     }
91 | 
92 | protected:
93 |     void allocate(int64_t size) {
94 |         size_ = size;
95 |         data_ = (T*)anydsl_alloc(dev_, sizeof(T) * size);
96 |     }
97 | 
98 |     void deallocate() {
99 |         if (data_) anydsl_release(dev_, (void*)data_);
100 |     }
101 | 
102 |     T* data_;
103 |     int64_t size_;
104 |     int32_t dev_;
105 | };
106 | 
107 | template <typename T>
108 | void copy(const Array<T>& a, Array<T>& b) {
109 |     anydsl_copy(a.device(), (const void*)a.data(), 0,
110 |                 b.device(), (void*)b.data(), 0,
111 |                 a.size() * sizeof(T));
112 | }
113 | 
114 | template <typename T>
115 | void copy(const Array<T>& a, Array<T>& b, int64_t size) {
116 |     anydsl_copy(a.device(), (const void*)a.data(), 0,
117 |                 b.device(), (void*)b.data(), 0,
118 |                 size * sizeof(T));
119 | }
120 | 
121 | template <typename T>
122 | void copy(const Array<T>& a, int64_t offset_a, Array<T>& b, int64_t offset_b, int64_t size) {
123 |     anydsl_copy(a.device(), (const void*)a.data(), offset_a * sizeof(T),
124 |                 b.device(), (void*)b.data(), offset_b * sizeof(T),
125 |                 size * sizeof(T));
126 | }
127 | 
128 | } // namespace anydsl
129 | 
130 | #endif
131 | 
--------------------------------------------------------------------------------
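A brief usage sketch for the Array wrapper and copy helpers above (editor's addition, not a repository file; the CUDA platform and buffer sizes are assumptions for illustration):

    anydsl::Array<float> host(1024);                                         // host allocation
    anydsl::Array<float> gpu(anydsl::Platform::Cuda, anydsl::Device(0), 1024);
    anydsl::copy(host, gpu);            // full host -> device transfer
    anydsl::copy(gpu, 0, host, 0, 512); // copy the first 512 elements back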
"@AnyDSL_runtime_NVCC_INC@" 46 | 47 | // HSA support 48 | 49 | #define AnyDSL_runtime_HSA_BITCODE_PATH "@AnyDSL_runtime_HSA_BITCODE_PATH@/" 50 | #define AnyDSL_runtime_HSA_BITCODE_SUFFIX "@AnyDSL_runtime_HSA_BITCODE_SUFFIX@" 51 | 52 | // PAL support 53 | 54 | #define AnyDSL_runtime_PAL_BITCODE_PATH "@AnyDSL_runtime_PAL_BITCODE_PATH@/" 55 | #define AnyDSL_runtime_PAL_BITCODE_SUFFIX "@AnyDSL_runtime_PAL_BITCODE_SUFFIX@" 56 | 57 | // jit support 58 | 59 | #define AnyDSL_runtime_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@" 60 | 61 | // debug output 62 | 63 | #cmakedefine AnyDSL_runtime_ENABLE_DEBUG_OUTPUT 64 | 65 | 66 | #endif // ANYDSL_RUNTIME_CONFIG_H 67 | -------------------------------------------------------------------------------- /src/cpu_platform.cpp: -------------------------------------------------------------------------------- 1 | #include "cpu_platform.h" 2 | #include "runtime.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #if defined(__APPLE__) 11 | #include 12 | #include 13 | #elif defined(_WIN32) 14 | #define WIN32_LEAN_AND_MEAN 15 | #define NOMINMAX 16 | #include 17 | #endif 18 | 19 | CpuPlatform::CpuPlatform(Runtime* runtime) 20 | : Platform(runtime) 21 | { 22 | #if defined(__APPLE__) 23 | size_t buf_len; 24 | sysctlbyname("machdep.cpu.brand_string", nullptr, &buf_len, nullptr, 0); 25 | device_name_.resize(buf_len, '\0'); 26 | sysctlbyname("machdep.cpu.brand_string", device_name_.data(), &buf_len, nullptr, 0); 27 | #elif defined(_WIN32) 28 | HKEY key; 29 | if (RegOpenKeyExW(HKEY_LOCAL_MACHINE, L"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0U, KEY_QUERY_VALUE, &key) != ERROR_SUCCESS) 30 | error("failed to open processor information registry key"); 31 | 32 | DWORD cpu_name_type, cpu_name_size; 33 | if (RegQueryValueExW(key, L"ProcessorNameString", nullptr, &cpu_name_type, nullptr, &cpu_name_size) != ERROR_SUCCESS) 34 | error("failed to query processor name string length"); 35 | 36 | if (cpu_name_type != REG_SZ) 37 | error("unexpected type for processor name string"); 38 | 39 | int cpu_name_length = cpu_name_size / sizeof(wchar_t); 40 | 41 | std::wstring buffer(cpu_name_length, '\0'); 42 | if (RegQueryValueExW(key, L"ProcessorNameString", nullptr, &cpu_name_type, reinterpret_cast(buffer.data()), &cpu_name_size) != ERROR_SUCCESS) 43 | error("failed to query processor name string"); 44 | 45 | RegCloseKey(key); 46 | 47 | int u8_cpu_name_length = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, buffer.data(), cpu_name_length, nullptr, 0, nullptr, nullptr); 48 | 49 | if (u8_cpu_name_length <= 0) 50 | error("failed to compute converted UTF-8 CPU name string length"); 51 | 52 | device_name_.resize(u8_cpu_name_length, '\0'); 53 | 54 | if (WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, buffer.data(), cpu_name_length, device_name_.data(), u8_cpu_name_length, nullptr, nullptr) <= 0) 55 | error("failed to convert CPU name string to UTF-8"); 56 | #else 57 | std::ifstream cpuinfo("/proc/cpuinfo"); 58 | 59 | if (!cpuinfo) 60 | error("failed to open /proc/cpuinfo"); 61 | 62 | #if defined __arm__ || __aarch64__ 63 | std::string model_string = "CPU part\t: "; 64 | #else // x86, x86_64 65 | std::string model_string = "model name\t: "; 66 | #endif 67 | 68 | std::search(std::istreambuf_iterator(cpuinfo), {}, model_string.begin(), model_string.end()); 69 | std::getline(cpuinfo >> std::ws, device_name_); 70 | #endif 71 | } 72 | -------------------------------------------------------------------------------- /src/cpu_platform.h: 
/src/cpu_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef CPU_PLATFORM_H
2 | #define CPU_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | 
6 | #ifndef PAGE_SIZE
7 | #define PAGE_SIZE 4096
8 | #endif
9 | 
10 | #include <cstring>
11 | 
12 | /// CPU platform. alloc_host and alloc_unified return page-aligned (4096-byte) memory; plain alloc is 32-byte aligned.
13 | class CpuPlatform : public Platform {
14 | public:
15 |     CpuPlatform(Runtime* runtime);
16 | 
17 | protected:
18 |     void* alloc(DeviceId, int64_t size) override {
19 |         return Runtime::aligned_malloc(size, 32);
20 |     }
21 | 
22 |     void* alloc_host(DeviceId, int64_t size) override {
23 |         return Runtime::aligned_malloc(size, PAGE_SIZE);
24 |     }
25 | 
26 |     void* alloc_unified(DeviceId, int64_t size) override {
27 |         return Runtime::aligned_malloc(size, PAGE_SIZE);
28 |     }
29 | 
30 |     void* get_device_ptr(DeviceId, void* ptr) override {
31 |         return ptr;
32 |     }
33 | 
34 |     void release(DeviceId, void* ptr) override {
35 |         Runtime::aligned_free(ptr);
36 |     }
37 | 
38 |     void release_host(DeviceId dev, void* ptr) override {
39 |         release(dev, ptr);
40 |     }
41 | 
42 |     void no_kernel() {
43 |         error("Kernels are not supported on the CPU");
44 |     }
45 | 
46 |     void launch_kernel(DeviceId, const LaunchParams&) override { no_kernel(); }
47 |     void synchronize(DeviceId) override { no_kernel(); }
48 | 
49 |     void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) {
50 |         memcpy((char*)dst + offset_dst, (char*)src + offset_src, size);
51 |     }
52 | 
53 |     void copy(DeviceId, const void* src, int64_t offset_src,
54 |               DeviceId, void* dst, int64_t offset_dst, int64_t size) override {
55 |         copy(src, offset_src, dst, offset_dst, size);
56 |     }
57 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId,
58 |                         void* dst, int64_t offset_dst, int64_t size) override {
59 |         copy(src, offset_src, dst, offset_dst, size);
60 |     }
61 |     void copy_to_host(DeviceId, const void* src, int64_t offset_src,
62 |                       void* dst, int64_t offset_dst, int64_t size) override {
63 |         copy(src, offset_src, dst, offset_dst, size);
64 |     }
65 | 
66 |     std::string device_name_;
67 |     size_t dev_count() const override { return 1; }
68 |     std::string name() const override { return "CPU"; }
69 |     const char* device_name(DeviceId) const override { return device_name_.c_str(); }
70 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
71 | };
72 | 
73 | #endif
74 | 
--------------------------------------------------------------------------------
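Through the C API, the differing alignment guarantees above look like this (editor's sketch; mask 0 addresses the host platform, device 0):

    void* a = anydsl_alloc(0, 1024);      // 32-byte aligned
    void* b = anydsl_alloc_host(0, 1024); // PAGE_SIZE (4096-byte) aligned
    anydsl_release(0, a);
    anydsl_release_host(0, b);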
/src/cuda_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef CUDA_PLATFORM_H
2 | #define CUDA_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | #include "runtime.h"
6 | 
7 | #include <atomic>
8 | #include <forward_list>
9 | #include <mutex>
10 | #include <string>
11 | #include <unordered_map>
12 | #include <vector>
13 | 
14 | #define CUDA_API_PER_THREAD_DEFAULT_STREAM
15 | #include <cuda.h>
16 | #include <cudaProfiler.h>
17 | #include <nvvm.h>
18 | 
19 | #if CUDA_VERSION < 10000
20 | #error "CUDA 10.0 or higher required!"
21 | #endif
22 | 
23 | /// CUDA platform. Has the same number of devices as that of the CUDA implementation.
24 | class CudaPlatform : public Platform {
25 | public:
26 |     CudaPlatform(Runtime* runtime);
27 |     ~CudaPlatform();
28 | 
29 | protected:
30 |     void* alloc(DeviceId dev, int64_t size) override;
31 |     void* alloc_host(DeviceId dev, int64_t size) override;
32 |     void* alloc_unified(DeviceId dev, int64_t size) override;
33 |     void* get_device_ptr(DeviceId, void* ptr) override;
34 |     void release(DeviceId dev, void* ptr) override;
35 |     void release_host(DeviceId dev, void* ptr) override;
36 | 
37 |     void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
38 |     void synchronize(DeviceId dev) override;
39 | 
40 |     void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
41 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
42 |     void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
43 | 
44 |     size_t dev_count() const override { return devices_.size(); }
45 |     std::string name() const override { return "CUDA"; }
46 |     const char* device_name(DeviceId dev) const override;
47 |     bool device_check_feature_support(DeviceId dev, const char* feature) const override;
48 | 
49 |     typedef std::unordered_map<std::string, CUfunction> FunctionMap;
50 | 
51 |     struct DeviceData {
52 |         CUdevice dev;
53 |         CUcontext ctx;
54 |         CUjit_target compute_capability;
55 |         std::atomic_flag locked = ATOMIC_FLAG_INIT;
56 |         std::unordered_map<std::string, CUmodule> modules;
57 |         std::unordered_map<CUmodule, FunctionMap> functions;
58 |         std::string name;
59 | 
60 |         DeviceData() {}
61 |         DeviceData(const DeviceData&) = delete;
62 |         DeviceData(DeviceData&& data)
63 |             : dev(data.dev)
64 |             , ctx(data.ctx)
65 |             , compute_capability(data.compute_capability)
66 |             , modules(std::move(data.modules))
67 |             , functions(std::move(data.functions))
68 |             , name(std::move(data.name))
69 |         {}
70 | 
71 |         void lock() {
72 |             while (locked.test_and_set(std::memory_order_acquire)) ;
73 |         }
74 | 
75 |         void unlock() {
76 |             locked.clear(std::memory_order_release);
77 |         }
78 |     };
79 | 
80 |     std::vector<DeviceData> devices_;
81 | 
82 |     bool dump_binaries = false;
83 | 
84 |     struct ProfileData {
85 |         CudaPlatform* platform;
86 |         CUcontext ctx;
87 |         CUevent start;
88 |         CUevent end;
89 |     };
90 | 
91 |     std::mutex profile_lock_;
92 |     std::forward_list<ProfileData*> profiles_;
93 |     void erase_profiles(bool);
94 | 
95 |     CUfunction load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
96 | 
97 |     std::string compile_nvptx(DeviceId dev, const std::string& filename, const std::string& program_string) const;
98 |     std::string compile_nvvm(DeviceId dev, const std::string& filename, const std::string& program_string) const;
99 |     std::string compile_cuda(DeviceId dev, const std::string& filename, const std::string& program_string) const;
100 |     CUmodule create_module(DeviceId dev, const std::string& filename, const std::string& ptx_string) const;
101 | };
102 | 
103 | #endif
104 | 
--------------------------------------------------------------------------------
/src/dummy_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef DUMMY_PLATFORM_H
2 | #define DUMMY_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | #include "runtime.h"
6 | 
7 | #include <string>
8 | 
9 | /// Dummy platform, implemented to report an error for any attempted device operation.
10 | class DummyPlatform : public Platform {
11 | public:
12 |     DummyPlatform(Runtime* runtime, const std::string& name)
13 |         : Platform(runtime), name_(name)
14 |     {}
15 | 
16 | protected:
17 |     void* alloc(DeviceId, int64_t) override { platform_error(); }
18 |     void* alloc_host(DeviceId, int64_t) override { platform_error(); }
19 |     void* alloc_unified(DeviceId, int64_t) override { platform_error(); }
20 |     void* get_device_ptr(DeviceId, void*) override { platform_error(); }
21 |     void release(DeviceId, void*) override { platform_error(); }
22 |     void release_host(DeviceId, void*) override { platform_error(); }
23 | 
24 |     void launch_kernel(DeviceId, const LaunchParams&) override { platform_error(); }
25 |     void synchronize(DeviceId) override { platform_error(); }
26 | 
27 |     void copy(DeviceId, const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); }
28 |     void copy_from_host(const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); }
29 |     void copy_to_host(DeviceId, const void*, int64_t, void*, int64_t, int64_t) override { platform_error(); }
30 | 
31 |     size_t dev_count() const override { return 0; }
32 |     std::string name() const override { return name_; }
33 |     const char* device_name(DeviceId) const override { return "Dummy"; }
34 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
35 | 
36 |     std::string name_;
37 | };
38 | 
39 | #endif
40 | 
--------------------------------------------------------------------------------
/src/extract_runtime_srcs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | def main():
5 |     col, maxcols = 0, 10
6 |     for f in sys.argv[1:]:
7 |         with open(f, "r") as fd:
8 |             for b in fd.read():
9 |                 sys.stdout.write("{:3}, ".format(ord(b)))
10 |                 col += 1
11 |                 if col == maxcols:
12 |                     sys.stdout.write("\n")
13 |                     col = 0
14 | 
15 | if __name__ == "__main__":
16 |     main()
17 | 
--------------------------------------------------------------------------------
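The script above flattens source files into a comma-separated list of byte values; jit.cpp includes that output to embed the Impala runtime sources into the library. A usage sketch (editor's addition; the exact file list comes from the build setup, so the arguments here are illustrative):

    python3 extract_runtime_srcs.py platforms/artic/runtime.impala > runtime_srcs.inc

and on the consuming side, exactly as jit.cpp does below:

    static const char runtime_srcs[] = {
    #include "runtime_srcs.inc"
    0
    };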
/src/hsa_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef HSA_PLATFORM_H
2 | #define HSA_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | #include "runtime.h"
6 | 
7 | #include <atomic>
8 | #include <string>
9 | #include <unordered_map>
10 | #include <vector>
11 | 
12 | #include <hsa/hsa.h>
13 | #include <hsa/hsa_ext_amd.h>
14 | 
15 | namespace llvm {
16 | class OptimizationLevel;
17 | }
18 | 
19 | /// HSA platform. Has the same number of devices as that of the HSA implementation.
20 | class HSAPlatform : public Platform {
21 | public:
22 |     HSAPlatform(Runtime* runtime);
23 |     ~HSAPlatform();
24 | 
25 | protected:
26 |     void* alloc(DeviceId dev, int64_t size) override { return alloc_hsa(size, devices_[dev].amd_coarsegrained_pool); }
27 |     void* alloc_host(DeviceId dev, int64_t size) override { return alloc_hsa(size, devices_[dev].amd_coarsegrained_pool); }
28 |     void* alloc_unified(DeviceId dev, int64_t size) override { return alloc_hsa(size, devices_[dev].finegrained_region); }
29 |     void* get_device_ptr(DeviceId, void* ptr) override { return ptr; }
30 |     void release(DeviceId dev, void* ptr) override;
31 |     void release_host(DeviceId dev, void* ptr) override { release(dev, ptr); }
32 | 
33 |     void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
34 |     void synchronize(DeviceId dev) override;
35 | 
36 |     void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size);
37 |     void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
38 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
39 |     void copy_to_host(DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
40 | 
41 |     size_t dev_count() const override { return devices_.size(); }
42 |     std::string name() const override { return "HSA"; }
43 |     const char* device_name(DeviceId dev) const override;
44 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
45 | 
46 |     struct KernelInfo {
47 |         uint64_t kernel;
48 |         uint32_t kernarg_segment_size;
49 |         uint32_t group_segment_size;
50 |         uint32_t private_segment_size;
51 |         void* kernarg_segment;
52 |     };
53 | 
54 |     typedef std::unordered_map<std::string, KernelInfo> KernelMap;
55 | 
56 |     struct DeviceData {
57 |         hsa_agent_t agent;
58 |         hsa_profile_t profile;
59 |         hsa_default_float_rounding_mode_t float_mode;
60 |         std::string isa;
61 |         hsa_queue_t* queue;
62 |         hsa_signal_t signal;
63 |         hsa_region_t kernarg_region, finegrained_region, coarsegrained_region;
64 |         hsa_amd_memory_pool_t amd_kernarg_pool, amd_finegrained_pool, amd_coarsegrained_pool;
65 |         std::atomic_flag locked = ATOMIC_FLAG_INIT;
66 |         std::unordered_map<std::string, hsa_executable_t> programs;
67 |         std::unordered_map<uint64_t, KernelMap> kernels;
68 |         std::string name;
69 | 
70 |         DeviceData() {}
71 |         DeviceData(const DeviceData&) = delete;
72 |         DeviceData(DeviceData&& data)
73 |             : agent(data.agent)
74 |             , profile(data.profile)
75 |             , float_mode(data.float_mode)
76 |             , isa(data.isa)
77 |             , queue(data.queue)
78 |             , signal(data.signal)
79 |             , kernarg_region(data.kernarg_region)
80 |             , finegrained_region(data.finegrained_region)
81 |             , coarsegrained_region(data.coarsegrained_region)
82 |             , amd_kernarg_pool(data.amd_kernarg_pool)
83 |             , amd_finegrained_pool(data.amd_finegrained_pool)
84 |             , amd_coarsegrained_pool(data.amd_coarsegrained_pool)
85 |             , programs(std::move(data.programs))
86 |             , kernels(std::move(data.kernels))
87 |             , name(data.name)
88 |         {}
89 | 
90 |         void lock() {
91 |             while (locked.test_and_set(std::memory_order_acquire)) ;
92 |         }
93 | 
94 |         void unlock() {
95 |             locked.clear(std::memory_order_release);
96 |         }
97 |     };
98 | 
99 |     uint64_t frequency_;
100 |     std::vector<DeviceData> devices_;
101 | 
102 |     void* alloc_hsa(int64_t, hsa_region_t);
103 |     void* alloc_hsa(int64_t, hsa_amd_memory_pool_t);
104 |     static hsa_status_t iterate_agents_callback(hsa_agent_t, void*);
105 |     static hsa_status_t iterate_regions_callback(hsa_region_t, void*);
106 |     static hsa_status_t iterate_memory_pools_callback(hsa_amd_memory_pool_t, void*);
107 |     KernelInfo& load_kernel(DeviceId, const std::string&, const std::string&);
108 |     std::string compile_gcn(DeviceId, const std::string&, const std::string&) const;
109 |     std::string emit_gcn(const std::string&, const std::string&, const std::string&, llvm::OptimizationLevel) const;
110 | };
111 | 
112 | #endif
113 | 
--------------------------------------------------------------------------------
/src/jit.cpp:
--------------------------------------------------------------------------------
1 | #include <memory>
2 | #include <sstream>
3 | #include <vector>
4 | 
5 | #include <llvm/ExecutionEngine/ExecutionEngine.h>
6 | #include <llvm/ExecutionEngine/MCJIT.h>
7 | #include <llvm/IR/LLVMContext.h>
8 | #include <llvm/IR/Module.h>
9 | #include <llvm/IRReader/IRReader.h>
10 | #include <llvm/Support/DynamicLibrary.h>
11 | #include <llvm/Support/Host.h>
12 | #include <llvm/Support/SourceMgr.h>
13 | #include <llvm/Support/TargetSelect.h>
14 | #include <llvm/Support/raw_os_ostream.h>
15 | 
16 | #include <thorin/world.h>
17 | #include <thorin/be/llvm/cpu.h>
18 | #include <thorin/be/llvm/llvm.h>
19 | 
20 | #include "anydsl_jit.h"
21 | #include "log.h"
22 | #include "runtime.h"
23 | 
24 | bool compile(
25 |     const std::vector<std::string>& file_names,
26 |     const std::vector<std::string>& file_data,
27 |     thorin::World& world,
28 |     std::ostream& error_stream);
29 | 
30 | static const char runtime_srcs[] = {
31 | #include "runtime_srcs.inc"
32 | 0
33 | };
34 | 
35 | struct JIT {
36 |     struct Program {
37 |         Program(llvm::ExecutionEngine* engine) : engine(engine) {}
38 |         llvm::ExecutionEngine* engine;
39 |     };
40 | 
41 |     std::vector<Program> programs;
42 |     Runtime* runtime;
43 |     thorin::LogLevel log_level;
44 | 
45 |     JIT(Runtime* runtime) : runtime(runtime), log_level(thorin::LogLevel::Warn) {
46 |         llvm::InitializeNativeTarget();
47 |         llvm::InitializeNativeTargetAsmPrinter();
48 |     }
49 | 
50 |     int32_t compile(const char* program_src, uint32_t size, uint32_t opt) {
51 |         // The LLVM context and module have to be alive for the duration of this function
52 |         std::unique_ptr<llvm::LLVMContext> llvm_context;
53 |         std::unique_ptr<llvm::Module> llvm_module;
54 | 
55 |         std::string program_str = std::string(program_src, size);
56 |         size_t prog_key = std::hash<std::string>{}(program_str);
57 |         std::stringstream hex_stream;
58 |         hex_stream << std::hex << prog_key;
59 |         std::string cached_llvm = runtime->load_from_cache(program_str, ".llvm");
60 |         std::string module_name = "jit_" + hex_stream.str();
61 |         if (cached_llvm.empty()) {
62 |             bool debug = false;
63 |             assert(opt <= 3);
64 | 
65 |             thorin::Thorin thorin(module_name);
66 |             thorin.world().set(log_level);
67 |             thorin.world().set(std::make_shared<thorin::Stream>(std::cerr));
68 |             if (!::compile(
69 |                 { "runtime", module_name },
70 |                 { std::string(runtime_srcs), program_str },
71 |                 thorin.world(), std::cerr))
72 |                 error("JIT: error while compiling sources");
73 | 
74 |             thorin.opt();
75 | 
76 |             std::string host_triple, host_cpu, host_attr, hls_flags;
77 |             thorin::DeviceBackends backends(thorin.world(), opt, debug, hls_flags);
78 | 
79 |             thorin::llvm::CPUCodeGen cg(thorin, opt, debug, host_triple, host_cpu, host_attr);
80 |             std::tie(llvm_context, llvm_module) = cg.emit_module();
81 |             std::stringstream stream;
82 |             llvm::raw_os_ostream llvm_stream(stream);
83 |             llvm_module->print(llvm_stream, nullptr);
84 |             runtime->store_to_cache(program_str, stream.str(), ".llvm");
85 | 
86 |             for (auto& cg : backends.cgs) {
87 |                 if (cg) {
88 |                     if (std::string(cg->file_ext()) == ".hls")
89 |                         error("JIT compilation of hls not supported!");
90 |                     std::ostringstream stream;
91 |                     cg->emit_stream(stream);
92 |                     runtime->store_to_cache(cg->file_ext() + program_str, stream.str(), cg->file_ext());
93 |                     runtime->register_file(module_name + cg->file_ext(), stream.str());
94 |                 }
95 |             }
96 |         } else {
97 |             llvm::SMDiagnostic diagnostic_err;
98 |             llvm_context = std::make_unique<llvm::LLVMContext>();
99 |             llvm_module = llvm::parseIR(llvm::MemoryBuffer::getMemBuffer(cached_llvm)->getMemBufferRef(), diagnostic_err, *llvm_context);
100 | 
101 |             auto load_backend_src = [&](std::string ext) {
102 |                 std::string cached_src = runtime->load_from_cache(ext + program_str, ext);
103 |                 if (!cached_src.empty())
104 |                     runtime->register_file(module_name + ext, cached_src);
105 |             };
106 |             load_backend_src(".cl");
107 |             load_backend_src(".cu");
108 |             load_backend_src(".nvvm");
109 |             load_backend_src(".amdgpu");
110 |         }
111 | 
112 |         llvm::TargetOptions options;
113 |         options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
114 | 
115 |         auto engine = llvm::EngineBuilder(std::move(llvm_module))
116 |             .setEngineKind(llvm::EngineKind::JIT)
117 |             .setMCPU(llvm::sys::getHostCPUName())
118 |             .setTargetOptions(options)
119 |             .setOptLevel( opt == 0  ? llvm::CodeGenOptLevel::None    :
120 |                           opt == 1  ? llvm::CodeGenOptLevel::Less    :
121 |                           opt == 2  ? llvm::CodeGenOptLevel::Default :
122 |                        /* opt == 3 */ llvm::CodeGenOptLevel::Aggressive)
123 |             .create();
124 |         if (!engine)
125 |             return -1;
126 | 
127 |         engine->finalizeObject();
128 |         programs.push_back(Program(engine));
129 | 
130 |         return (int32_t)programs.size() - 1;
131 |     }
132 | 
133 |     void* lookup_function(int32_t key, const char* fn_name) {
134 |         if (key == -1)
135 |             return nullptr;
136 | 
137 |         return (void *)programs[key].engine->getFunctionAddress(fn_name);
138 |     }
139 | 
140 |     void link(const char* lib) {
141 |         llvm::sys::DynamicLibrary::LoadLibraryPermanently(lib);
142 |     }
143 | };
144 | 
145 | JIT& jit() {
146 |     static std::unique_ptr<JIT> jit(new JIT(&runtime()));
147 |     return *jit;
148 | }
149 | 
150 | void anydsl_set_cache_directory(const char* dir) {
151 |     jit().runtime->set_cache_directory(dir == nullptr ? std::string() : dir);
152 | }
153 | 
154 | const char* anydsl_get_cache_directory() {
155 |     static std::string dir;
156 |     dir = jit().runtime->get_cache_directory();
157 |     return dir.c_str();
158 | }
159 | 
160 | void anydsl_link(const char* lib) {
161 |     jit().link(lib);
162 | }
163 | 
164 | int32_t anydsl_compile(const char* program, uint32_t size, uint32_t opt) {
165 |     return jit().compile(program, size, opt);
166 | }
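// [Editor's sketch, not part of the original file] A typical round-trip
// through the JIT C API defined here; the exported symbol name "main" is an
// assumption for illustration.
static void example_jit_roundtrip(const char* src, uint32_t size) {
    int32_t key = anydsl_compile(src, size, /*opt=*/2);
    if (auto entry = reinterpret_cast<void (*)()>(anydsl_lookup_function(key, "main")))
        entry();
}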
168 | void anydsl_set_log_level(uint32_t log_level) {
169 |     jit().log_level = log_level <= 4 ? static_cast<thorin::LogLevel>(log_level) : thorin::LogLevel::Warn;
170 | }
171 | 
172 | void* anydsl_lookup_function(int32_t key, const char* fn_name) {
173 |     return jit().lookup_function(key, fn_name);
174 | }
175 | 
--------------------------------------------------------------------------------
/src/levelzero_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef LEVEL_ZERO_PLATFORM_H
2 | #define LEVEL_ZERO_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | 
6 | #include <cstdint>
7 | #include <string>
8 | #include <unordered_map>
9 | #include <vector>
10 | 
11 | #include <level_zero/ze_api.h>
12 | 
13 | 
14 | /// oneAPI Level Zero platform
15 | class LevelZeroPlatform : public Platform {
16 | public:
17 |     LevelZeroPlatform(Runtime* runtime);
18 |     ~LevelZeroPlatform();
19 | 
20 | protected:
21 |     void* alloc(DeviceId dev, int64_t size) override;
22 |     void* alloc_host(DeviceId, int64_t) override;
23 |     void* alloc_unified(DeviceId, int64_t) override;
24 |     void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); }
25 |     void release(DeviceId dev, void* ptr) override;
26 |     void release_host(DeviceId, void*) override;
27 | 
28 |     void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
29 |     void synchronize(DeviceId dev) override;
30 | 
31 |     void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
32 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
33 |     void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
34 | 
35 |     size_t dev_count() const override { return devices_.size(); }
36 |     std::string name() const override { return "oneAPI Level Zero"; }
37 |     const char* device_name(DeviceId dev) const override;
38 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
39 | 
40 |     typedef std::unordered_map<std::string, ze_kernel_handle_t> KernelMap;
41 | 
42 |     struct DeviceData {
43 |         LevelZeroPlatform* parent;
44 |         ze_driver_handle_t driver;
45 |         ze_device_handle_t device;
46 |         std::string device_name;
47 |         ze_command_list_handle_t queue = nullptr;
48 |         ze_context_handle_t ctx = nullptr;
49 |         std::unordered_map<std::string, ze_module_handle_t> modules;
50 |         std::unordered_map<ze_module_handle_t, KernelMap> kernels;
51 |         double timerResolution;
52 | 
53 |         DeviceData(
54 |             LevelZeroPlatform* parent,
55 |             ze_driver_handle_t driver,
56 |             ze_device_handle_t device,
57 |             const std::string& device_name)
58 |             : parent(parent)
59 |             , driver(driver)
60 |             , device(device)
61 |             , device_name(device_name)
62 |         {}
63 |         DeviceData(DeviceData&&) = default;
64 |         DeviceData(const DeviceData&) = delete;
65 |     };
66 | 
67 |     std::vector<DeviceData> devices_;
68 |     std::vector<ze_context_handle_t> contexts_;
69 | 
70 |     ze_kernel_handle_t load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
71 |     friend void determineDeviceCapabilities(ze_device_handle_t hDevice, LevelZeroPlatform::DeviceData& device);
72 | };
73 | 
74 | #endif
75 | 
--------------------------------------------------------------------------------
/src/log.h:
--------------------------------------------------------------------------------
1 | #ifndef LOG_H
2 | #define LOG_H
3 | 
4 | #include <cassert>
5 | #include <cstdlib>
6 | #include <cstring>
7 | #include <iostream>
8 | 
9 | inline void unused() {}
10 | template <typename T, typename... Args>
11 | inline void unused(const T& t, Args... args) { (void)t; unused(args...); }
12 | 
13 | inline void print(std::ostream& os, const char* fmt) {
14 |     assert(!strchr(fmt, '%') && "Not enough arguments to print");
15 |     os << fmt << std::endl;
16 | }
17 | 
18 | template <typename T, typename... Args>
19 | void print(std::ostream& os, const char* fmt, const T& t, Args... args) {
20 |     auto ptr = strchr(fmt, '%');
21 |     while (ptr && ptr[1] == '%') ptr = strchr(ptr + 2, '%');
22 |     assert(ptr && "Too many arguments to print");
23 |     os.write(fmt, ptr - fmt);
24 |     os << t;
25 |     print(os, ptr + 1, args...);
26 | }
27 | 
28 | template <typename... Args>
29 | [[noreturn]] void error(Args... args) {
30 |     print(std::cerr, args...);
31 |     std::abort();
32 | }
33 | 
34 | template <typename... Args>
35 | void info(Args... args) {
36 |     print(std::cout, args...);
37 | }
38 | 
39 | template <typename... Args>
40 | void debug(Args... args) {
41 | #ifdef AnyDSL_runtime_ENABLE_DEBUG_OUTPUT
42 |     print(std::cout, args...);
43 | #else
44 |     unused(args...);
45 | #endif
46 | }
47 | 
48 | #endif
49 | 
--------------------------------------------------------------------------------
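A usage sketch for the minimal '%'-placeholder formatter above (editor's addition; the message text is illustrative):

    debug("loading kernel % from %", kernel_name, file_name); // each '%' consumes one argument
    error("unsupported feature: %", feature);                 // formats to stderr, then aborts

Doubled percent signs are skipped while scanning for the next placeholder, so they are emitted verbatim rather than consuming an argument.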
/src/opencl_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef OPENCL_PLATFORM_H
2 | #define OPENCL_PLATFORM_H
3 | 
4 | #include "platform.h"
5 | 
6 | #include <atomic>
7 | #include <string>
8 | #include <unordered_map>
9 | #include <vector>
10 | 
11 | #ifdef __APPLE__
12 | #include <OpenCL/cl.h>
13 | #include <OpenCL/cl_ext.h>
14 | #else
15 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
16 | #include <CL/cl.h>
17 | #include <CL/cl_ext.h>
18 | #endif
19 | 
20 | /// OpenCL platform. Has the same number of devices as that of the OpenCL implementation.
21 | class OpenCLPlatform : public Platform {
22 | public:
23 |     OpenCLPlatform(Runtime* runtime);
24 |     ~OpenCLPlatform();
25 | 
26 | protected:
27 |     void* alloc(DeviceId dev, int64_t size) override;
28 |     void* alloc_host(DeviceId, int64_t) override { command_unavailable("alloc_host"); }
29 |     void* alloc_unified(DeviceId, int64_t) override;
30 |     void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); }
31 |     void release(DeviceId dev, void* ptr) override;
32 |     void release_host(DeviceId, void*) override { command_unavailable("release_host"); }
33 | 
34 |     void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
35 |     void synchronize(DeviceId dev) override;
36 | 
37 |     void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
38 |     void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
39 |     void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
40 |     void copy_svm(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size);
41 |     void dynamic_profile(DeviceId dev, const std::string& filename);
42 | 
43 |     size_t dev_count() const override { return devices_.size(); }
44 |     std::string name() const override { return "OpenCL"; }
45 |     const char* device_name(DeviceId dev) const override;
46 |     bool device_check_feature_support(DeviceId, const char*) const override { return false; }
47 | 
48 |     typedef std::unordered_map<std::string, cl_kernel> KernelMap;
49 | 
50 |     struct DeviceData {
51 |         OpenCLPlatform* parent;
52 |         cl_platform_id platform;
53 |         cl_device_id dev;
54 |         cl_uint version_major;
55 |         cl_uint version_minor;
56 |         std::string platform_name;
57 |         std::string device_name;
58 |         cl_command_queue queue = nullptr;
59 |         cl_context ctx = nullptr;
60 | #ifdef CL_VERSION_2_0
61 |         cl_device_svm_capabilities svm_caps;
62 | #endif
63 |         bool is_intel_fpga = false;
64 |         bool is_xilinx_fpga = false;
65 | 
66 |         std::unordered_map<std::string, cl_program> programs;
67 |         std::unordered_map<cl_program, KernelMap> kernels;
68 |         std::unordered_map<cl_kernel, cl_command_queue> kernels_queue;
69 | 
70 |         // Atomics do not have a move constructor. This structure introduces one.
71 |         struct AtomicData {
72 |             std::atomic_int timings_counter {};
73 |             std::atomic_flag lock = ATOMIC_FLAG_INIT;
74 |             AtomicData() = default;
75 |             AtomicData(AtomicData&&) {}
76 |         } atomic_data;
77 | 
78 |         DeviceData(
79 |             OpenCLPlatform* parent,
80 |             cl_platform_id platform,
81 |             cl_device_id dev,
82 |             cl_uint version_major,
83 |             cl_uint version_minor,
84 |             const std::string& platform_name,
85 |             const std::string& device_name)
86 |             : parent(parent)
87 |             , platform(platform)
88 |             , dev(dev)
89 |             , version_major(version_major)
90 |             , version_minor(version_minor)
91 |             , platform_name(platform_name)
92 |             , device_name(device_name)
93 |         {}
94 |         DeviceData(DeviceData&&) = default;
95 |         DeviceData(const DeviceData&) = delete;
96 | 
97 |         void lock() {
98 |             while (atomic_data.lock.test_and_set(std::memory_order_acquire)) ;
99 |         }
100 | 
101 |         void unlock() {
102 |             atomic_data.lock.clear(std::memory_order_release);
103 |         }
104 |     };
105 | 
106 |     std::vector<DeviceData> devices_;
107 | 
108 |     cl_kernel load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
109 |     cl_program load_program_binary(DeviceId dev, const std::string& filename, const std::string& program_string) const;
110 |     cl_program load_program_il(DeviceId dev, const std::string& filename, const std::string& program_string) const;
111 |     cl_program load_program_source(DeviceId dev, const std::string& filename, const std::string& program_string) const;
112 |     cl_program compile_program(DeviceId dev, cl_program program, const std::string& filename) const;
113 | 
114 |     friend void time_kernel_callback(cl_event, cl_int, void*);
115 | };
116 | 
117 | #endif
118 | 
--------------------------------------------------------------------------------
/src/pal/pal_device.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_DEVICE_DATA_H
2 | #define PAL_DEVICE_DATA_H
3 | 
4 | #include "../runtime.h"
5 | #include "pal_utils.h"
6 | 
7 | #include <atomic>
8 | #include <functional>
9 | #include <string>
10 | #include <unordered_map>
11 | 
12 | #include <palCmdBuffer.h>
13 | #include <palDevice.h>
14 | #include <palGpuMemory.h>
15 | #include <palPipeline.h>
16 | #include <palQueue.h>
17 | 
18 | class PALPlatform;
19 | 
20 | class PalDevice {
21 | public:
22 |     typedef Pal::gpusize GpuVirtAddr_t;
23 |     typedef std::unordered_map<std::string, Pal::IPipeline*> KernelMap;
24 | 
25 |     enum class queue_and_cmd_buffer_type { Compute, Universal, Dma };
26 | 
27 |     PalDevice() {}
28 |     PalDevice(Pal::IDevice* base_device, Runtime* runtime);
29 |     PalDevice(const PalDevice&) = delete;
30 |     PalDevice(PalDevice&& other)
31 |         : runtime_(other.runtime_)
32 |         , device_(other.device_)
33 |         , cmd_allocator_(other.cmd_allocator_)
34 |         , queue_(other.queue_)
35 |         , cmd_buffer_(other.cmd_buffer_)
36 |         , profiling_timestamps_(other.profiling_timestamps_)
37 |         , timestamps_frequency_(other.timestamps_frequency_)
38 |         , programs_(std::move(other.programs_))
39 |         , kernels_(std::move(other.kernels_))
40 |         , memory_objects_(std::move(other.memory_objects_))
41 |         , gfx_level(other.gfx_level)
42 |         , isa(std::move(other.isa))
43 |         , name(std::move(other.name)) {}
44 | 
45 |     ~PalDevice();
46 | 
47 |     void lock() {
48 |         while (locked_.test_and_set(std::memory_order_acquire))
49 |             ;
50 |     }
51 | 
52 |     void unlock() { locked_.clear(std::memory_order_release); }
53 | 
54 |     Pal::IPipeline* create_pipeline(const void* elf_data, size_t elf_data_size);
55 | 
56 |     // Allocates memory of the requested size on the requested gpu heap (controls visibility).
57 |     // Returns the virtual gpu address of the allocated memory.
58 |     GpuVirtAddr_t allocate_gpu_memory(Pal::gpusize size_in_bytes, Pal::GpuHeap heap);
59 | 
60 |     GpuVirtAddr_t allocate_shared_virtual_memory(Pal::gpusize sizeInBytes);
61 | 
62 |     void release_gpu_memory(GpuVirtAddr_t virtual_address);
63 |     void release_gpu_memory(void* virtual_address) {
64 |         release_gpu_memory(reinterpret_cast<GpuVirtAddr_t>(virtual_address));
65 |     }
66 | 
67 |     void copy_gpu_data(
68 |         const GpuVirtAddr_t source, GpuVirtAddr_t destination, const Pal::MemoryCopyRegion& copy_region);
69 |     void copy_gpu_data(const void* source, void* destination, const Pal::MemoryCopyRegion& copy_region) {
70 |         copy_gpu_data(reinterpret_cast<GpuVirtAddr_t>(source),
71 |             reinterpret_cast<GpuVirtAddr_t>(destination), copy_region);
72 |     }
73 | 
74 |     void dispatch(const Pal::CmdBufferBuildInfo& cmd_buffer_build_info,
75 |         const Pal::PipelineBindParams& pipeline_bind_params, const Pal::BarrierInfo& barrier_info,
76 |         const LaunchParams& launch_params);
77 | 
78 |     void WaitIdle();
79 | 
80 | private:
81 |     friend PALPlatform;
82 | 
83 |     Pal::Result init();
84 | 
85 |     // Creates a PAL queue object and corresponding command buffer object into the given pointers.
86 |     Pal::Result init_queue_and_cmd_buffer(queue_and_cmd_buffer_type type, Pal::IQueue*& queue, Pal::ICmdBuffer*& cmd_buffer);
87 | 
88 |     // Creates a PAL command allocator which is needed to allocate memory for all
89 |     // command buffer objects.
90 |     Pal::Result init_cmd_allocator();
91 | 
92 |     Pal::Result allocate_memory(Pal::gpusize size_in_bytes, Pal::GpuHeap heap,
93 |         Pal::IGpuMemory** gpu_memory_pp, Pal::gpusize alignment = 256 * 1024);
94 | 
95 |     // Returns the key to the new map entry. This key is the virtual gpu memory address.
96 |     GpuVirtAddr_t track_memory(Pal::IGpuMemory* memory);
97 |     void forget_memory(GpuVirtAddr_t gpu_address);
98 |     // Returns nullptr if key is not present in memory_objects map.
99 |     Pal::IGpuMemory* get_memory_object(const GpuVirtAddr_t gpu_address) const;
100 |     Pal::IGpuMemory* get_memory_object(const void* gpu_address) const {
101 |         return get_memory_object(reinterpret_cast<GpuVirtAddr_t>(gpu_address));
102 |     }
103 | 
104 |     // Build a buffer holding the kernel arguments and upload to the GPU.
105 |     // Returns the address of the buffer on the gpu.
106 |     GpuVirtAddr_t build_kernargs_buffer(const ParamsArgs& params_args, int num_args, const char* kernel_name);
107 | 
108 |     // Helper function that allocates a gpu-only buffer of the given size and uploads the data written by the
109 |     // write_callback
110 |     PalDevice::GpuVirtAddr_t write_data_to_gpu(
111 |         Pal::gpusize byte_size, std::function<void(void*)> write_callback);
112 | 
113 |     uint32_t calculate_launch_params_size(const ParamsArgs& params_args, uint32_t num_args);
114 |     // Write kernel arguments to memory. Returns the number of bytes occupied by the passed in kernel arguments.
115 |     size_t write_launch_params(const ParamsArgs& params_args, uint32_t num_args, void* memory, size_t memory_size);
116 | 
117 | private:
118 |     Runtime* runtime_ = nullptr;
119 | 
120 |     Pal::IDevice* device_ = nullptr;
121 |     Pal::ICmdAllocator* cmd_allocator_ = nullptr;
122 |     Pal::IQueue* queue_ = nullptr;
123 |     Pal::ICmdBuffer* cmd_buffer_ = nullptr;
124 | 
125 |     Pal::IQueue* dma_queue_ = nullptr;
126 |     Pal::ICmdBuffer* dma_cmd_buffer_ = nullptr;
127 | 
128 |     struct ProfilingTimestamps {
129 |         uint64_t start;
130 |         uint64_t end;
131 |     };
132 |     Pal::IGpuMemory* profiling_timestamps_ = nullptr;
133 |     uint64_t timestamps_frequency_ = 0;
134 | 
135 |     std::atomic_flag locked_ = ATOMIC_FLAG_INIT;
136 | 
137 |     std::unordered_map<std::string, std::string> programs_;
138 |     std::unordered_map<std::string, KernelMap> kernels_;
139 | 
140 |     // Map virtual addresses on the GPU to the PAL objects representing the memory.
141 |     // This is needed because AnyDSL assumes it deals with gpu-legal addresses in its API.
142 |     // However, to interact with PAL we need to have the wrapper objects at hand.
143 |     // The IGpuMemory objects should not be used outside of this class.
144 |     std::unordered_map<GpuVirtAddr_t, Pal::IGpuMemory*> memory_objects_;
145 | 
146 | public:
147 |     Pal::GfxIpLevel gfx_level;
148 |     std::string isa;
149 |     std::string name;
150 | };
151 | 
152 | #endif
--------------------------------------------------------------------------------
/src/pal/pal_fix_calling_convention_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_fix_calling_convention_pass.h"
2 | #include "pal_utils.h"
3 | 
4 | #include <unordered_set>
5 | 
6 | #include <llvm/IR/Function.h>
7 | #include <llvm/IR/Instructions.h>
8 | #include <llvm/IR/Module.h>
9 | 
10 | using namespace llvm;
11 | 
12 | void fix_calling_conv(Module* m, Function* f, std::unordered_set<Function*>& traversed_functions) {
13 |     if (traversed_functions.find(f) != traversed_functions.end()) {
14 |         // already visited this function -> prevent recursive loop
15 |         return;
16 |     }
17 | 
18 |     traversed_functions.insert(f);
19 |     f->addFnAttr(llvm::Attribute::AlwaysInline);
20 | 
21 |     // Find and inspect all function calls inside of this function
22 |     for (auto& bb : *f) {
23 |         for (auto& instruction : bb) {
24 |             if (CallInst* call_inst = dyn_cast<CallInst>(&instruction)) {
25 |                 if (call_inst->getCallingConv() != CallingConv::AMDGPU_Gfx) {
26 |                     call_inst->setCallingConv(CallingConv::AMDGPU_Gfx);
27 |                 }
28 | 
29 |                 if (Function* called_function = call_inst->getCalledFunction()) {
30 |                     fix_calling_conv(m, called_function, traversed_functions);
31 |                 }
32 |             }
33 |         }
34 |     }
35 | }
36 | 
37 | PreservedAnalyses PalPlatformFixCallingConventionPass::run(Module& M, ModuleAnalysisManager&) {
38 |     std::unordered_set<Function*> traversed_functions = {};
39 |     for (Function& entrypoint_fn : M) {
40 |         fix_calling_conv(&M, &entrypoint_fn, traversed_functions);
41 |     }
42 |     return PreservedAnalyses::all();
43 | }
44 | 
--------------------------------------------------------------------------------
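In LLVM IR terms, the pass above performs the following rewrite on every call reachable from a module entry point (editor's sketch, illustrative IR):

    ; before
    %r = call float @helper(float %x)
    ; after: calling convention forced, and @helper marked alwaysinline
    %r = call amdgpu_gfx float @helper(float %x)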
/src/pal/pal_fix_calling_convention_pass.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_PLATFORM_FIX_CALLING_CONVENTION_H
2 | #define PAL_PLATFORM_FIX_CALLING_CONVENTION_H
3 | 
4 | #include <llvm/IR/PassManager.h>
5 | 
6 | /// This pass sets the calling convention to AMDGPU_Gfx for all calls in the given module and sets the AlwaysInline
7 | /// Attribute on every called function in the module to avoid the LLVM AMDGPU backend throwing errors.
8 | struct PalPlatformFixCallingConventionPass : llvm::PassInfoMixin<PalPlatformFixCallingConventionPass> {
9 |     llvm::PreservedAnalyses run(llvm::Module& M, llvm::ModuleAnalysisManager&);
10 | };
11 | 
12 | #endif // PAL_PLATFORM_FIX_CALLING_CONVENTION_H
--------------------------------------------------------------------------------
/src/pal/pal_insert_halt_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_insert_halt_pass.h"
2 | #include "pal_utils.h"
3 | 
4 | #include <llvm/IR/Function.h>
5 | #include <llvm/IR/IRBuilder.h>
6 | #include <llvm/IR/InlineAsm.h>
7 | #include <llvm/IR/Module.h>
8 | 
9 | #include <cstdlib>
10 | 
11 | using namespace llvm;
12 | 
13 | PreservedAnalyses PalPlatformInsertHaltPass::run(Function& F, FunctionAnalysisManager&) {
14 |     char* halt_immediately = std::getenv("HALT_IMMEDIATELY");
15 |     if (F.getName() != pal_utils::ComputeShaderMainFnName || !halt_immediately
16 |         || strcmp(halt_immediately, "ON") != 0) {
17 |         return PreservedAnalyses::all();
18 |     }
19 |     assert(F.getCallingConv() == CallingConv::AMDGPU_CS);
20 |     LLVMContext& Ctx = F.getParent()->getContext();
21 |     BasicBlock& EntryBlock = *F.begin();
22 |     IRBuilder<> Builder(&(*EntryBlock.getFirstInsertionPt()));
23 |     ArrayRef<Value*> inline_asm_args;
24 |     InlineAsm* inline_assembly = InlineAsm::get(
25 |         FunctionType::get(Type::getVoidTy(Ctx), false), "s_sethalt 1", "", true, false, InlineAsm::AD_ATT);
26 |     Builder.CreateCall(inline_assembly, inline_asm_args);
27 |     return PreservedAnalyses::none();
28 | }
--------------------------------------------------------------------------------
/src/pal/pal_insert_halt_pass.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_PLATFORM_INSERT_HALT_PASS_H
2 | #define PAL_PLATFORM_INSERT_HALT_PASS_H
3 | 
4 | #include <llvm/IR/Function.h>
5 | #include <llvm/IR/PassManager.h>
6 | 
7 | /// Pass that inserts RDNA specific assembly to halt a shader as soon as it starts if the environment variable
8 | /// "HALT_IMMEDIATELY" is set to the value "ON"
9 | struct PalPlatformInsertHaltPass : llvm::PassInfoMixin<PalPlatformInsertHaltPass> {
10 |     llvm::PreservedAnalyses run(llvm::Function& F, llvm::FunctionAnalysisManager& FAM);
11 | };
12 | 
13 | #endif // PAL_PLATFORM_INSERT_HALT_PASS_H
--------------------------------------------------------------------------------
/src/pal/pal_lower_builtins_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_lower_builtins_pass.h"
2 | #include "pal_utils.h"
3 | 
4 | #include <array>
5 | #include <string>
6 | #include <unordered_set>
7 | #include <vector>
8 | 
9 | #include <llvm/ADT/ArrayRef.h>
10 | #include <llvm/ADT/StringRef.h>
11 | #include <llvm/IR/Function.h>
12 | #include <llvm/IR/IRBuilder.h>
13 | #include <llvm/IR/InlineAsm.h>
14 | #include <llvm/IR/Instructions.h>
15 | #include <llvm/IR/Module.h>
16 | 
17 | using namespace llvm;
18 | 
19 | namespace {
20 | // anonymous namespace to avoid name clashes
21 | 
22 | enum Builtins : int8_t {
23 |     workitem_id_x = 0,
24 |     workitem_id_y,
25 |     workitem_id_z,
26 |     workgroup_id_x,
27 |     workgroup_id_y,
28 |     workgroup_id_z,
29 |     nblk_x,
30 |     nblk_y,
31 |     nblk_z,
32 |     // Dynamic builtins (i.e., inlined code based on supplied metadata)
33 |     workgroup_size_x,
34 |     workgroup_size_y,
35 |     workgroup_size_z,
36 |     count
37 | };
38 | 
39 | constexpr const char* BuiltinNames[] = {
40 |     "anydsl.amdpal.workitem.id.x",
41 |     "anydsl.amdpal.workitem.id.y",
42 |     "anydsl.amdpal.workitem.id.z",
43 |     "anydsl.amdpal.workgroup.id.x",
44 |     "anydsl.amdpal.workgroup.id.y",
45 |     "anydsl.amdpal.workgroup.id.z",
46 |     "anydsl.amdpal.nblk.x",
47 |     "anydsl.amdpal.nblk.y",
48 |     "anydsl.amdpal.nblk.z",
49 |     // Dynamic builtins (i.e., inlined code based on supplied metadata)
50 |     "anydsl.amdpal.workgroup.size.x",
51 |     "anydsl.amdpal.workgroup.size.y",
"anydsl.amdpal.workgroup.size.z", 53 | }; 54 | 55 | struct BuiltinAssemblyInfo { 56 | const char* asmString; 57 | const char* asmConstraints; 58 | }; 59 | 60 | // PAL SGPR layout: 61 | // s0-1: PAL reserved data -> set up by PAL because of pipeline register configuration in PALPlatform 62 | // s2-3: pointer to pal kernel args (for compute shader) 63 | // -> set up by AnyDSL PALPlatform 64 | // s4-5: pointer to NumWorkGroups struct (i.e., nblk) 65 | // s6-12: reserved for future use 66 | // s13-15: work group id x, y, and z -> set up by AnyDSL PALPlatform by supplying pgm_rsrc2 67 | // ENABLE_SGPR_WORKGROUP_ID_ to PAL pipeline setup 68 | 69 | const BuiltinAssemblyInfo BuiltinAssemblyInfos[]{ 70 | // workitem_id_x 71 | { 72 | .asmString = "; local thread id x is in v0", 73 | .asmConstraints = "={v0}", 74 | }, 75 | // workitem_id_y 76 | { 77 | .asmString = "; local thread id y is in v1", 78 | .asmConstraints = "={v1}", 79 | }, 80 | // workitem_id_z 81 | { 82 | .asmString = "; local thread id z is in v2", 83 | .asmConstraints = "={v2}", 84 | }, 85 | // workgroup_id_x 86 | { 87 | .asmString = "; workgroup id x is in s13", 88 | .asmConstraints = "={s13}", 89 | }, 90 | // workgroup_id_y 91 | { 92 | .asmString = "; workgroup id y is in s14", 93 | .asmConstraints = "={s14}", 94 | }, 95 | // workgroup_id_z 96 | { 97 | .asmString = "; workgroup id z is in s15", 98 | .asmConstraints = "={s15}", 99 | }, 100 | // nblk_x 101 | { 102 | .asmString = "s_load_dword $0, s[4:5], 0x00", 103 | .asmConstraints = "=s", 104 | }, 105 | // nblk_y 106 | { 107 | .asmString = "s_load_dword $0, s[4:5], 0x04", 108 | .asmConstraints = "=s", 109 | }, 110 | // nblk_z 111 | { 112 | .asmString = "s_load_dword $0, s[4:5], 0x08", 113 | .asmConstraints = "=s", 114 | }, 115 | }; 116 | 117 | typedef std::array, static_cast(Builtins::count)> BuiltinsCallInstMap; 118 | 119 | Builtins GetBuiltinID(Function* f) { 120 | const StringRef f_name = f->getName(); 121 | for (int8_t i = 0; i < Builtins::count; ++i) { 122 | if (f_name == BuiltinNames[i]) { 123 | return Builtins(i); 124 | } 125 | } 126 | return Builtins::count; 127 | } 128 | 129 | const BuiltinAssemblyInfo& GetAssemblyInfo(Builtins builtinID) { 130 | return BuiltinAssemblyInfos[static_cast(builtinID)]; 131 | } 132 | 133 | bool IsBuiltin(Function* f) { return GetBuiltinID(f) < Builtins::count; } 134 | 135 | void find_builtins_calls(Module* m, Function* f, BuiltinsCallInstMap& builtins_call_instances, 136 | std::unordered_set& traversed_functions) { 137 | if (traversed_functions.find(f) != traversed_functions.end()) { 138 | // already visited this function -> prevent recursive loop 139 | return; 140 | } 141 | 142 | traversed_functions.insert(f); 143 | 144 | // Find and inspect all function calls inside of this function 145 | for (auto& bb : *f) { 146 | for (auto& instruction : bb) { 147 | CallInst* callInst = dyn_cast(&instruction); 148 | if (!callInst) { 149 | continue; 150 | } 151 | 152 | Function* calledFunction = callInst->getCalledFunction(); 153 | if (!calledFunction) { 154 | continue; 155 | } 156 | 157 | if (IsBuiltin(calledFunction)) { 158 | // If the call we found is calling a builtin, record the builtins usage 159 | Builtins builtinID = GetBuiltinID(calledFunction); 160 | builtins_call_instances[static_cast(builtinID)].push_back(callInst); 161 | } else if (calledFunction->getParent() == m) { 162 | // If the called function is within this module, recursively search it for 163 | // builtins used 164 | find_builtins_calls(m, calledFunction, builtins_call_instances, 
165 |             }
166 |         }
167 |     }
168 | }
169 | 
170 | Function* find_entrypoint(Module& M) {
171 |     for (Function& F : M) {
172 |         const auto name = F.getName();
173 |         if (name.equals(pal_utils::ComputeShaderMainFnName))
174 |             return &F;
175 |     }
176 | 
177 |     return nullptr;
178 | }
179 | 
180 | // Function taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
181 | BasicBlock::iterator getInsertPt(BasicBlock& BB) {
182 |     BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
183 |     for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
184 |         AllocaInst* AI = dyn_cast<AllocaInst>(&*InsPt);
185 | 
186 |         // If this is a dynamic alloca, the value may depend on the loaded kernargs,
187 |         // so loads will need to be inserted before it.
188 |         if (!AI || !AI->isStaticAlloca())
189 |             break;
190 |     }
191 | 
192 |     return InsPt;
193 | }
194 | 
195 | CallInst* insert_asm(
196 |     IRBuilder<>& Builder, LLVMContext& Ctx, const char* asm_string, const char* asm_constraint) {
197 |     ArrayRef<Value*> inline_asm_args;
198 |     InlineAsm* inline_assembly = InlineAsm::get(FunctionType::get(Type::getInt32Ty(Ctx), false), asm_string,
199 |         asm_constraint, true, false, InlineAsm::AD_ATT);
200 |     return Builder.CreateCall(inline_assembly, inline_asm_args);
201 | }
202 | 
203 | // Inserts assembly code to split the local thread id from v0 into v0(x), v1(y) and v2(z).
204 | // This is only applicable for GPUs >= gfx 11.
205 | void insert_asm_to_split_local_thread_id(IRBuilder<>& Builder, LLVMContext& Ctx,
206 |     const BuiltinsCallInstMap& builtins_call_instances, Pal::GfxIpLevel gfx_level) {
207 |     assert(gfx_level >= Pal::GfxIpLevel::GfxIp11_0);
208 |     // Write local thread id z into v2.
209 |     if (!builtins_call_instances[Builtins::workitem_id_z].empty()) {
210 |         insert_asm(Builder, Ctx,
211 |             "; def v2 local thread id z is in v0[29:20] (v0[31:30] set to 0 by hardware)\n\t"
212 |             "V_LSHRREV_B32 v2 20 v0",
213 |             "={v2}");
214 |     }
215 |     // Write local thread id y into v1.
216 |     if (!builtins_call_instances[Builtins::workitem_id_y].empty()) {
217 |         insert_asm(Builder, Ctx,
218 |             "; def v1 local thread id y is in v0[19:10]\n\t"
219 |             "V_LSHRREV_B32 v1 10 v0\n\t"
220 |             "V_AND_B32 v1 v1 0x3FF",
221 |             "={v1}");
222 |     }
223 |     // Write local thread id x into v0 last to make sure v0 is not overwritten yet.
224 |     if (!builtins_call_instances[Builtins::workitem_id_x].empty()) {
225 |         insert_asm(Builder, Ctx,
226 |             "; def v0 local thread id x is in v0[9:0]\n\t"
227 |             "V_AND_B32 v0 v0 0x3FF",
228 |             "={v0}");
229 |     }
230 | }
231 | 
232 | } // namespace
233 | 
234 | PreservedAnalyses PalPlatformLowerBuiltinsPass::run(Module& M, ModuleAnalysisManager&) {
235 |     Function* entrypoint_fn = find_entrypoint(M);
236 |     assert(entrypoint_fn);
237 | 
238 |     /*
239 |     Find all calls to builtins and unique them
240 |     -> i.e. every builtin is only called exactly once right at the beginning of the shader.
241 | 
242 |     for each instruction in entrypoint:
243 |         if call to builtin:
244 |             record builtin (unique set of used_builtins + all separate calls to them!)
245 | elif call to another function inside this module: 246 | recursively find all calls of used built_ins 247 | else: don't care 248 | 249 | for each used_builtin: 250 | Value* real_builtin = insert inline_asm at beginning of entrypoint 251 | for each call instance of the builtin: 252 | replace all uses of call instance with real_builtin 253 | remove old call instance 254 | */ 255 | 256 | BuiltinsCallInstMap builtins_call_instances; 257 | std::unordered_set traversed_functions = {}; 258 | find_builtins_calls(&M, entrypoint_fn, builtins_call_instances, traversed_functions); 259 | 260 | LLVMContext& Ctx = M.getContext(); 261 | BasicBlock& EntryBlock = *entrypoint_fn->begin(); 262 | IRBuilder<> Builder(&*getInsertPt(EntryBlock)); 263 | 264 | if (gfx_level_ >= Pal::GfxIpLevel::GfxIp11_0) { 265 | insert_asm_to_split_local_thread_id(Builder, Ctx, builtins_call_instances, gfx_level_); 266 | } 267 | 268 | int builtins_count = static_cast(Builtins::count); 269 | for (int i = 0; i < builtins_count; ++i) { 270 | const Builtins builtin_id = Builtins(i); 271 | const std::vector builtin_call_instances = builtins_call_instances[i]; 272 | if (builtin_call_instances.empty()) { 273 | continue; 274 | } 275 | 276 | CallInst* lowered_unique_builtin = nullptr; 277 | switch (builtin_id) { 278 | case Builtins::workgroup_size_x: 279 | lowered_unique_builtin = insert_asm(Builder, Ctx, 280 | ("s_mov_b32 $0, " + std::to_string(tg_dims_[0]) + "; workgroup size x").c_str(), "=s"); 281 | break; 282 | case Builtins::workgroup_size_y: 283 | lowered_unique_builtin = insert_asm(Builder, Ctx, 284 | ("s_mov_b32 $0, " + std::to_string(tg_dims_[1]) + "; workgroup size y").c_str(), "=s"); 285 | break; 286 | case Builtins::workgroup_size_z: 287 | lowered_unique_builtin = insert_asm(Builder, Ctx, 288 | ("s_mov_b32 $0, " + std::to_string(tg_dims_[2]) + "; workgroup size z").c_str(), "=s"); 289 | break; 290 | default: 291 | const auto& assemblyInfo = GetAssemblyInfo(builtin_id); 292 | lowered_unique_builtin = 293 | insert_asm(Builder, Ctx, assemblyInfo.asmString, assemblyInfo.asmConstraints); 294 | } 295 | 296 | for (CallInst* call_to_builtin : builtin_call_instances) { 297 | call_to_builtin->replaceAllUsesWith(lowered_unique_builtin); 298 | } 299 | } 300 | 301 | for (int i = 0; i < static_cast(builtins_count); ++i) { 302 | const std::vector builtin_call_instances = builtins_call_instances[i]; 303 | for (CallInst* call_to_builtin : builtin_call_instances) { 304 | call_to_builtin->eraseFromParent(); 305 | } 306 | } 307 | // All uncalled functions from the module have to be removed because any kernels other than the one 308 | // marked as entrypoint may contain calls to builtins which have not been resolved by this pass but 309 | // may trip up linkers/relocations. Therefore we set all functions to internal linkage, except the 310 | // known entrypoint. This way, the global dead code elimination pass can remove them for us. 311 | for (Function& F : M) { 312 | if (F.getName().startswith("llvm")) { 313 | // Don't mark llvm intrinsics as internal linkage, otherwise they get 314 | // altered/removed which breaks backend codegen. 
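// (Intrinsics are declarations without a body, and a declaration with
// internal linkage is not valid LLVM IR, so they must keep their linkage.)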
315 | continue; 316 | } 317 | F.setLinkage(GlobalValue::LinkageTypes::InternalLinkage); 318 | } 319 | entrypoint_fn->setLinkage(GlobalValue::LinkageTypes::ExternalLinkage); 320 | 321 | return PreservedAnalyses::none(); 322 | } -------------------------------------------------------------------------------- /src/pal/pal_lower_builtins_pass.h: -------------------------------------------------------------------------------- 1 | #ifndef PAL_PLATFORM_LOWER_BUILTINS_H 2 | #define PAL_PLATFORM_LOWER_BUILTINS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | /// This pass takes care of replacing calls to so-called "builtins" (i.e. local/global thread indices, and similar 14 | /// compute shader builtin values) with the appropriate amdgpu inline assembly that extracts the values from 15 | /// prepopulated SGPRs according to the RDNA2 or RDNA3 ABI. This pass only supports gfx-levels 10 and 11. 16 | struct PalPlatformLowerBuiltinsPass : llvm::PassInfoMixin { 17 | PalPlatformLowerBuiltinsPass( 18 | Pal::GfxIpLevel gfx_level, std::array tg_dims) 19 | : gfx_level_(gfx_level) 20 | , tg_dims_(tg_dims) {} 21 | 22 | PalPlatformLowerBuiltinsPass(const PalPlatformLowerBuiltinsPass& other) = default; 23 | PalPlatformLowerBuiltinsPass& operator=(const PalPlatformLowerBuiltinsPass& other) = default; 24 | PalPlatformLowerBuiltinsPass(PalPlatformLowerBuiltinsPass&& other) = default; 25 | PalPlatformLowerBuiltinsPass& operator=(PalPlatformLowerBuiltinsPass&& other) = default; 26 | ~PalPlatformLowerBuiltinsPass() = default; 27 | 28 | llvm::PreservedAnalyses run(llvm::Module& M, llvm::ModuleAnalysisManager&); 29 | 30 | private: 31 | Pal::GfxIpLevel gfx_level_; 32 | std::array tg_dims_; 33 | }; 34 | 35 | #endif // PAL_PLATFORM_LOWER_BUILTINS_H -------------------------------------------------------------------------------- /src/pal/pal_lower_kernel_arguments_pass.cpp: -------------------------------------------------------------------------------- 1 | #include "pal_lower_kernel_arguments_pass.h" 2 | #include "pal_utils.h" 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace llvm; 15 | 16 | namespace { 17 | // anonymous namespace to avoid name clashes 18 | 19 | // Function taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp 20 | BasicBlock::iterator getInsertPt(BasicBlock& BB) { 21 | BasicBlock::iterator InsPt = BB.getFirstInsertionPt(); 22 | for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) { 23 | AllocaInst* AI = dyn_cast(&*InsPt); 24 | 25 | // If this is a dynamic alloca, the value may depend on the loaded kernargs, 26 | // so loads will need to be inserted before it. 27 | if (!AI || !AI->isStaticAlloca()) 28 | break; 29 | } 30 | 31 | return InsPt; 32 | } 33 | 34 | // Function based on AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, Align &MaxAlign) 35 | // Taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp 36 | uint64_t getExplicitKernArgSize(const Function& F, Align& MaxAlign) { 37 | assert(F.getCallingConv() == CallingConv::AMDGPU_CS); 38 | 39 | const DataLayout& DL = F.getParent()->getDataLayout(); 40 | uint64_t ExplicitArgBytes = 0; 41 | MaxAlign = Align(1); 42 | 43 | for (const Argument& Arg : F.args()) { 44 | const bool IsByRef = Arg.hasByRefAttr(); 45 | Type* ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); 46 | MaybeAlign ParamAlign = IsByRef ? 
Arg.getParamAlign() : std::nullopt; 47 | Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy); 48 | 49 | uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); 50 | ExplicitArgBytes = alignTo(ExplicitArgBytes, ABITypeAlign) + AllocSize; 51 | MaxAlign = std::max(MaxAlign, ABITypeAlign); 52 | } 53 | 54 | return ExplicitArgBytes; 55 | } 56 | 57 | // Function based on AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, Align &MaxAlign) 58 | // Taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp 59 | unsigned getKernArgSegmentSize(const Function& F, Align& MaxAlign) { 60 | uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); 61 | unsigned ExplicitOffset = 0; 62 | // Being able to dereference past the end is useful for emitting scalar loads. 63 | return alignTo(ExplicitOffset + ExplicitArgBytes, 4); 64 | } 65 | } // namespace 66 | 67 | // Largely based on the function AMDGPULowerKernelArguments::runOnFunction(Function &F) 68 | // taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp 69 | // Minor adaptations added to satisfy the AnyDSL PALPlatform requirements. 70 | PreservedAnalyses PalPlatformLowerKernelArgumentsPass::run(Function& F, FunctionAnalysisManager&) { 71 | const auto& funcname = F.getName(); 72 | if (funcname != pal_utils::ComputeShaderMainFnName || F.arg_empty()) { 73 | // Only the entry point function's parameters are kernel arguments that need to be lowered. 74 | return PreservedAnalyses::all(); 75 | } 76 | assert(F.getCallingConv() == CallingConv::AMDGPU_CS); 77 | 78 | LLVMContext& Ctx = F.getParent()->getContext(); 79 | const DataLayout& DL = F.getParent()->getDataLayout(); 80 | BasicBlock& EntryBlock = *F.begin(); 81 | IRBuilder<> Builder(&*getInsertPt(EntryBlock)); 82 | 83 | const Align KernArgBaseAlign(16); // FIXME: Increase if necessary 84 | const uint64_t BaseOffset = 0; // We don't have any data preceding the kernel arguments 85 | 86 | Align MaxAlign; 87 | // TODO: We have to extract that from the Function arguments ourselves! 
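// Worked example with a hypothetical argument list (i32, ptr addrspace(1), i16):
// getExplicitKernArgSize accumulates alignTo(0,4)+4 = 4, alignTo(4,8)+8 = 16,
// alignTo(16,2)+2 = 18 bytes, and getKernArgSegmentSize rounds up to a
// multiple of 4 -> a 20-byte kernarg segment.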
88 | const uint64_t TotalKernArgSize = getKernArgSegmentSize(F, MaxAlign); 89 | if (TotalKernArgSize == 0) 90 | return PreservedAnalyses::all(); 91 | 92 | // Generate Our own ISA to get the pointer to the buffer containing the kernel arguments 93 | // PALPlatform ensures that registers s[2:3] contain this address when the kernel starts execution 94 | std::string asmString = std::string("; def $0 pointer to buffer containing the kernel args is set up in s[2:3]"); 95 | // Constraints reference: https://llvm.org/docs/LangRef.html#inline-asm-constraint-string 96 | // This constraint states that our inline assembly returns ("="-prefix indicates constraint for output) 97 | // its result in sgprs 2-3 98 | StringRef constraints = "={s[2:3]}"; 99 | ArrayRef inline_asm_args = std::nullopt; 100 | 101 | // Value taken from AMDGPU.h (namespace AMDGPUAS) 102 | // global address space pointing to memory that won't change during execution 103 | unsigned CONSTANT_ADDRESS = 4; 104 | InlineAsm* inline_assembly = 105 | InlineAsm::get(FunctionType::get(Type::getInt8PtrTy(Ctx, CONSTANT_ADDRESS), false), asmString.c_str(), 106 | constraints, true, false, InlineAsm::AD_ATT); 107 | CallInst* KernArgSegment = Builder.CreateCall(inline_assembly, inline_asm_args); 108 | 109 | KernArgSegment->addRetAttr(Attribute::NonNull); 110 | KernArgSegment->addRetAttr(Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); 111 | unsigned AS = KernArgSegment->getType()->getPointerAddressSpace(); 112 | 113 | uint64_t ExplicitArgOffset = 0; 114 | 115 | for (Argument& Arg : F.args()) { 116 | const bool IsByRef = Arg.hasByRefAttr(); 117 | Type* ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); 118 | MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt; 119 | Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy); 120 | 121 | uint64_t Size = DL.getTypeSizeInBits(ArgTy); 122 | uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); 123 | 124 | uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; 125 | ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; 126 | 127 | if (Arg.use_empty()) 128 | continue; 129 | 130 | // If this is byval, the loads are already explicit in the function. We just 131 | // need to rewrite the pointer values. 132 | if (IsByRef) { 133 | Value* ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64( 134 | Builder.getInt8Ty(), KernArgSegment, EltOffset, Arg.getName() + ".byval.kernarg.offset"); 135 | 136 | Value* CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(ArgOffsetPtr, Arg.getType()); 137 | Arg.replaceAllUsesWith(CastOffsetPtr); 138 | continue; 139 | } 140 | 141 | if (PointerType* PT = dyn_cast(ArgTy)) { 142 | // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing 143 | // modes on SI to know the high bits are 0 so pointer adds don't wrap. We 144 | // can't represent this with range metadata because it's only allowed for 145 | // integer types. 146 | 147 | // Values taken from AMDGPU.h (namespace AMDGPUAS) 148 | const unsigned REGION_ADDRESS = 2; ///< Address space for region memory. (GDS) 149 | const unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory. 150 | if ((PT->getAddressSpace() == LOCAL_ADDRESS || PT->getAddressSpace() == REGION_ADDRESS)) 151 | continue; 152 | 153 | // FIXME: We can replace this with equivalent alias.scope/noalias 154 | // metadata, but this appears to be a lot of work. 
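// Until then, skipping keeps noalias pointer arguments as plain arguments
// instead of silently dropping the attribute on the replacing load.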
155 | if (Arg.hasNoAliasAttr()) 156 | continue; 157 | } 158 | 159 | auto* VT = dyn_cast(ArgTy); 160 | bool IsV3 = VT && VT->getNumElements() == 3; 161 | bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType(); 162 | 163 | VectorType* V4Ty = nullptr; 164 | 165 | int64_t AlignDownOffset = alignDown(EltOffset, 4); 166 | int64_t OffsetDiff = EltOffset - AlignDownOffset; 167 | Align AdjustedAlign = commonAlignment(KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset); 168 | 169 | Value* ArgPtr; 170 | Type* AdjustedArgTy; 171 | if (DoShiftOpt) { // FIXME: Handle aggregate types 172 | // Since we don't have sub-dword scalar loads, avoid doing an extload by 173 | // loading earlier than the argument address, and extracting the relevant 174 | // bits. 175 | // 176 | // Additionally widen any sub-dword load to i32 even if suitably aligned, 177 | // so that CSE between different argument loads works easily. 178 | ArgPtr = Builder.CreateConstInBoundsGEP1_64(Builder.getInt8Ty(), KernArgSegment, AlignDownOffset, 179 | Arg.getName() + ".kernarg.offset.align.down"); 180 | AdjustedArgTy = Builder.getInt32Ty(); 181 | } else { 182 | ArgPtr = Builder.CreateConstInBoundsGEP1_64( 183 | Builder.getInt8Ty(), KernArgSegment, EltOffset, Arg.getName() + ".kernarg.offset"); 184 | AdjustedArgTy = ArgTy; 185 | } 186 | 187 | if (IsV3 && Size >= 32) { 188 | V4Ty = FixedVectorType::get(VT->getElementType(), 4); 189 | // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads 190 | AdjustedArgTy = V4Ty; 191 | } 192 | 193 | ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS), ArgPtr->getName() + ".cast"); 194 | LoadInst* Load = Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign); 195 | Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); 196 | 197 | MDBuilder MDB(Ctx); 198 | 199 | if (isa(ArgTy)) { 200 | if (Arg.hasNonNullAttr()) 201 | Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {})); 202 | 203 | uint64_t DerefBytes = Arg.getDereferenceableBytes(); 204 | if (DerefBytes != 0) { 205 | Load->setMetadata(LLVMContext::MD_dereferenceable, 206 | MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), DerefBytes)))); 207 | } 208 | 209 | uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes(); 210 | if (DerefOrNullBytes != 0) { 211 | Load->setMetadata(LLVMContext::MD_dereferenceable_or_null, 212 | MDNode::get( 213 | Ctx, MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), DerefOrNullBytes)))); 214 | } 215 | 216 | auto ParamMaybeAlign = Arg.getParamAlign(); 217 | if (ParamMaybeAlign.has_value()) { 218 | Load->setMetadata(LLVMContext::MD_align, 219 | MDNode::get(Ctx, MDB.createConstant(ConstantInt::get( 220 | Builder.getInt64Ty(), ParamMaybeAlign.valueOrOne().value())))); 221 | } 222 | } 223 | 224 | // TODO: Convert noalias arg to !noalias 225 | 226 | if (DoShiftOpt) { 227 | Value* ExtractBits = OffsetDiff == 0 ? 
Load : Builder.CreateLShr(Load, OffsetDiff * 8); 228 | 229 | IntegerType* ArgIntTy = Builder.getIntNTy(Size); 230 | Value* Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy); 231 | Value* NewVal = Builder.CreateBitCast(Trunc, ArgTy, Arg.getName() + ".load"); 232 | Arg.replaceAllUsesWith(NewVal); 233 | } else if (IsV3) { 234 | Value* Shuf = Builder.CreateShuffleVector(Load, ArrayRef{0, 1, 2}, Arg.getName() + ".load"); 235 | Arg.replaceAllUsesWith(Shuf); 236 | } else { 237 | Load->setName(Arg.getName() + ".load"); 238 | Arg.replaceAllUsesWith(Load); 239 | } 240 | } 241 | 242 | KernArgSegment->addRetAttr(Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); 243 | 244 | return PreservedAnalyses::none(); 245 | } -------------------------------------------------------------------------------- /src/pal/pal_lower_kernel_arguments_pass.h: -------------------------------------------------------------------------------- 1 | #ifndef PAL_PLATFORM_LOWER_KERNEL_ARGUMENTS_H 2 | #define PAL_PLATFORM_LOWER_KERNEL_ARGUMENTS_H 3 | 4 | #include 5 | 6 | /// This pass replaces accesses to kernel arguments with loads from offsets from a manually supplied buffer 7 | /// containing these arguments. The pointer to this buffer is expected to be prepopulated into specific sgprs 8 | /// by the PALPlatform. 9 | /// 10 | /// This pass is an almost 1:1 replicate of the AMDGPULowerKernelArguments pass 11 | /// (llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp) 12 | struct PalPlatformLowerKernelArgumentsPass : llvm::PassInfoMixin { 13 | PalPlatformLowerKernelArgumentsPass(){} 14 | 15 | PalPlatformLowerKernelArgumentsPass(const PalPlatformLowerKernelArgumentsPass& other) = default; 16 | PalPlatformLowerKernelArgumentsPass& operator=(const PalPlatformLowerKernelArgumentsPass& other) = default; 17 | PalPlatformLowerKernelArgumentsPass(PalPlatformLowerKernelArgumentsPass&& other) = default; 18 | PalPlatformLowerKernelArgumentsPass& operator=(PalPlatformLowerKernelArgumentsPass&& other) = default; 19 | ~PalPlatformLowerKernelArgumentsPass() = default; 20 | 21 | llvm::PreservedAnalyses run(llvm::Function& F, llvm::FunctionAnalysisManager& FAM); 22 | }; 23 | 24 | #endif // PAL_PLATFORM_LOWER_KERNEL_ARGUMENTS_H -------------------------------------------------------------------------------- /src/pal/pal_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef PAL_UTILS_H 2 | #define PAL_UTILS_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace pal_utils { 15 | 16 | std::string llvm_diagnostic_to_string(const llvm::SMDiagnostic& diagnostic_err); 17 | 18 | struct ShaderSrc { 19 | const std::string kernelname; 20 | const std::string src_code; 21 | const std::string filename; 22 | const llvm::Function* function; 23 | llvm::LLVMContext llvm_context; 24 | std::unique_ptr llvm_module; 25 | llvm::SMDiagnostic diagnostic_err; 26 | 27 | ShaderSrc(const std::string& filename, const std::string& src_code, const std::string& kernelname); 28 | bool rename_entry_point(); 29 | }; 30 | 31 | // Create the metadata that PAL expects to be attached to a kernel/shader binary. 
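// The result is a MessagePack document (PAL code-object metadata) that records,
// among other things, the thread group dimensions and wavefront size passed in below.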
32 | llvm::msgpack::Document build_metadata(const ShaderSrc& shader_src, Pal::GfxIpLevel gfx_level, 33 | const std::array& thread_group_dimensions, uint32_t wavefront_size); 34 | 35 | const char* get_gpu_name(const Pal::AsicRevision asic_revision); 36 | 37 | const char* get_gfx_isa_id(const Pal::GfxIpLevel gfxip_level); 38 | 39 | bool isAMDGPUEntryFunctionCC(llvm::CallingConv::ID CC); 40 | 41 | void write_to_memory( 42 | Pal::IGpuMemory* dst_memory, int64_t dst_memory_offset, const void* src_data, int64_t size); 43 | void read_from_memory(void* dst_buffer, Pal::IGpuMemory* src_memory, int64_t src_memory_offset, int64_t size); 44 | 45 | // Returns a gpu-local memory heap that fits memory_size. 46 | // Order of importance: 1.GpuHeapInvisible, 2.GpuHeapLocal 47 | // Returns Pal::GpuHeap::GpuHeapCount if no appropriate heap can be found. 48 | Pal::GpuHeap find_gpu_local_heap(const Pal::IDevice* device, Pal::gpusize memory_size); 49 | 50 | bool allocation_is_host_visible(Pal::IGpuMemory* gpu_allocation); 51 | 52 | llvm::MDNode* get_metadata_mdnode(const llvm::Function* func, const char* key, int index = 0); 53 | llvm::StringRef get_metadata_string(const llvm::Function* func, const char* key); 54 | uint64_t get_metadata_uint(const llvm::Function* func, const char* key, int index = 0); 55 | 56 | extern const char* ComputeShaderMainFnName; 57 | 58 | } // namespace pal_utils 59 | 60 | #define CHECK_PAL(err, name) { if (err != Pal::Result::Success) { error("PAL API function % [file %, line %]: %", name, __FILE__, __LINE__, static_cast(err)); } } 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /src/pal_platform.h: -------------------------------------------------------------------------------- 1 | #ifndef PAL_PLATFORM_H 2 | #define PAL_PLATFORM_H 3 | 4 | #include "pal/pal_device.h" 5 | #include "pal/pal_utils.h" 6 | #include "platform.h" 7 | #include "runtime.h" 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #ifdef AnyDSL_runtime_HAS_LLVM_SUPPORT 22 | #include 23 | #endif 24 | 25 | class PALPlatform : public Platform { 26 | public: 27 | PALPlatform(Runtime* runtime); 28 | ~PALPlatform(); 29 | 30 | protected: 31 | void* alloc(DeviceId dev, int64_t size) override; 32 | void* alloc_host(DeviceId dev, int64_t size) override; 33 | void* alloc_unified(DeviceId dev, int64_t size) override; 34 | void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); } 35 | void release(DeviceId dev, void* ptr) override; 36 | void release_host(DeviceId dev, void* ptr) override; 37 | 38 | void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override; 39 | 40 | void synchronize(DeviceId dev) override; 41 | 42 | void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, 43 | int64_t size) override; 44 | void copy_from_host( 45 | const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override; 46 | void copy_to_host( 47 | DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override; 48 | 49 | size_t dev_count() const override { return devices_.size(); } 50 | std::string name() const override { return "PAL"; } 51 | const char* device_name(DeviceId dev) const override; 52 | bool device_check_feature_support(DeviceId, const char*) const override { return false; } 53 | 54 | Pal::IPipeline* load_kernel(DeviceId 
dev, const std::string& filename, const std::string& kernelname); 55 | std::string compile_gcn(DeviceId dev, pal_utils::ShaderSrc&& shader_src) const; 56 | std::string emit_gcn(pal_utils::ShaderSrc&& shader_src, const std::string& cpu, 57 | Pal::GfxIpLevel gfx_level, llvm::OptimizationLevel opt) const; 58 | 59 | protected: 60 | Pal::IPlatform* platform_; 61 | std::vector devices_; 62 | }; 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /src/platform.h: -------------------------------------------------------------------------------- 1 | #ifndef PLATFORM_H 2 | #define PLATFORM_H 3 | 4 | #include "anydsl_runtime_config.h" 5 | #include "log.h" 6 | #include "runtime.h" 7 | 8 | #include 9 | #include 10 | 11 | void register_cpu_platform(Runtime*); 12 | void register_cuda_platform(Runtime*); 13 | void register_opencl_platform(Runtime*); 14 | void register_hsa_platform(Runtime*); 15 | void register_pal_platform(Runtime*); 16 | void register_levelzero_platform(Runtime*); 17 | 18 | /// A runtime platform. Exposes a set of devices, a copy function, 19 | /// and functions to allocate and release memory. 20 | class Platform { 21 | public: 22 | Platform(Runtime* runtime) 23 | : runtime_(runtime) 24 | {} 25 | 26 | virtual ~Platform() {} 27 | 28 | /// Allocates memory for a device on this platform. 29 | virtual void* alloc(DeviceId dev, int64_t size) = 0; 30 | /// Allocates page-locked host memory for a platform (and a device). 31 | virtual void* alloc_host(DeviceId dev, int64_t size) = 0; 32 | /// Allocates unified memory for a platform (and a device). 33 | virtual void* alloc_unified(DeviceId dev, int64_t size) = 0; 34 | /// Returns the device memory associated with the page-locked memory. 35 | virtual void* get_device_ptr(DeviceId dev, void* ptr) = 0; 36 | /// Releases memory for a device on this platform. 37 | virtual void release(DeviceId dev, void* ptr) = 0; 38 | /// Releases page-locked host memory for a device on this platform. 39 | virtual void release_host(DeviceId dev, void* ptr) = 0; 40 | 41 | /// Launches a kernel with the given block/grid size and arguments. 42 | virtual void launch_kernel(DeviceId dev, const LaunchParams& launch_params) = 0; 43 | /// Waits for the completion of all the launched kernels on the given device. 44 | virtual void synchronize(DeviceId dev) = 0; 45 | 46 | /// Copies memory. Copy can only be performed devices in the same platform. 47 | virtual void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) = 0; 48 | /// Copies memory from the host (CPU). 49 | virtual void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) = 0; 50 | /// Copies memory to the host (CPU). 51 | virtual void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) = 0; 52 | 53 | /// Returns the platform name. 54 | virtual std::string name() const = 0; 55 | /// Returns the number of devices in this platform. 56 | virtual size_t dev_count() const = 0; 57 | /// Returns the name of the given device. 58 | virtual const char* device_name(DeviceId dev) const = 0; 59 | /// Checks whether the given platform-specific feature is supported on the given device. 
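/// Feature names are platform-specific strings; platforms without queryable
/// features simply return false (see e.g. the PAL platform above).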
60 | virtual bool device_check_feature_support(DeviceId dev, const char* feature) const = 0; 61 | 62 | protected: 63 | [[noreturn]] void platform_error() { 64 | error("The selected '%' platform is not available", name()); 65 | } 66 | 67 | [[noreturn]] void command_unavailable(const std::string& command) { 68 | error("The command '%' is unavailable on platform '%'", command, name()); 69 | } 70 | 71 | Runtime* runtime_; 72 | }; 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /src/runtime.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "anydsl_runtime.h" 5 | 6 | #include "runtime.h" 7 | #include "platform.h" 8 | #include "dummy_platform.h" 9 | #include "cpu_platform.h" 10 | 11 | #ifndef AnyDSL_runtime_HAS_CUDA_SUPPORT 12 | void register_cuda_platform(Runtime* runtime) { runtime->register_platform("CUDA"); } 13 | #endif 14 | #ifndef AnyDSL_runtime_HAS_OPENCL_SUPPORT 15 | void register_opencl_platform(Runtime* runtime) { runtime->register_platform("OpenCL"); } 16 | #endif 17 | #ifndef AnyDSL_runtime_HAS_HSA_SUPPORT 18 | void register_hsa_platform(Runtime* runtime) { runtime->register_platform("HSA"); } 19 | #endif 20 | #ifndef AnyDSL_runtime_HAS_PAL_SUPPORT 21 | void register_pal_platform(Runtime* runtime) { runtime->register_platform("PAL"); } 22 | #endif 23 | #ifndef AnyDSL_runtime_HAS_LEVELZERO_SUPPORT 24 | void register_levelzero_platform(Runtime* runtime) { runtime->register_platform("Level Zero"); } 25 | #endif 26 | 27 | Runtime::Runtime(std::pair profile) 28 | : profile_(profile) 29 | , cache_dir_("") 30 | {} 31 | 32 | void Runtime::display_info() const { 33 | info("Available platforms:"); 34 | for (auto& p: platforms_) { 35 | info(" * %: % device(s)", p->name(), p->dev_count()); 36 | for (size_t d=0; ddev_count(); ++d) 37 | info(" + (%) %", d, p->device_name(DeviceId(d))); 38 | } 39 | } 40 | 41 | const char* Runtime::device_name(PlatformId plat, DeviceId dev) const { 42 | check_device(plat, dev); 43 | return platforms_[plat]->device_name(dev); 44 | } 45 | 46 | bool Runtime::device_check_feature_support(PlatformId plat, DeviceId dev, const char* feature) const { 47 | check_device(plat, dev); 48 | return platforms_[plat]->device_check_feature_support(dev, feature); 49 | } 50 | 51 | void* Runtime::alloc(PlatformId plat, DeviceId dev, int64_t size) { 52 | check_device(plat, dev); 53 | return platforms_[plat]->alloc(dev, size); 54 | } 55 | 56 | void* Runtime::alloc_host(PlatformId plat, DeviceId dev, int64_t size) { 57 | check_device(plat, dev); 58 | return platforms_[plat]->alloc_host(dev, size); 59 | } 60 | 61 | void* Runtime::alloc_unified(PlatformId plat, DeviceId dev, int64_t size) { 62 | check_device(plat, dev); 63 | return platforms_[plat]->alloc_unified(dev, size); 64 | } 65 | 66 | void* Runtime::get_device_ptr(PlatformId plat, DeviceId dev, void* ptr) { 67 | check_device(plat, dev); 68 | return platforms_[plat]->get_device_ptr(dev, ptr); 69 | } 70 | 71 | void Runtime::release(PlatformId plat, DeviceId dev, void* ptr) { 72 | check_device(plat, dev); 73 | platforms_[plat]->release(dev, ptr); 74 | } 75 | 76 | void Runtime::release_host(PlatformId plat, DeviceId dev, void* ptr) { 77 | check_device(plat, dev); 78 | platforms_[plat]->release_host(dev, ptr); 79 | } 80 | 81 | void Runtime::copy( 82 | PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src, 83 | PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t 
size) {
84 | check_device(plat_src, dev_src);
85 | check_device(plat_dst, dev_dst);
86 | if (plat_src == plat_dst) {
87 | // Copy from same platform
88 | platforms_[plat_src]->copy(dev_src, src, offset_src, dev_dst, dst, offset_dst, size);
89 | debug("Copy between devices % and % on platform %", dev_src, dev_dst, plat_src);
90 | } else {
91 | // Copy from another platform
92 | if (plat_src == 0) {
93 | // Source is the CPU platform
94 | platforms_[plat_dst]->copy_from_host(src, offset_src, dev_dst, dst, offset_dst, size);
95 | debug("Copy from host to device % on platform %", dev_dst, plat_dst);
96 | } else if (plat_dst == 0) {
97 | // Destination is the CPU platform
98 | platforms_[plat_src]->copy_to_host(dev_src, src, offset_src, dst, offset_dst, size);
99 | debug("Copy to host from device % on platform %", dev_src, plat_src);
100 | } else {
101 | error("Cannot copy memory between different platforms");
102 | }
103 | }
104 | }
105 |
106 | void Runtime::launch_kernel(PlatformId plat, DeviceId dev, const LaunchParams& launch_params) {
107 | check_device(plat, dev);
108 | assert(launch_params.grid[0] > 0 && launch_params.grid[0] % launch_params.block[0] == 0 &&
109 | launch_params.grid[1] > 0 && launch_params.grid[1] % launch_params.block[1] == 0 &&
110 | launch_params.grid[2] > 0 && launch_params.grid[2] % launch_params.block[2] == 0 &&
111 | "The grid size is not a multiple of the block size");
112 | platforms_[plat]->launch_kernel(dev, launch_params);
113 | }
114 |
115 | void Runtime::synchronize(PlatformId plat, DeviceId dev) {
116 | check_device(plat, dev);
117 | platforms_[plat]->synchronize(dev);
118 | }
119 |
120 | #ifdef _WIN32
121 | #include <direct.h>
122 | #define PATH_DIR_SEPARATOR '\\'
123 | #define create_directory(d) _mkdir(d)
124 | #else
125 | #include <sys/stat.h>
126 | #include <sys/types.h>
127 | #include <unistd.h>
128 | #define PATH_DIR_SEPARATOR '/'
129 | #define create_directory(d) { umask(0); mkdir(d, 0755); }
130 | #endif
131 |
132 | #if _XOPEN_SOURCE >= 500 || _POSIX_C_SOURCE >= 200112L || /* Glibc versions <= 2.19: */ _BSD_SOURCE
133 | static std::string get_self_directory() {
134 | char path[PATH_MAX];
135 | ssize_t len = readlink("/proc/self/exe", path, sizeof(path)-1);
136 | if (len != -1) {
137 | path[len] = '\0';
138 |
139 | for (int i = len-1; i >= 0; --i) {
140 | if (path[i] == PATH_DIR_SEPARATOR)
141 | return std::string(&path[0], i);
142 | }
143 | }
144 | return std::string();
145 | }
146 | #elif defined(__APPLE__)
147 | #include <mach-o/dyld.h>
148 | static std::string get_self_directory() {
149 | char path[PATH_MAX];
150 | uint32_t size = (uint32_t)sizeof(path);
151 | if (_NSGetExecutablePath(path, &size) == 0) {
152 | char resolved[PATH_MAX];
153 | if (realpath(path, resolved)) {
154 | std::string resolved_path = std::string(resolved);
155 | for (int i = resolved_path.size()-1; i >= 0; --i) {
156 | if (resolved_path[i] == PATH_DIR_SEPARATOR)
157 | return std::string(resolved_path, 0, i);
158 | }
159 | }
160 | }
161 | return std::string();
162 | }
163 | #elif defined(_WIN32)
164 | #include <windows.h>
165 | static std::string get_self_directory() {
166 | CHAR path[MAX_PATH];
167 | DWORD nSize = (DWORD)sizeof(path);
168 | DWORD length = GetModuleFileNameA(NULL, path, nSize);
169 | if ((length == 0) || (length == MAX_PATH))
170 | return std::string();
171 |
172 | std::string resolved_path(path);
173 | for (int i = (int)resolved_path.size() - 1; i >= 0; --i) {
174 | if (resolved_path[i] == PATH_DIR_SEPARATOR)
175 | return std::string(resolved_path, 0, i);
176 | }
177 |
178 | return std::string();
179 | }
180 | #else
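// Fallback: no known way to query the executable's path on this system.
// The empty result makes get_cache_directory() resolve relative to the
// current working directory.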
static std::string get_self_directory() { 182 | return std::string(); 183 | } 184 | #endif 185 | 186 | void Runtime::set_cache_directory(const std::string& dir) { 187 | cache_dir_ = dir; 188 | } 189 | 190 | std::string Runtime::get_cache_directory() const { 191 | if (cache_dir_.empty()) { 192 | std::string cache_path = get_self_directory(); 193 | if (!cache_path.empty()) 194 | cache_path += PATH_DIR_SEPARATOR; 195 | return cache_path + "cache"; 196 | } else { 197 | return cache_dir_; 198 | } 199 | } 200 | 201 | std::string Runtime::get_cached_filename(const std::string& str, const std::string& ext) const { 202 | size_t key = std::hash{}(str); 203 | std::stringstream hex_stream; 204 | hex_stream << std::hex << key; 205 | return get_cache_directory() + PATH_DIR_SEPARATOR + hex_stream.str() + ext; 206 | } 207 | 208 | inline std::string read_stream(std::istream& stream) { 209 | return std::string(std::istreambuf_iterator(stream), std::istreambuf_iterator()); 210 | } 211 | 212 | std::string Runtime::load_file(const std::string& filename) const { 213 | auto file_it = files_.find(filename); 214 | if (file_it != files_.end()) 215 | return file_it->second; 216 | 217 | std::ifstream src_file(filename); 218 | if (!src_file) 219 | error("Can't open source file '%'", filename); 220 | return read_stream(src_file); 221 | } 222 | 223 | void Runtime::store_file(const std::string& filename, const std::string& str) const { 224 | store_file(filename, reinterpret_cast(str.data()), str.length()); 225 | } 226 | 227 | void Runtime::store_file(const std::string& filename, const std::byte* data, size_t size) const { 228 | std::ofstream dst_file(filename, std::ofstream::binary); 229 | if (!dst_file) 230 | error("Can't open destination file '%'", filename); 231 | dst_file.write(reinterpret_cast(data), size); 232 | } 233 | 234 | std::string Runtime::load_from_cache(const std::string& key, const std::string& ext) const { 235 | std::string filename = get_cached_filename(key, ext); 236 | std::ifstream src_file(filename, std::ifstream::binary); 237 | if (!src_file.is_open()) 238 | return std::string(); 239 | // prevent collision by storing the key in the cached file 240 | size_t size = 0; 241 | if (!src_file.read(reinterpret_cast(&size), sizeof(size_t))) 242 | return std::string(); 243 | auto buf = std::make_unique(size); 244 | if (!src_file.read(buf.get(), size)) 245 | return std::string(); 246 | if (std::memcmp(key.data(), buf.get(), size)) 247 | return std::string(); 248 | debug("Loading from cache: %", filename); 249 | return read_stream(src_file); 250 | } 251 | 252 | void Runtime::store_to_cache(const std::string& key, const std::string& str, const std::string ext) const { 253 | std::string filename = get_cached_filename(key, ext); 254 | create_directory(get_cache_directory().c_str()); 255 | debug("Storing to cache: %", filename); 256 | std::ofstream dst_file(filename, std::ofstream::binary); 257 | size_t size = key.size(); 258 | dst_file.write(reinterpret_cast(&size), sizeof(size_t)); 259 | dst_file.write(key.data(), size); 260 | dst_file.write(str.data(), str.size()); 261 | } 262 | 263 | #if _POSIX_VERSION >= 200112L || _XOPEN_SOURCE >= 600 264 | void* Runtime::aligned_malloc(size_t size, size_t alignment) { 265 | void* p = nullptr; 266 | posix_memalign(&p, alignment, size); 267 | return p; 268 | } 269 | void Runtime::aligned_free(void* ptr) { 270 | free(ptr); 271 | } 272 | #elif _ISOC11_SOURCE 273 | void* Runtime::aligned_malloc(size_t size, size_t alignment) { 274 | return ::aligned_alloc(alignment, size); 275 | 
} 276 | void Runtime::aligned_free(void* ptr) { 277 | ::free(ptr); 278 | } 279 | #elif defined(_WIN32) || defined(__CYGWIN__) 280 | #include 281 | void* Runtime::aligned_malloc(size_t size, size_t alignment) { 282 | return ::_aligned_malloc(size, alignment); 283 | } 284 | void Runtime::aligned_free(void* ptr) { 285 | ::_aligned_free(ptr); 286 | } 287 | #else 288 | #error "There is no way to allocate aligned memory on this system" 289 | #endif 290 | 291 | void Runtime::check_device(PlatformId plat, DeviceId dev) const { 292 | assert((size_t)dev < platforms_[plat]->dev_count() && "Invalid device"); 293 | unused(plat, dev); 294 | } 295 | -------------------------------------------------------------------------------- /src/runtime.h: -------------------------------------------------------------------------------- 1 | #ifndef RUNTIME_H 2 | #define RUNTIME_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "log.h" 15 | 16 | enum DeviceId : uint32_t {}; 17 | enum PlatformId : uint32_t {}; 18 | enum class ProfileLevel : uint8_t { None = 0, Full, Fpga_dynamic }; 19 | 20 | class Platform; 21 | 22 | enum class KernelArgType : uint8_t { Val = 0, Ptr, Struct }; 23 | 24 | struct ParamsArgs { 25 | void** data; 26 | const uint32_t* sizes; 27 | const uint32_t* aligns; 28 | const uint32_t* alloc_sizes; 29 | const KernelArgType* types; 30 | }; 31 | 32 | /// The parameters to a `anydsl_launch_kernel()` call. 33 | struct LaunchParams { 34 | const char* file_name; 35 | const char* kernel_name; 36 | const uint32_t* grid; 37 | const uint32_t* block; 38 | ParamsArgs args; 39 | uint32_t num_args; 40 | }; 41 | 42 | class Runtime { 43 | public: 44 | Runtime(std::pair); 45 | 46 | /// Registers the given platform into the runtime. 47 | template 48 | void register_platform(Args&&... args) { 49 | platforms_.emplace_back(new T(this, std::forward(args)...)); 50 | } 51 | 52 | /// Displays available platforms. 53 | void display_info() const; 54 | 55 | /// Returns name of device. 56 | const char* device_name(PlatformId, DeviceId) const; 57 | /// Checks whether feature is supported on device. 58 | bool device_check_feature_support(PlatformId, DeviceId, const char*) const; 59 | 60 | /// Allocates memory on the given device. 61 | void* alloc(PlatformId plat, DeviceId dev, int64_t size); 62 | /// Allocates page-locked memory on the given platform and device. 63 | void* alloc_host(PlatformId plat, DeviceId dev, int64_t size); 64 | /// Allocates unified memory on the given platform and device. 65 | void* alloc_unified(PlatformId plat, DeviceId dev, int64_t size); 66 | /// Returns the device memory associated with the page-locked memory. 67 | void* get_device_ptr(PlatformId plat, DeviceId dev, void* ptr); 68 | /// Releases memory. 69 | void release(PlatformId plat, DeviceId dev, void* ptr); 70 | /// Releases previously allocated page-locked memory. 71 | void release_host(PlatformId plat, DeviceId dev, void* ptr); 72 | /// Copies memory between devices. 73 | void copy( 74 | PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src, 75 | PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size); 76 | 77 | /// Launches a kernel on the platform and device. 78 | void launch_kernel(PlatformId plat, DeviceId dev, const LaunchParams& launch_params); 79 | /// Waits for the completion of all kernels on the given platform and device. 
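/// Blocks the calling host thread until all outstanding work on the device has finished.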
80 | void synchronize(PlatformId plat, DeviceId dev);
81 |
82 | /// Associates a program string with a given filename.
83 | void register_file(const std::string& filename, const std::string& program_string) {
84 | files_[filename] = program_string;
85 | }
86 |
87 | std::string load_file(const std::string& filename) const;
88 | void store_file(const std::string& filename, const std::string& str) const;
89 | void store_file(const std::string& filename, const std::byte* data, size_t size) const;
90 |
91 | /// Sets an optional directory for generated cache data. If not specified or empty, an internal directory is used. The user has to make sure the directory exists.
92 | void set_cache_directory(const std::string& dir);
93 | std::string get_cache_directory() const;
94 |
95 | std::string load_from_cache(const std::string& str, const std::string& ext=".bin") const;
96 | void store_to_cache(const std::string& key, const std::string& str, const std::string ext=".bin") const;
97 |
98 | bool profiling_enabled() { return profile_.first == ProfileLevel::Full; }
99 | bool dynamic_profiling_enabled() { return profile_.second == ProfileLevel::Fpga_dynamic; }
100 | std::atomic<uint64_t>& kernel_time() { return kernel_time_; }
101 |
102 | static void* aligned_malloc(size_t, size_t);
103 | static void aligned_free(void*);
104 |
105 | private:
106 | void check_device(PlatformId, DeviceId) const;
107 | std::string get_cached_filename(const std::string& str, const std::string& ext) const;
108 |
109 | std::pair<ProfileLevel, ProfileLevel> profile_;
110 | std::atomic<uint64_t> kernel_time_;
111 | std::vector<std::unique_ptr<Platform>> platforms_;
112 | std::unordered_map<std::string, std::string> files_;
113 | std::string cache_dir_;
114 | };
115 |
116 | #endif
117 |
--------------------------------------------------------------------------------