├── .gitignore
├── CMakeLists.txt
├── CONTRIBUTORS.txt
├── LICENSE.TXT
├── README.md
├── cmake
│   ├── anydsl_runtime-config.cmake.in
│   ├── build_xilinx_hls.cmake.in
│   ├── check_nvvmir.cmake
│   └── modules
│       ├── FindLevelZero.cmake
│       └── FindXHLS.cmake
├── platforms
│   ├── artic
│   │   ├── intrinsics.impala
│   │   ├── intrinsics_amdgpu.impala
│   │   ├── intrinsics_cpu.impala
│   │   ├── intrinsics_cuda.impala
│   │   ├── intrinsics_hls.impala
│   │   ├── intrinsics_levelzero.impala
│   │   ├── intrinsics_math.impala
│   │   ├── intrinsics_nvvm.impala
│   │   ├── intrinsics_opencl.impala
│   │   ├── intrinsics_rv.impala
│   │   ├── intrinsics_spirv.impala
│   │   ├── intrinsics_thorin.impala
│   │   ├── intrinsics_wmma.impala
│   │   └── runtime.impala
│   └── impala
│       ├── intrinsics.impala
│       ├── intrinsics_amdgpu.impala
│       ├── intrinsics_cpu.impala
│       ├── intrinsics_cuda.impala
│       ├── intrinsics_hls.impala
│       ├── intrinsics_nvvm.impala
│       ├── intrinsics_opencl.impala
│       ├── intrinsics_rv.impala
│       ├── intrinsics_thorin.impala
│       └── runtime.impala
├── post-patcher.py
└── src
    ├── CMakeLists.txt
    ├── anydsl_jit.h
    ├── anydsl_runtime.cpp
    ├── anydsl_runtime.h
    ├── anydsl_runtime.hpp
    ├── anydsl_runtime_config.h.in
    ├── cpu_platform.cpp
    ├── cpu_platform.h
    ├── cuda_platform.cpp
    ├── cuda_platform.h
    ├── dummy_platform.h
    ├── extract_runtime_srcs.py
    ├── hsa_platform.cpp
    ├── hsa_platform.h
    ├── jit.cpp
    ├── levelzero_platform.cpp
    ├── levelzero_platform.h
    ├── log.h
    ├── opencl_platform.cpp
    ├── opencl_platform.h
    ├── pal
    │   ├── pal_device.cpp
    │   ├── pal_device.h
    │   ├── pal_fix_calling_convention_pass.cpp
    │   ├── pal_fix_calling_convention_pass.h
    │   ├── pal_insert_halt_pass.cpp
    │   ├── pal_insert_halt_pass.h
    │   ├── pal_lower_builtins_pass.cpp
    │   ├── pal_lower_builtins_pass.h
    │   ├── pal_lower_kernel_arguments_pass.cpp
    │   ├── pal_lower_kernel_arguments_pass.h
    │   ├── pal_utils.cpp
    │   └── pal_utils.h
    ├── pal_platform.cpp
    ├── pal_platform.h
    ├── platform.h
    ├── runtime.cpp
    └── runtime.h
/.gitignore:
--------------------------------------------------------------------------------
1 | build*
2 |
3 | .vscode
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR)
2 | 
3 | project(AnyDSL-runtime)
4 | 
5 | set(PACKAGE_VERSION "0.3.9")
6 | set(CMAKE_CXX_STANDARD 17)
7 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
8 | 
9 | option(BUILD_SHARED_LIBS "Build shared libraries" ON)
10 | option(RUNTIME_JIT "enable jit support in the runtime" OFF)
11 | option(DEBUG_OUTPUT "enable debug output" OFF)
12 | 
13 | # Default to Debug only when the user made no explicit choice. Multi-config
14 | # generators (VS, Xcode, Ninja Multi-Config) ignore CMAKE_BUILD_TYPE entirely,
15 | # so do not force a value there. The previous unquoted STREQUAL "" test also
16 | # misfired when CMAKE_BUILD_TYPE was undefined rather than empty.
17 | get_property(_anydsl_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
18 | if(NOT _anydsl_is_multi_config AND NOT CMAKE_BUILD_TYPE)
19 |     set(CMAKE_BUILD_TYPE Debug CACHE STRING "Debug or Release" FORCE)
20 | endif()
21 | 
22 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")
23 | 
24 | # Collect all build artifacts in conventional lib/ and bin/ directories.
25 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
26 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
27 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
28 | 
29 | # Locate the artic/impala package directories; also honor the corresponding
30 | # environment variables and a sibling build tree.
31 | find_path(Artic_DIR NAMES artic-config.cmake PATHS ${Artic_DIR} $ENV{Artic_DIR} ${CMAKE_BINARY_DIR}/share/anydsl/cmake)
32 | find_path(Impala_DIR NAMES impala-config.cmake PATHS ${Impala_DIR} $ENV{Impala_DIR} ${CMAKE_BINARY_DIR}/share/anydsl/cmake)
33 | 
34 | set(AnyDSL_runtime_ENABLE_DEBUG_OUTPUT ${DEBUG_OUTPUT})
35 | set(AnyDSL_runtime_TARGET_NAME runtime CACHE STRING "Name of the cmake target for the AnyDSL runtime")
36 | mark_as_advanced(AnyDSL_runtime_TARGET_NAME)
37 | 
38 | add_subdirectory(src)
39 | 
40 | message(STATUS "Using Debug flags: ${CMAKE_CXX_FLAGS_DEBUG}")
41 | message(STATUS "Using Release flags: ${CMAKE_CXX_FLAGS_RELEASE}")
42 | if(CMAKE_BUILD_TYPE)
43 |     message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
44 | endif()
45 | 
46 | # Export build-tree package files so dependent projects can find_package() the runtime.
47 | export(TARGETS ${RUNTIME_LIBRARIES} FILE ${CMAKE_BINARY_DIR}/share/anydsl/cmake/anydsl_runtime-exports.cmake)
48 | configure_file(cmake/anydsl_runtime-config.cmake.in ${CMAKE_BINARY_DIR}/share/anydsl/cmake/anydsl_runtime-config.cmake @ONLY)
49 | configure_file(cmake/build_xilinx_hls.cmake.in ${CMAKE_BINARY_DIR}/share/anydsl/cmake/build_xilinx_hls.cmake @ONLY)
50 | 
--------------------------------------------------------------------------------
/CONTRIBUTORS.txt:
--------------------------------------------------------------------------------
1 | # This is the official list of contributing authors in the AnyDSL runtime project for copyright purposes.
2 |
3 | # Name (GitHub Handle), Affiliation(s)
4 | Puya Amiri (pooyaww), DFKI
5 | Hugo Devillers (Hugobros3), Saarland University
6 | Pascal Grittmann (pgrit), Saarland University
7 | Ralf Jung (RalfJung), Saarland University
8 | Michael Kenzel (michael-kenzel), DFKI
9 | Marcel Köster (m4rs-mt), Saarland University
10 | Matthis Kruse (DasNaCl), Saarland University
11 | Matthias Kurtenacker (m-kurtenacker), DFKI
12 | Roland Leißa (leissa), Saarland University
13 | Stefan Lemme (stlemme), Saarland University / DFKI
14 | Richard Membarth (richardmembarth), Saarland University / DFKI / Technische Hochschule Ingolstadt
15 | Simon Moll (simoll), Saarland University
16 | Arsène Pérard-Gayot (madmann91), Saarland University
17 | Akif Özkan (akifoezkan), Friedrich-Alexander-University Erlangen-Nuremberg
18 | Alexander Rath (iRath96), DFKI
19 | Till Speicher (tillspeicher), Saarland University
20 | Fabian Wildgrube (FabianWildgrube), Advanced Micro Devices Inc.
21 | Ömercan Yazici (PearCoding), Saarland University
22 |
--------------------------------------------------------------------------------
/LICENSE.TXT:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AnyDSL Runtime Library
2 | The runtime for the AnyDSL framework and its two frontends [artic](https://github.com/AnyDSL/artic) and [impala](https://github.com/AnyDSL/impala).
3 |
4 | The runtime provides the following components:
5 | - CMake logic to build programs using artic or impala
6 | - include files for basic runtime abstractions and math functions
7 | - runtime library implementation to schedule and execute AnyDSL programs on different platforms
8 | + Host (CPU): standard platform for code
9 | + TBB / C++11 threads: code emitted by `parallel`
10 | + LLVM w/ RV support: code emitted by `vectorize`
11 | + CUDA: code emitted by `cuda` or `nvvm`
12 | + OpenCL: code emitted by `opencl`
13 | + HSA: code emitted by `amdgpu`
14 |
15 | CMake automatically searches for available components on the current system.
16 | To prevent CMake from building a particular runtime component, disable it using CMake's `CMAKE_DISABLE_FIND_PACKAGE_` variable.
17 | For example, pass `-DCMAKE_DISABLE_FIND_PACKAGE_OpenCL=TRUE` to cmake to disable the OpenCL runtime component.
18 |
19 | Although not required, feel free to specify `Artic_DIR` or `Impala_DIR` for your convenience to later automatically find the correct paths when building AnyDSL programs using the `anydsl_runtime_wrap()` function.
20 |
21 | To enable JIT support, please pass `-DRUNTIME_JIT=ON` to cmake.
22 | This will require at least one of artic or impala as a dependency and thereby also locate LLVM as well as [thorin](https://github.com/AnyDSL/thorin).
23 |
--------------------------------------------------------------------------------
/cmake/check_nvvmir.cmake:
--------------------------------------------------------------------------------
1 | # Script-mode check: verify that emitted NVVM IR is well-formed by assembling
2 | # it with llvm-as. Expects the caller to define _basename (output file stem)
3 | # and LLVM_AS_BIN (path to the llvm-as executable).
4 | if(EXISTS "${_basename}.nvvm")
5 |     # Quote both arguments: paths may contain spaces. Fail loudly instead of
6 |     # silently ignoring an assembler error (the result was previously dropped).
7 |     execute_process(COMMAND "${LLVM_AS_BIN}" "${_basename}.nvvm" RESULT_VARIABLE _llvm_as_result)
8 |     if(_llvm_as_result)
9 |         message(FATAL_ERROR "llvm-as failed on ${_basename}.nvvm (${_llvm_as_result})")
10 |     endif()
11 | endif()
12 | 
--------------------------------------------------------------------------------
/cmake/modules/FindLevelZero.cmake:
--------------------------------------------------------------------------------
1 | # Find module for the oneAPI Level Zero loader library and headers.
2 | # Defines LevelZero_FOUND, LevelZero_INCLUDE_DIRS, LevelZero_LIBRARIES and
3 | # the imported target LevelZero::LevelZero.
4 | # Copyright (C) 2019 Intel Corporation
5 | # SPDX-License-Identifier: MIT
6 | include(FindPackageHandleStandardArgs)
7 | 
8 | find_path(LevelZero_INCLUDE_DIR
9 |   NAMES level_zero/ze_api.h
10 | )
11 | 
12 | find_library(LevelZero_LIBRARY
13 |   NAMES ze_loader ze_loader32 ze_loader64
14 | )
15 | 
16 | find_package_handle_standard_args(LevelZero
17 |   REQUIRED_VARS
18 |     LevelZero_INCLUDE_DIR
19 |     LevelZero_LIBRARY
20 |   HANDLE_COMPONENTS
21 | )
22 | mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIR)
23 | 
24 | if(LevelZero_FOUND)
25 |   # The loader may dlopen drivers at runtime, hence CMAKE_DL_LIBS.
26 |   list(APPEND LevelZero_LIBRARIES ${LevelZero_LIBRARY} ${CMAKE_DL_LIBS})
27 |   list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
28 |   if(OpenCL_FOUND)
29 |     list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
30 |   endif()
31 |   message(STATUS "Found Level Zero in ${LevelZero_INCLUDE_DIR}")
32 | endif()
33 | 
34 | # Provide the conventional namespaced imported target for consumers.
35 | if(LevelZero_FOUND AND NOT TARGET LevelZero::LevelZero)
36 |   add_library(LevelZero::LevelZero INTERFACE IMPORTED)
37 |   set_target_properties(LevelZero::LevelZero PROPERTIES
38 |     INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
39 |     INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
40 |   )
41 | endif()
42 | 
--------------------------------------------------------------------------------
/cmake/modules/FindXHLS.cmake:
--------------------------------------------------------------------------------
1 | # Xilinx Runtime library (XRT) and HLS tools for scripting mode
2 | 
3 | find_path(XILINX_SEARCH_PATH v++ PATHS ENV XILINX_OPENCL ENV XILINX_VITIS PATH_SUFFIXES bin)
4 | get_filename_component(VITIS_ROOT_DIR ${XILINX_SEARCH_PATH} DIRECTORY)
5 | 
6 | # find_program takes the keyword PATHS; the previous "PATH" spelling is not a
7 | # valid keyword, so combined with NO_DEFAULT_PATH these lookups searched nothing.
8 | find_program(Xilinx_VPP v++ PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
9 | find_program(Xilinx_PLATFORM_INFO platforminfo PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
10 | find_program(Xilinx_KERNEL_INFO kernelinfo PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
11 | find_program(Xilinx_EMU_CONFIG emconfigutil PATHS ${VITIS_ROOT_DIR}/bin NO_DEFAULT_PATH)
12 | 
13 | # The Vitis install directory is named after its version (e.g. .../Vitis/2022.1).
14 | get_filename_component(VITIS_VERSION "${VITIS_ROOT_DIR}" NAME)
15 | string(REGEX REPLACE "([0-9]+)\\.[0-9]+" "\\1" VITIS_MAJOR_VERSION "${VITIS_VERSION}")
16 | string(REGEX REPLACE "[0-9]+\\.([0-9]+)" "\\1" VITIS_MINOR_VERSION "${VITIS_VERSION}")
17 | set(Vitis_VERSION ${VITIS_VERSION})
18 | set(Vitis_MAJOR_VERSION ${VITIS_MAJOR_VERSION})
19 | set(Vitis_MINOR_VERSION ${VITIS_MINOR_VERSION})
20 | 
21 | # Newer installs keep vitis_hls in a sibling Vitis_HLS/<version> tree.
22 | find_program(Xilinx_HLS NAMES vitis_hls PATHS ${VITIS_ROOT_DIR}/bin ${VITIS_ROOT_DIR}/../../Vitis_HLS/${Vitis_VERSION}/bin NO_DEFAULT_PATH)
23 | 
24 | find_path(Xilinx_HLS_INCLUDE_DIR hls_stream.h PATHS ${VITIS_ROOT_DIR}/include NO_DEFAULT_PATH)
25 | 
26 | find_path(XRT_SEARCH_PATH libxilinxopencl.so PATHS /opt/xilinx/xrt ENV XILINX_XRT PATH_SUFFIXES lib)
27 | get_filename_component(XILINX_RUNTIME_DIR ${XRT_SEARCH_PATH} DIRECTORY)
28 | file(GLOB Xilinx_LIBRARIES ${XILINX_RUNTIME_DIR}/lib/libxilinxopencl.so)
29 | 
30 | find_path(Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR cl_ext.h PATHS ${XILINX_RUNTIME_DIR}/include PATH_SUFFIXES CL NO_DEFAULT_PATH)
31 | get_filename_component(Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR ${Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR} DIRECTORY)
32 | set(Xilinx_INCLUDE_DIRS ${Xilinx_HLS_INCLUDE_DIR} ${Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR})
33 | 
34 | mark_as_advanced(
35 |   XILINX_RUNTIME_DIR
36 |   XRT_SEARCH_PATH
37 |   XILINX_SEARCH_PATH
38 |   Xilinx_HLS
39 |   Xilinx_VPP
40 |   Xilinx_HLS_INCLUDE_DIR
41 |   Xilinx_OPENCL_EXTENSIONS_INCLUDE_DIR
42 |   Xilinx_PLATFORM_INFO
43 |   Xilinx_KERNEL_INFO
44 |   Xilinx_EMU_CONFIG
45 |   Xilinx_LIBRARIES
46 |   Xilinx_INCLUDE_DIRS)
47 | 
48 | include(FindPackageHandleStandardArgs)
49 | find_package_handle_standard_args(XHLS DEFAULT_MSG
50 |   Xilinx_HLS
51 |   Xilinx_VPP
52 |   Xilinx_LIBRARIES
53 |   Xilinx_INCLUDE_DIRS
54 |   Xilinx_PLATFORM_INFO
55 |   Xilinx_KERNEL_INFO
56 |   Xilinx_EMU_CONFIG
57 | )
58 | 
--------------------------------------------------------------------------------
/platforms/artic/intrinsics.impala:
--------------------------------------------------------------------------------
1 | struct WorkItem { // query functions for the executing work-item (CUDA/OpenCL-style indexing — TODO confirm exact mapping)
2 | tidx : fn() -> i32, // tid*: thread index within its block
3 | tidy : fn() -> i32,
4 | tidz : fn() -> i32,
5 | bidx : fn() -> i32, // bid*: block index within the grid
6 | bidy : fn() -> i32,
7 | bidz : fn() -> i32,
8 | gidx : fn() -> i32, // gid*: global work-item index — presumably bid * bdim + tid; verify against platform impls
9 | gidy : fn() -> i32,
10 | gidz : fn() -> i32,
11 | bdimx : fn() -> i32, // bdim*: block (work-group) dimensions
12 | bdimy : fn() -> i32,
13 | bdimz : fn() -> i32,
14 | gdimx : fn() -> i32, // gdim*: global dimensions — presumably bdim * nblk; verify
15 | gdimy : fn() -> i32,
16 | gdimz : fn() -> i32,
17 | nblkx : fn() -> i32, // nblk*: number of blocks per grid dimension
18 | nblky : fn() -> i32,
19 | nblkz : fn() -> i32
20 | }
21 | 
22 | struct Accelerator { // device handle: kernel launch, synchronization, and memory allocation
23 | exec : fn(fn(WorkItem) -> ()) -> fn((i32, i32, i32), (i32, i32, i32)) -> (), // fn(grid, block)->()
24 | sync : fn() -> (), // wait for queued device work to finish — TODO confirm
25 | alloc : fn(i64) -> Buffer, // allocate device memory (size in bytes)
26 | alloc_unified : fn(i64) -> Buffer, // allocate memory accessible from both host and device
27 | barrier : fn() -> () // in-kernel synchronization barrier — presumably block-level; verify
28 | }
29 | 
30 | struct Intrinsics { // per-platform math function table; each backend supplies an instance (f32, f64, i32 variants)
31 | expf : fn(f32) -> f32,
32 | exp2f : fn(f32) -> f32,
33 | logf : fn(f32) -> f32,
34 | log2f : fn(f32) -> f32,
35 | powf : fn(f32, f32) -> f32,
36 | rsqrtf : fn(f32) -> f32, // reciprocal square root
37 | sqrtf : fn(f32) -> f32,
38 | fabsf : fn(f32) -> f32,
39 | sinf : fn(f32) -> f32,
40 | cosf : fn(f32) -> f32,
41 | tanf : fn(f32) -> f32,
42 | asinf : fn(f32) -> f32,
43 | acosf : fn(f32) -> f32,
44 | atanf : fn(f32) -> f32,
45 | erff : fn(f32) -> f32,
46 | atan2f : fn(f32, f32) -> f32,
47 | copysignf : fn(f32, f32) -> f32,
48 | fmaf : fn(f32, f32, f32) -> f32,
49 | fmaxf : fn(f32, f32) -> f32,
50 | fminf : fn(f32, f32) -> f32,
51 | fmodf : fn(f32, f32) -> f32,
52 | floorf : fn(f32) -> f32,
53 | isinff : fn(f32) -> i32, // classification functions return i32 (C-style boolean)
54 | isnanf : fn(f32) -> i32,
55 | isfinitef : fn(f32) -> i32,
56 | exp : fn(f64) -> f64,
57 | exp2 : fn(f64) -> f64,
58 | log : fn(f64) -> f64,
59 | log2 : fn(f64) -> f64,
60 | pow : fn(f64, f64) -> f64,
61 | rsqrt : fn(f64) -> f64,
62 | sqrt : fn(f64) -> f64,
63 | fabs : fn(f64) -> f64,
64 | sin : fn(f64) -> f64,
65 | cos : fn(f64) -> f64,
66 | tan : fn(f64) -> f64,
67 | asin : fn(f64) -> f64,
68 | acos : fn(f64) -> f64,
69 | atan : fn(f64) -> f64,
70 | erf : fn(f64) -> f64,
71 | atan2 : fn(f64, f64) -> f64,
72 | copysign : fn(f64, f64) -> f64,
73 | fma : fn(f64, f64, f64) -> f64,
74 | fmax : fn(f64, f64) -> f64,
75 | fmin : fn(f64, f64) -> f64,
76 | fmod : fn(f64, f64) -> f64,
77 | floor : fn(f64) -> f64,
78 | isinf : fn(f64) -> i32,
79 | isnan : fn(f64) -> i32,
80 | isfinite : fn(f64) -> i32,
81 | min : fn(i32, i32) -> i32, // integer min/max
82 | max : fn(i32, i32) -> i32,
83 | }
84 | 
--------------------------------------------------------------------------------
/platforms/artic/intrinsics_cpu.impala:
--------------------------------------------------------------------------------
1 | //#[import(cc = "C", name = "sinf")] fn cpu_sinf(f32) -> f32;
2 | //#[import(cc = "C", name = "cosf")] fn cpu_cosf(f32) -> f32; // sinf/cosf come from the LLVM intrinsics below instead
3 | #[import(cc = "C", name = "tanf")] fn cpu_tanf(_: f32) -> f32; // cc = "C": declared and resolved against libm at link time
4 | #[import(cc = "C", name = "asinf")] fn cpu_asinf(_: f32) -> f32;
5 | #[import(cc = "C", name = "acosf")] fn cpu_acosf(_: f32) -> f32;
6 | #[import(cc = "C", name = "atanf")] fn cpu_atanf(_: f32) -> f32;
7 | #[import(cc = "C", name = "erff")] fn cpu_erff(_: f32) -> f32;
8 | #[import(cc = "C", name = "fmodf")] fn cpu_fmodf(_: f32, _: f32) -> f32;
9 | #[import(cc = "C", name = "atan2f")] fn cpu_atan2f(_: f32, _: f32) -> f32;
10 | #[import(cc = "C", name = "anydsl_isinff")] fn cpu_isinff(_: f32) -> i32; // anydsl_* classification helpers — presumably provided by the AnyDSL runtime library; verify
11 | #[import(cc = "C", name = "anydsl_isnanf")] fn cpu_isnanf(_: f32) -> i32;
12 | #[import(cc = "C", name = "anydsl_isfinitef")] fn cpu_isfinitef(_: f32) -> i32;
13 | //#[import(cc = "C", name = "sin")] fn cpu_sin(f64) -> f64;
14 | //#[import(cc = "C", name = "cos")] fn cpu_cos(f64) -> f64;
15 | #[import(cc = "C", name = "tan")] fn cpu_tan(_: f64) -> f64;
16 | #[import(cc = "C", name = "asin")] fn cpu_asin(_: f64) -> f64;
17 | #[import(cc = "C", name = "acos")] fn cpu_acos(_: f64) -> f64;
18 | #[import(cc = "C", name = "atan")] fn cpu_atan(_: f64) -> f64;
19 | #[import(cc = "C", name = "erf")] fn cpu_erf(_: f64) -> f64;
20 | #[import(cc = "C", name = "fmod")] fn cpu_fmod(_: f64, _: f64) -> f64;
21 | #[import(cc = "C", name = "atan2")] fn cpu_atan2(_: f64, _: f64) -> f64;
22 | #[import(cc = "C", name = "anydsl_isinf")] fn cpu_isinf(_: f64) -> i32;
23 | #[import(cc = "C", name = "anydsl_isnan")] fn cpu_isnan(_: f64) -> i32;
24 | #[import(cc = "C", name = "anydsl_isfinite")] fn cpu_isfinite(_: f64) -> i32;
25 | 
26 | #[import(cc = "device", name = "llvm.exp.f32")] fn cpu_expf(_: f32) -> f32; // cc = "device": mapped directly to the named LLVM intrinsic
27 | #[import(cc = "device", name = "llvm.exp2.f32")] fn cpu_exp2f(_: f32) -> f32;
28 | #[import(cc = "device", name = "llvm.log.f32")] fn cpu_logf(_: f32) -> f32;
29 | #[import(cc = "device", name = "llvm.log2.f32")] fn cpu_log2f(_: f32) -> f32;
30 | #[import(cc = "device", name = "llvm.pow.f32")] fn cpu_powf(_: f32, _: f32) -> f32;
31 | #[import(cc = "device", name = "llvm.sqrt.f32")] fn cpu_sqrtf(_: f32) -> f32;
32 | #[import(cc = "device", name = "llvm.fabs.f32")] fn cpu_fabsf(_: f32) -> f32;
33 | #[import(cc = "device", name = "llvm.sin.f32")] fn cpu_sinf(_: f32) -> f32;
34 | #[import(cc = "device", name = "llvm.cos.f32")] fn cpu_cosf(_: f32) -> f32;
35 | #[import(cc = "device", name = "llvm.floor.f32")] fn cpu_floorf(_: f32) -> f32;
36 | #[import(cc = "device", name = "llvm.fma.f32")] fn cpu_fmaf(_: f32, _: f32, _: f32) -> f32;
37 | #[import(cc = "device", name = "llvm.fmuladd.f32")] fn cpu_madf(_: f32, _: f32, _: f32) -> f32;
38 | #[import(cc = "device", name = "llvm.copysign.f32")] fn cpu_copysignf(_: f32, _: f32) -> f32;
39 | #[import(cc = "device", name = "llvm.minnum.f32")] fn cpu_fminf(_: f32, _: f32) -> f32;
40 | #[import(cc = "device", name = "llvm.maxnum.f32")] fn cpu_fmaxf(_: f32, _: f32) -> f32;
41 | #[import(cc = "device", name = "llvm.exp.f64")] fn cpu_exp(_: f64) -> f64;
42 | #[import(cc = "device", name = "llvm.exp2.f64")] fn cpu_exp2(_: f64) -> f64;
43 | #[import(cc = "device", name = "llvm.log.f64")] fn cpu_log(_: f64) -> f64;
44 | #[import(cc = "device", name = "llvm.log2.f64")] fn cpu_log2(_: f64) -> f64;
45 | #[import(cc = "device", name = "llvm.pow.f64")] fn cpu_pow(_: f64, _: f64) -> f64;
46 | #[import(cc = "device", name = "llvm.sqrt.f64")] fn cpu_sqrt(_: f64) -> f64;
47 | #[import(cc = "device", name = "llvm.fabs.f64")] fn cpu_fabs(_: f64) -> f64;
48 | #[import(cc = "device", name = "llvm.sin.f64")] fn cpu_sin(_: f64) -> f64;
49 | #[import(cc = "device", name = "llvm.cos.f64")] fn cpu_cos(_: f64) -> f64;
50 | #[import(cc = "device", name = "llvm.floor.f64")] fn cpu_floor(_: f64) -> f64;
51 | #[import(cc = "device", name = "llvm.fma.f64")] fn cpu_fma(_: f64, _: f64, _: f64) -> f64;
52 | #[import(cc = "device", name = "llvm.fmuladd.f64")] fn cpu_mad(_: f64, _: f64, _: f64) -> f64;
53 | #[import(cc = "device", name = "llvm.copysign.f64")] fn cpu_copysign(_: f64, _: f64) -> f64;
54 | #[import(cc = "device", name = "llvm.minnum.f64")] fn cpu_fmin(_: f64, _: f64) -> f64;
55 | #[import(cc = "device", name = "llvm.maxnum.f64")] fn cpu_fmax(_: f64, _: f64) -> f64;
56 | #[import(cc = "device", name = "llvm.ctpop.i32")] fn cpu_popcount32(_: i32) -> i32; // population count
57 | #[import(cc = "device", name = "llvm.ctpop.i64")] fn cpu_popcount64(_: i64) -> i64;
58 | #[import(cc = "device", name = "llvm.ctlz.i32")] fn cpu_clz32(_: i32, _: bool) -> i32; // count leading zeros; bool flag = is_zero_poison per LLVM semantics
59 | #[import(cc = "device", name = "llvm.ctlz.i64")] fn cpu_clz64(_: i64, _: bool) -> i64;
60 | #[import(cc = "device", name = "llvm.cttz.i32")] fn cpu_ctz32(_: i32, _: bool) -> i32; // count trailing zeros
61 | #[import(cc = "device", name = "llvm.cttz.i64")] fn cpu_ctz64(_: i64, _: bool) -> i64;
62 | #[import(cc = "device", name = "llvm.x86.bmi.pext.32")] fn cpu_pext32(_: i32, _: i32) -> i32; // x86 BMI2 parallel bit extract/deposit — requires BMI2-capable hardware
63 | #[import(cc = "device", name = "llvm.x86.bmi.pext.64")] fn cpu_pext64(_: i64, _: i64) -> i64;
64 | #[import(cc = "device", name = "llvm.x86.bmi.pdep.32")] fn cpu_pdep32(_: i32, _: i32) -> i32;
65 | #[import(cc = "device", name = "llvm.x86.bmi.pdep.64")] fn cpu_pdep64(_: i64, _: i64) -> i64;
66 | #[import(cc = "device", name = "llvm.prefetch.p0")] fn cpu_prefetch(&u8, i32, i32, i32) -> (); // (address, rw, locality, cache-type) per llvm.prefetch
67 | 
68 | //
69 | // atomics
70 | // 0 1 2 3 4 5 6 7 8 9 10 11 12
71 | // operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
72 | // 0 1 2 4 5 6 7
73 | // ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent
74 | // syncscope: singlethread "" (system)
75 | //
76 | 
77 | fn @cpu_atomic_xchg(a: &mut i32, b: i32) -> i32 = atomic[i32](0, a, b, 7, ""); // Xchg, SequentiallyConsistent (7), system scope ("")
78 | fn @cpu_atomic_add(a: &mut i32, b: i32) -> i32 = atomic[i32](1, a, b, 7, ""); // Add
79 | fn @cpu_atomic_sub(a: &mut i32, b: i32) -> i32 = atomic[i32](2, a, b, 7, ""); // Sub
80 | fn @cpu_atomic_max(a: &mut i32, b: i32) -> i32 = atomic[i32](7, a, b, 7, ""); // Max (signed)
81 | fn @cpu_atomic_min(a: &mut i32, b: i32) -> i32 = atomic[i32](8, a, b, 7, ""); // Min (signed)
82 | 
83 | static cpu_intrinsics = Intrinsics { // CPU backend instance of the shared Intrinsics table
84 | expf = cpu_expf,
85 | exp2f = cpu_exp2f,
86 | logf = cpu_logf,
87 | log2f = cpu_log2f,
88 | powf = cpu_powf,
89 | rsqrtf = @|a| { 1:f32 / cpu_sqrtf(a) }, // no LLVM rsqrt intrinsic; derived from sqrt
90 | sqrtf = cpu_sqrtf,
91 | fabsf = cpu_fabsf,
92 | sinf = cpu_sinf,
93 | cosf = cpu_cosf,
94 | tanf = cpu_tanf,
95 | asinf = cpu_asinf,
96 | acosf = cpu_acosf,
97 | atanf = cpu_atanf,
98 | erff = cpu_erff,
99 | atan2f = cpu_atan2f,
100 | copysignf = cpu_copysignf,
101 | fmaf = cpu_fmaf,
102 | fmaxf = cpu_fmaxf,
103 | fminf = cpu_fminf,
104 | fmodf = cpu_fmodf,
105 | floorf = cpu_floorf,
106 | isinff = cpu_isinff,
107 | isnanf = cpu_isnanf,
108 | isfinitef = cpu_isfinitef,
109 | exp = cpu_exp,
110 | exp2 = cpu_exp2,
111 | log = cpu_log,
112 | log2 = cpu_log2,
113 | pow = cpu_pow,
114 | rsqrt = @|a| { 1.0 / cpu_sqrt(a) }, // derived from sqrt, as above
115 | sqrt = cpu_sqrt,
116 | fabs = cpu_fabs,
117 | sin = cpu_sin,
118 | cos = cpu_cos,
119 | tan = cpu_tan,
120 | asin = cpu_asin,
121 | acos = cpu_acos,
122 | atan = cpu_atan,
123 | erf = cpu_erf,
124 | atan2 = cpu_atan2,
125 | copysign = cpu_copysign,
126 | fma = cpu_fma,
127 | fmax = cpu_fmax,
128 | fmin = cpu_fmin,
129 | fmod = cpu_fmod,
130 | floor = cpu_floor,
131 | isinf = cpu_isinf,
132 | isnan = cpu_isnan,
133 | isfinite = cpu_isfinite,
134 | min = @|a, b| { if a < b { a } else { b } }, // branch-based integer min/max
135 | max = @|a, b| { if a > b { a } else { b } },
136 | };
137 | 
--------------------------------------------------------------------------------
/platforms/artic/intrinsics_hls.impala:
--------------------------------------------------------------------------------
1 | // no declarations are emitted for "device" functions
2 | #[import(cc = "C", name = "exp")] fn hls_expf(f32) -> f32;
3 | #[import(cc = "C", name = "exp2")] fn hls_exp2f(f32) -> f32;
4 | #[import(cc = "C", name = "log")] fn hls_logf(f32) -> f32;
5 | #[import(cc = "C", name = "log2")] fn hls_log2f(f32) -> f32;
6 | #[import(cc = "C", name = "pow")] fn hls_powf(f32, f32) -> f32;
7 | #[import(cc = "C", name = "rsqrt")] fn hls_rsqrtf(f32) -> f32; // rsqrt is not ISO C — presumably supplied by the HLS math library; confirm
8 | #[import(cc = "C", name = "sqrt")] fn hls_sqrtf(f32) -> f32;
9 | #[import(cc = "C", name = "fabs")] fn hls_fabsf(f32) -> f32;
10 | #[import(cc = "C", name = "sin")] fn hls_sinf(f32) -> f32;
11 | #[import(cc = "C", name = "cos")] fn hls_cosf(f32) -> f32;
12 | #[import(cc = "C", name = "tan")] fn hls_tanf(f32) -> f32;
13 | #[import(cc = "C", name = "asin")] fn hls_asinf(f32) -> f32;
14 | #[import(cc = "C", name = "acos")] fn hls_acosf(f32) -> f32;
15 | #[import(cc = "C", name = "atan")] fn hls_atanf(f32) -> f32;
16 | #[import(cc = "C", name = "erf")] fn hls_erff(f32) -> f32;
17 | #[import(cc = "C", name = "atan2")] fn hls_atan2f(f32, f32) -> f32;
18 | #[import(cc = "C", name = "fmod")] fn hls_fmodf(f32, f32) -> f32;
19 | #[import(cc = "C", name = "floor")] fn hls_floorf(f32) -> f32;
20 | #[import(cc = "C", name = "isinf")] fn hls_isinff(f32) -> i32;
21 | #[import(cc = "C", name = "isnan")] fn hls_isnanf(f32) -> i32;
22 | #[import(cc = "C", name = "isfinite")] fn hls_isfinitef(f32) -> i32;
23 | #[import(cc = "C", name = "fma")] fn hls_fmaf(f32, f32, f32) -> f32;
24 | #[import(cc = "C", name = "mad")] fn hls_madf(f32, f32, f32) -> f32; // mad is not ISO C (OpenCL-style multiply-add) — confirm HLS toolchain support
25 | #[import(cc = "C", name = "copysign")] fn hls_copysignf(f32, f32) -> f32;
26 | #[import(cc = "C", name = "exp")] fn hls_exp(f64) -> f64;
27 | #[import(cc = "C", name = "exp2")] fn hls_exp2(f64) -> f64;
28 | #[import(cc = "C", name = "log")] fn hls_log(f64) -> f64;
29 | #[import(cc = "C", name = "log2")] fn hls_log2(f64) -> f64;
30 | #[import(cc = "C", name = "pow")] fn hls_pow(f64, f64) -> f64;
31 | #[import(cc = "C", name = "rsqrt")] fn hls_rsqrt(f64) -> f64;
32 | #[import(cc = "C", name = "sqrt")] fn hls_sqrt(f64) -> f64;
33 | #[import(cc = "C", name = "fabs")] fn hls_fabs(f64) -> f64;
34 | #[import(cc = "C", name = "sin")] fn hls_sin(f64) -> f64;
35 | #[import(cc = "C", name = "cos")] fn hls_cos(f64) -> f64;
36 | #[import(cc = "C", name = "tan")] fn hls_tan(f64) -> f64;
37 | #[import(cc = "C", name = "asin")] fn hls_asin(f64) -> f64;
38 | #[import(cc = "C", name = "acos")] fn hls_acos(f64) -> f64;
39 | #[import(cc = "C", name = "atan")] fn hls_atan(f64) -> f64;
40 | #[import(cc = "C", name = "erf")] fn hls_erf(f64) -> f64;
41 | #[import(cc = "C", name = "atan2")] fn hls_atan2(f64, f64) -> f64;
42 | #[import(cc = "C", name = "fmod")] fn hls_fmod(f64, f64) -> f64;
43 | #[import(cc = "C", name = "floor")] fn hls_floor(f64) -> f64;
44 | #[import(cc = "C", name = "isinf")] fn hls_isinf(f64) -> i32;
45 | #[import(cc = "C", name = "isnan")] fn hls_isnan(f64) -> i32;
46 | #[import(cc = "C", name = "isfinite")] fn hls_isfinite(f64) -> i32;
47 | #[import(cc = "C", name = "fma")] fn hls_fma(f64, f64, f64) -> f64;
48 | #[import(cc = "C", name = "mad")] fn hls_mad(f64, f64, f64) -> f64;
49 | #[import(cc = "C", name = "copysign")] fn hls_copysign(f64, f64) -> f64;
50 | #[import(cc = "C", name = "fmin")] fn hls_fminf(f32, f32) -> f32;
51 | #[import(cc = "C", name = "fmax")] fn hls_fmaxf(f32, f32) -> f32;
52 | #[import(cc = "C", name = "fmin")] fn hls_fmin(f64, f64) -> f64;
53 | #[import(cc = "C", name = "fmax")] fn hls_fmax(f64, f64) -> f64;
54 | #[import(cc = "C", name = "min")] fn hls_min(i32, i32) -> i32; // integer min/max are not ISO C — presumably toolchain-provided; confirm
55 | #[import(cc = "C", name = "max")] fn hls_max(i32, i32) -> i32;
56 |
57 | #[import(cc = "device")] fn print_pragma(&[u8]) -> (); // NOTE(review): name suggests it emits its argument as a pragma in the generated kernel — confirm
58 |
59 | // channel scalar types
60 | struct channel[T] { data : T } // NOTE(review): presumably a FIFO stream endpoint between kernels — confirm semantics
61 | // channel array types, element counts 1..128
62 | struct channel1[T] { data : [T * 1 ] }
63 | struct channel2[T] { data : [T * 2 ] }
64 | struct channel4[T] { data : [T * 4 ] }
65 | struct channel8[T] { data : [T * 8 ] }
66 | struct channel16[T] { data : [T * 16 ] }
67 | struct channel32[T] { data : [T * 32 ] }
68 | struct channel64[T] { data : [T * 64 ] }
69 | struct channel128[T] { data : [T * 128] }
70 |
71 | // read and write on scalar channels
72 | #[import(cc = "device", name = "read_channel")] fn read_channel[T] (&mut channel[T]) -> T;
73 | #[import(cc = "device", name = "write_channel")] fn write_channel[T] (&mut channel[T], T ) -> ();
74 |
75 | // read and write on array channels
76 | #[import(cc = "device", name = "read_channel")] fn read_channel1[T] ( &mut channel1[T] ) -> [T * 1 ];
77 | #[import(cc = "device", name = "read_channel")] fn read_channel2[T] ( &mut channel2[T] ) -> [T * 2 ];
78 | #[import(cc = "device", name = "read_channel")] fn read_channel4[T] ( &mut channel4[T] ) -> [T * 4 ];
79 | #[import(cc = "device", name = "read_channel")] fn read_channel8[T] ( &mut channel8[T] ) -> [T * 8 ];
80 | #[import(cc = "device", name = "read_channel")] fn read_channel16[T]( &mut channel16[T]) -> [T * 16];
81 | #[import(cc = "device", name = "read_channel")] fn read_channel32[T]( &mut channel32[T]) -> [T * 32]; // NOTE(review): no 64/128 read variants although channel64/channel128 are declared above — confirm intentional
82 |
83 | #[import(cc = "device", name = "write_channel")] fn write_channel1[T] ( &mut channel1[T], [T * 1 ]) -> ();
84 | #[import(cc = "device", name = "write_channel")] fn write_channel2[T] ( &mut channel2[T], [T * 2 ]) -> ();
85 | #[import(cc = "device", name = "write_channel")] fn write_channel4[T] ( &mut channel4[T], [T * 4 ]) -> ();
86 | #[import(cc = "device", name = "write_channel")] fn write_channel8[T] ( &mut channel8[T], [T * 8 ]) -> ();
87 | #[import(cc = "device", name = "write_channel")] fn write_channel16[T]( &mut channel16[T], [T * 16]) -> ();
88 | #[import(cc = "device", name = "write_channel")] fn write_channel32[T]( &mut channel32[T], [T * 32]) -> (); // NOTE(review): no 64/128 write variants either
89 | #[import(cc = "device", name = " ")] fn bitcast_channel[T]( &mut channel1[T]) -> [T * 2]; // NOTE(review): blank import name (" ") looks suspicious — confirm this drives the intended backend mangling
90 |
91 | fn @hls_accelerator(dev: i32) = Accelerator { // HLS accelerator: a single implicit work item — all ids 0, all dims 1
92 |     exec = @|body| |_grid, _block| { // grid/block are ignored (parameters deliberately unused)
93 |         let work_item = WorkItem {
94 |             tidx = @|| 0, tidy = @|| 0, tidz = @|| 0,
95 |             bidx = @|| 0, bidy = @|| 0, bidz = @|| 0,
96 |             gidx = @|| 0, gidy = @|| 0, gidz = @|| 0,
97 |             bdimx = @|| 1, bdimy = @|| 1, bdimz = @|| 1,
98 |             gdimx = @|| 1, gdimy = @|| 1, gdimz = @|| 1,
99 |             nblkx = @|| 1, nblky = @|| 1, nblkz = @|| 1
100 |         };
101 |         hls(dev, || @body(work_item)); // launch through the thorin hls intrinsic (takes no grid/block)
102 |     },
103 |     sync = @|| synchronize_hls(dev),
104 |     alloc = @|size| alloc_hls(dev, size),
105 |     alloc_unified = @|size| alloc_hls_unified(dev, size),
106 |     barrier = @|| () // no-op: only one work item, nothing to synchronize
107 | };
108 |
109 | static hls_intrinsics = Intrinsics { // math table for the HLS backend; every entry maps to an import above
110 |     expf = hls_expf,
111 |     exp2f = hls_exp2f,
112 |     logf = hls_logf,
113 |     log2f = hls_log2f,
114 |     powf = hls_powf,
115 |     rsqrtf = hls_rsqrtf,
116 |     sqrtf = hls_sqrtf,
117 |     fabsf = hls_fabsf,
118 |     sinf = hls_sinf,
119 |     cosf = hls_cosf,
120 |     tanf = hls_tanf,
121 |     asinf = hls_asinf,
122 |     acosf = hls_acosf,
123 |     atanf = hls_atanf,
124 |     erff = hls_erff,
125 |     atan2f = hls_atan2f,
126 |     copysignf = hls_copysignf,
127 |     fmaf = hls_fmaf,
128 |     fmaxf = hls_fmaxf,
129 |     fminf = hls_fminf,
130 |     fmodf = hls_fmodf,
131 |     floorf = hls_floorf,
132 |     isinff = hls_isinff,
133 |     isnanf = hls_isnanf,
134 |     isfinitef = hls_isfinitef,
135 |     exp = hls_exp,
136 |     exp2 = hls_exp2,
137 |     log = hls_log,
138 |     log2 = hls_log2,
139 |     pow = hls_pow,
140 |     rsqrt = hls_rsqrt,
141 |     sqrt = hls_sqrt,
142 |     fabs = hls_fabs,
143 |     sin = hls_sin,
144 |     cos = hls_cos,
145 |     tan = hls_tan,
146 |     asin = hls_asin,
147 |     acos = hls_acos,
148 |     atan = hls_atan,
149 |     erf = hls_erf,
150 |     atan2 = hls_atan2,
151 |     copysign = hls_copysign,
152 |     fma = hls_fma,
153 |     fmax = hls_fmax,
154 |     fmin = hls_fmin,
155 |     fmod = hls_fmod,
156 |     floor = hls_floor,
157 |     isinf = hls_isinf,
158 |     isnan = hls_isnan,
159 |     isfinite = hls_isfinite,
160 |     min = hls_min,
161 |     max = hls_max,
162 | };
163 |
--------------------------------------------------------------------------------
/platforms/artic/intrinsics_levelzero.impala:
--------------------------------------------------------------------------------
1 | // most device intrinsics are the same as OpenCL and don't need to be duplicated
2 | fn spv_levelzero_get_num_groups() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](24 /* BuiltInNumWorkgroups */); // addrspace(8): SPIR-V builtin storage — NOTE(review): confirm mapping
3 | fn spv_levelzero_get_local_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](25 /* BuiltInWorkgroupSize */);
4 | fn spv_levelzero_get_group_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](26 /* BuiltInWorkgroupId */);
5 | fn spv_levelzero_get_local_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](27 /* BuiltInLocalInvocationId */);
6 | fn spv_levelzero_get_global_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](28 /* BuiltInGlobalInvocationId */);
7 | fn spv_levelzero_get_global_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](31 /* BuiltInGlobalSize */);
8 |
9 | fn @levelzero_accelerator(dev: i32) = Accelerator { // Level Zero accelerator: ids/dims via the SPIR-V builtin helpers above
10 |     exec = @|body| |grid, block| {
11 |         let work_item = WorkItem {
12 |             tidx = @|| spv_levelzero_get_local_id()(0) as i32,
13 |             tidy = @|| spv_levelzero_get_local_id()(1) as i32,
14 |             tidz = @|| spv_levelzero_get_local_id()(2) as i32,
15 |             bidx = @|| spv_levelzero_get_group_id()(0) as i32,
16 |             bidy = @|| spv_levelzero_get_group_id()(1) as i32,
17 |             bidz = @|| spv_levelzero_get_group_id()(2) as i32,
18 |             gidx = @|| spv_levelzero_get_global_id()(0) as i32,
19 |             gidy = @|| spv_levelzero_get_global_id()(1) as i32,
20 |             gidz = @|| spv_levelzero_get_global_id()(2) as i32,
21 |             bdimx = @|| spv_levelzero_get_local_size()(0) as i32,
22 |             bdimy = @|| spv_levelzero_get_local_size()(1) as i32,
23 |             bdimz = @|| spv_levelzero_get_local_size()(2) as i32,
24 |             gdimx = @|| spv_levelzero_get_global_size()(0) as i32,
25 |             gdimy = @|| spv_levelzero_get_global_size()(1) as i32,
26 |             gdimz = @|| spv_levelzero_get_global_size()(2) as i32,
27 |             nblkx = @|| spv_levelzero_get_num_groups()(0) as i32,
28 |             nblky = @|| spv_levelzero_get_num_groups()(1) as i32,
29 |             nblkz = @|| spv_levelzero_get_num_groups()(2) as i32
30 |         };
31 |         levelzero(dev, grid, block, || @body(work_item)) // launch through the thorin levelzero intrinsic
32 |     },
33 |     sync = @|| synchronize_levelzero(dev),
34 |     alloc = @|size| alloc_levelzero(dev, size),
35 |     alloc_unified = @|size| alloc_levelzero_unified(dev, size),
36 |     barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE), // reuses the OpenCL barrier (see file header comment)
37 | };
38 |
39 | static levelzero_intrinsics = Intrinsics { // math table reuses the OpenCL device imports wholesale
40 |     expf = opencl_expf,
41 |     exp2f = opencl_exp2f,
42 |     logf = opencl_logf,
43 |     log2f = opencl_log2f,
44 |     powf = opencl_powf,
45 |     rsqrtf = opencl_rsqrtf,
46 |     sqrtf = opencl_sqrtf,
47 |     fabsf = opencl_fabsf,
48 |     sinf = opencl_sinf,
49 |     cosf = opencl_cosf,
50 |     tanf = opencl_tanf,
51 |     asinf = opencl_asinf,
52 |     acosf = opencl_acosf,
53 |     atanf = opencl_atanf,
54 |     erff = opencl_erff,
55 |     atan2f = opencl_atan2f,
56 |     copysignf = opencl_copysignf,
57 |     fmaf = opencl_fmaf,
58 |     fmaxf = opencl_fmaxf,
59 |     fminf = opencl_fminf,
60 |     fmodf = opencl_fmodf,
61 |     floorf = opencl_floorf,
62 |     isinff = opencl_isinff,
63 |     isnanf = opencl_isnanf,
64 |     isfinitef = opencl_isfinitef,
65 |     exp = opencl_exp,
66 |     exp2 = opencl_exp2,
67 |     log = opencl_log,
68 |     log2 = opencl_log2,
69 |     pow = opencl_pow,
70 |     rsqrt = opencl_rsqrt,
71 |     sqrt = opencl_sqrt,
72 |     fabs = opencl_fabs,
73 |     sin = opencl_sin,
74 |     cos = opencl_cos,
75 |     tan = opencl_tan,
76 |     asin = opencl_asin,
77 |     acos = opencl_acos,
78 |     atan = opencl_atan,
79 |     erf = opencl_erf,
80 |     atan2 = opencl_atan2,
81 |     copysign = opencl_copysign,
82 |     fma = opencl_fma,
83 |     fmax = opencl_fmax,
84 |     fmin = opencl_fmin,
85 |     fmod = opencl_fmod,
86 |     floor = opencl_floor,
87 |     isinf = opencl_isinf,
88 |     isnan = opencl_isnan,
89 |     isfinite = opencl_isfinite,
90 |     min = opencl_min,
91 |     max = opencl_max,
92 | };
93 |
--------------------------------------------------------------------------------
/platforms/artic/intrinsics_math.impala:
--------------------------------------------------------------------------------
1 | mod math_builtins { // compiler-lowered math (cc = "builtin"); note isnan/isfinite return bool here, unlike the i32-returning platform imports
2 |
3 | #[import(cc = "builtin")] fn fabs[T](T) -> T;
4 | #[import(cc = "builtin")] fn copysign[T](T, T) -> T;
5 | #[import(cc = "builtin")] fn signbit[T](T) -> bool;
6 | #[import(cc = "builtin")] fn round[T](T) -> T;
7 | #[import(cc = "builtin")] fn ceil[T](T) -> T;
8 | #[import(cc = "builtin")] fn floor[T](T) -> T;
9 | #[import(cc = "builtin")] fn fmin[T](T, T) -> T;
10 | #[import(cc = "builtin")] fn fmax[T](T, T) -> T;
11 | #[import(cc = "builtin")] fn cos[T](T) -> T;
12 | #[import(cc = "builtin")] fn sin[T](T) -> T;
13 | #[import(cc = "builtin")] fn tan[T](T) -> T;
14 | #[import(cc = "builtin")] fn acos[T](T) -> T;
15 | #[import(cc = "builtin")] fn asin[T](T) -> T;
16 | #[import(cc = "builtin")] fn atan[T](T) -> T;
17 | #[import(cc = "builtin")] fn atan2[T](T, T) -> T;
18 | #[import(cc = "builtin")] fn sqrt[T](T) -> T;
19 | #[import(cc = "builtin")] fn cbrt[T](T) -> T;
20 | #[import(cc = "builtin")] fn pow[T](T, T) -> T;
21 | #[import(cc = "builtin")] fn exp[T](T) -> T;
22 | #[import(cc = "builtin")] fn exp2[T](T) -> T;
23 | #[import(cc = "builtin")] fn log[T](T) -> T;
24 | #[import(cc = "builtin")] fn log2[T](T) -> T;
25 | #[import(cc = "builtin")] fn log10[T](T) -> T;
26 | #[import(cc = "builtin")] fn isnan[T](T) -> bool;
27 | #[import(cc = "builtin")] fn isfinite[T](T) -> bool;
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/platforms/artic/intrinsics_opencl.impala:
--------------------------------------------------------------------------------
1 | // no declarations are emitted for "device" functions
2 | #[import(cc = "device", name = "barrier")] fn opencl_barrier(u32) -> (); // OpenCL work-group barrier; flag constants at the bottom of this group
3 | #[import(cc = "device", name = "exp")] fn opencl_expf(f32) -> f32;
4 | #[import(cc = "device", name = "exp2")] fn opencl_exp2f(f32) -> f32;
5 | #[import(cc = "device", name = "log")] fn opencl_logf(f32) -> f32;
6 | #[import(cc = "device", name = "log2")] fn opencl_log2f(f32) -> f32;
7 | #[import(cc = "device", name = "pow")] fn opencl_powf(f32, f32) -> f32;
8 | #[import(cc = "device", name = "rsqrt")] fn opencl_rsqrtf(f32) -> f32;
9 | #[import(cc = "device", name = "sqrt")] fn opencl_sqrtf(f32) -> f32;
10 | #[import(cc = "device", name = "fabs")] fn opencl_fabsf(f32) -> f32;
11 | #[import(cc = "device", name = "sin")] fn opencl_sinf(f32) -> f32;
12 | #[import(cc = "device", name = "cos")] fn opencl_cosf(f32) -> f32;
13 | #[import(cc = "device", name = "tan")] fn opencl_tanf(f32) -> f32;
14 | #[import(cc = "device", name = "asin")] fn opencl_asinf(f32) -> f32;
15 | #[import(cc = "device", name = "acos")] fn opencl_acosf(f32) -> f32;
16 | #[import(cc = "device", name = "atan")] fn opencl_atanf(f32) -> f32;
17 | #[import(cc = "device", name = "erf")] fn opencl_erff(f32) -> f32;
18 | #[import(cc = "device", name = "atan2")] fn opencl_atan2f(f32, f32) -> f32;
19 | #[import(cc = "device", name = "fmod")] fn opencl_fmodf(f32, f32) -> f32;
20 | #[import(cc = "device", name = "floor")] fn opencl_floorf(f32) -> f32;
21 | #[import(cc = "device", name = "isinf")] fn opencl_isinff(f32) -> i32;
22 | #[import(cc = "device", name = "isnan")] fn opencl_isnanf(f32) -> i32;
23 | #[import(cc = "device", name = "isfinite")] fn opencl_isfinitef(f32) -> i32;
24 | #[import(cc = "device", name = "fma")] fn opencl_fmaf(f32, f32, f32) -> f32;
25 | #[import(cc = "device", name = "mad")] fn opencl_madf(f32, f32, f32) -> f32;
26 | #[import(cc = "device", name = "copysign")] fn opencl_copysignf(f32, f32) -> f32;
27 | #[import(cc = "device", name = "exp")] fn opencl_exp(f64) -> f64;
28 | #[import(cc = "device", name = "exp2")] fn opencl_exp2(f64) -> f64;
29 | #[import(cc = "device", name = "log")] fn opencl_log(f64) -> f64;
30 | #[import(cc = "device", name = "log2")] fn opencl_log2(f64) -> f64;
31 | #[import(cc = "device", name = "pow")] fn opencl_pow(f64, f64) -> f64;
32 | #[import(cc = "device", name = "rsqrt")] fn opencl_rsqrt(f64) -> f64;
33 | #[import(cc = "device", name = "sqrt")] fn opencl_sqrt(f64) -> f64;
34 | #[import(cc = "device", name = "fabs")] fn opencl_fabs(f64) -> f64;
35 | #[import(cc = "device", name = "sin")] fn opencl_sin(f64) -> f64;
36 | #[import(cc = "device", name = "cos")] fn opencl_cos(f64) -> f64;
37 | #[import(cc = "device", name = "tan")] fn opencl_tan(f64) -> f64;
38 | #[import(cc = "device", name = "asin")] fn opencl_asin(f64) -> f64;
39 | #[import(cc = "device", name = "acos")] fn opencl_acos(f64) -> f64;
40 | #[import(cc = "device", name = "atan")] fn opencl_atan(f64) -> f64;
41 | #[import(cc = "device", name = "erf")] fn opencl_erf(f64) -> f64;
42 | #[import(cc = "device", name = "atan2")] fn opencl_atan2(f64, f64) -> f64;
43 | #[import(cc = "device", name = "fmod")] fn opencl_fmod(f64, f64) -> f64;
44 | #[import(cc = "device", name = "floor")] fn opencl_floor(f64) -> f64;
45 | #[import(cc = "device", name = "isinf")] fn opencl_isinf(f64) -> i32;
46 | #[import(cc = "device", name = "isnan")] fn opencl_isnan(f64) -> i32;
47 | #[import(cc = "device", name = "isfinite")] fn opencl_isfinite(f64) -> i32;
48 | #[import(cc = "device", name = "fma")] fn opencl_fma(f64, f64, f64) -> f64;
49 | #[import(cc = "device", name = "mad")] fn opencl_mad(f64, f64, f64) -> f64;
50 | #[import(cc = "device", name = "copysign")] fn opencl_copysign(f64, f64) -> f64;
51 | #[import(cc = "device", name = "fmin")] fn opencl_fminf(f32, f32) -> f32;
52 | #[import(cc = "device", name = "fmax")] fn opencl_fmaxf(f32, f32) -> f32;
53 | #[import(cc = "device", name = "fmin")] fn opencl_fmin(f64, f64) -> f64;
54 | #[import(cc = "device", name = "fmax")] fn opencl_fmax(f64, f64) -> f64;
55 | #[import(cc = "device", name = "min")] fn opencl_min(i32, i32) -> i32;
56 | #[import(cc = "device", name = "max")] fn opencl_max(i32, i32) -> i32;
57 | #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global(&mut addrspace(1)i32, i32) -> i32; // addrspace(1): global memory (per the _global naming)
58 | #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global_f32(&mut addrspace(1)f32, f32) -> f32;
59 | #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_shared(&mut addrspace(3)i32, i32) -> i32; // addrspace(3): local/shared memory (per the _shared naming)
60 | #[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_global(&mut addrspace(1)i32, i32) -> i32;
61 | #[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_shared(&mut addrspace(3)i32, i32) -> i32;
62 | #[import(cc = "device", name = "get_work_dim")] fn opencl_get_work_dim() -> u32;
63 | #[import(cc = "device", name = "get_global_size")] fn opencl_get_global_size(u32) -> u64;
64 | #[import(cc = "device", name = "get_global_id")] fn opencl_get_global_id(u32) -> u64;
65 | #[import(cc = "device", name = "get_local_size")] fn opencl_get_local_size(u32) -> u64;
66 | #[import(cc = "device", name = "get_local_id")] fn opencl_get_local_id(u32) -> u64;
67 | #[import(cc = "device", name = "get_num_groups")] fn opencl_get_num_groups(u32) -> u64;
68 | #[import(cc = "device", name = "get_group_id")] fn opencl_get_group_id(u32) -> u64;
69 | #[import(cc = "device", name = "get_global_offset")] fn opencl_get_global_offset(u32) -> u64;
70 |
71 | static CLK_LOCAL_MEM_FENCE = 1:u32; // NOTE(review): assumed to match the OpenCL C header macro value — confirm
72 | static CLK_GLOBAL_MEM_FENCE = 2:u32;
73 |
73 |
74 | fn @opencl_accelerator(dev: i32) = Accelerator { // OpenCL accelerator: work-item queries via the get_* device functions above
75 |     exec = @|body| |grid, block| {
76 |         let work_item = WorkItem {
77 |             tidx = @|| opencl_get_local_id(0) as i32,
78 |             tidy = @|| opencl_get_local_id(1) as i32,
79 |             tidz = @|| opencl_get_local_id(2) as i32,
80 |             bidx = @|| opencl_get_group_id(0) as i32,
81 |             bidy = @|| opencl_get_group_id(1) as i32,
82 |             bidz = @|| opencl_get_group_id(2) as i32,
83 |             gidx = @|| opencl_get_global_id(0) as i32,
84 |             gidy = @|| opencl_get_global_id(1) as i32,
85 |             gidz = @|| opencl_get_global_id(2) as i32,
86 |             bdimx = @|| opencl_get_local_size(0) as i32,
87 |             bdimy = @|| opencl_get_local_size(1) as i32,
88 |             bdimz = @|| opencl_get_local_size(2) as i32,
89 |             gdimx = @|| opencl_get_global_size(0) as i32,
90 |             gdimy = @|| opencl_get_global_size(1) as i32,
91 |             gdimz = @|| opencl_get_global_size(2) as i32,
92 |             nblkx = @|| opencl_get_num_groups(0) as i32,
93 |             nblky = @|| opencl_get_num_groups(1) as i32,
94 |             nblkz = @|| opencl_get_num_groups(2) as i32
95 |         };
96 |         opencl(dev, grid, block, || @body(work_item)) // launch through the thorin opencl intrinsic
97 |     },
98 |     sync = @|| synchronize_opencl(dev),
99 |     alloc = @|size| alloc_opencl(dev, size),
100 |     alloc_unified = @|size| alloc_opencl_unified(dev, size),
101 |     barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE),
102 | };
103 |
104 | fn spv_cl_get_num_groups() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](24 /* BuiltInNumWorkgroups */); // SPIR-V builtin equivalents of the get_* queries, used by the SPIR-V path below
105 | fn spv_cl_get_local_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](25 /* BuiltInWorkgroupSize */);
106 | fn spv_cl_get_group_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](26 /* BuiltInWorkgroupId */);
107 | fn spv_cl_get_local_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](27 /* BuiltInLocalInvocationId */);
108 | fn spv_cl_get_global_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](28 /* BuiltInGlobalInvocationId */);
109 | fn spv_cl_get_global_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](31 /* BuiltInGlobalSize */);
110 |
111 | fn @opencl_spirv_accelerator(dev: i32) = Accelerator { // same shape as opencl_accelerator, but reads ids via SPIR-V builtins and launches via opencl_spirv
112 |     exec = @|body| |grid, block| {
113 |         let work_item = WorkItem {
114 |             tidx = @|| spv_cl_get_local_id()(0) as i32,
115 |             tidy = @|| spv_cl_get_local_id()(1) as i32,
116 |             tidz = @|| spv_cl_get_local_id()(2) as i32,
117 |             bidx = @|| spv_cl_get_group_id()(0) as i32,
118 |             bidy = @|| spv_cl_get_group_id()(1) as i32,
119 |             bidz = @|| spv_cl_get_group_id()(2) as i32,
120 |             gidx = @|| spv_cl_get_global_id()(0) as i32,
121 |             gidy = @|| spv_cl_get_global_id()(1) as i32,
122 |             gidz = @|| spv_cl_get_global_id()(2) as i32,
123 |             bdimx = @|| spv_cl_get_local_size()(0) as i32,
124 |             bdimy = @|| spv_cl_get_local_size()(1) as i32,
125 |             bdimz = @|| spv_cl_get_local_size()(2) as i32,
126 |             gdimx = @|| spv_cl_get_global_size()(0) as i32,
127 |             gdimy = @|| spv_cl_get_global_size()(1) as i32,
128 |             gdimz = @|| spv_cl_get_global_size()(2) as i32,
129 |             nblkx = @|| spv_cl_get_num_groups()(0) as i32,
130 |             nblky = @|| spv_cl_get_num_groups()(1) as i32,
131 |             nblkz = @|| spv_cl_get_num_groups()(2) as i32
132 |         };
133 |         opencl_spirv(dev, grid, block, || @body(work_item))
134 |     },
135 |     sync = @|| synchronize_opencl(dev),
136 |     alloc = @|size| alloc_opencl(dev, size),
137 |     alloc_unified = @|size| alloc_opencl_unified(dev, size),
138 |     barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE),
139 | };
140 |
141 | static opencl_intrinsics = Intrinsics { // math table shared by the OpenCL and OpenCL-SPIR-V paths
142 |     expf = opencl_expf,
143 |     exp2f = opencl_exp2f,
144 |     logf = opencl_logf,
145 |     log2f = opencl_log2f,
146 |     powf = opencl_powf,
147 |     rsqrtf = opencl_rsqrtf,
148 |     sqrtf = opencl_sqrtf,
149 |     fabsf = opencl_fabsf,
150 |     sinf = opencl_sinf,
151 |     cosf = opencl_cosf,
152 |     tanf = opencl_tanf,
153 |     asinf = opencl_asinf,
154 |     acosf = opencl_acosf,
155 |     atanf = opencl_atanf,
156 |     erff = opencl_erff,
157 |     atan2f = opencl_atan2f,
158 |     copysignf = opencl_copysignf,
159 |     fmaf = opencl_fmaf,
160 |     fmaxf = opencl_fmaxf,
161 |     fminf = opencl_fminf,
162 |     fmodf = opencl_fmodf,
163 |     floorf = opencl_floorf,
164 |     isinff = opencl_isinff,
165 |     isnanf = opencl_isnanf,
166 |     isfinitef = opencl_isfinitef,
167 |     exp = opencl_exp,
168 |     exp2 = opencl_exp2,
169 |     log = opencl_log,
170 |     log2 = opencl_log2,
171 |     pow = opencl_pow,
172 |     rsqrt = opencl_rsqrt,
173 |     sqrt = opencl_sqrt,
174 |     fabs = opencl_fabs,
175 |     sin = opencl_sin,
176 |     cos = opencl_cos,
177 |     tan = opencl_tan,
178 |     asin = opencl_asin,
179 |     acos = opencl_acos,
180 |     atan = opencl_atan,
181 |     erf = opencl_erf,
182 |     atan2 = opencl_atan2,
183 |     copysign = opencl_copysign,
184 |     fma = opencl_fma,
185 |     fmax = opencl_fmax,
186 |     fmin = opencl_fmin,
187 |     fmod = opencl_fmod,
188 |     floor = opencl_floor,
189 |     isinf = opencl_isinf,
190 |     isnan = opencl_isnan,
191 |     isfinite = opencl_isfinite,
192 |     min = opencl_min,
193 |     max = opencl_max,
194 | };
195 |
--------------------------------------------------------------------------------
/platforms/artic/intrinsics_rv.impala:
--------------------------------------------------------------------------------
1 | #[import(cc = "C")] fn rv_mask() -> bool; // RV (Region Vectorizer) intrinsics — NOTE(review): presumably resolved by the RV-enabled compiler; confirm
2 | #[import(cc = "C")] fn rv_any(_: bool) -> bool;
3 | #[import(cc = "C")] fn rv_all(_: bool) -> bool;
4 | #[import(cc = "C")] fn rv_ballot(_: bool) -> i32;
5 | #[import(cc = "C")] fn rv_extract(_: f32, _: i32) -> f32;
6 | #[import(cc = "C")] fn rv_insert(_: f32, _: i32, _: f32) -> f32;
7 | #[import(cc = "C")] fn rv_load(_: &f32, _: i32) -> f32;
8 | #[import(cc = "C")] fn rv_store(_: &mut f32, _: i32, _: f32) -> ();
9 | #[import(cc = "C")] fn rv_shuffle(_: f32, _: i32) -> f32;
10 | #[import(cc = "C")] fn rv_align(_: &i8, _: i32)-> &i8;
11 | #[import(cc = "C")] fn rv_compact(_: f32, _: bool) -> f32;
12 | #[import(cc = "C")] fn rv_lane_id() -> i32;
13 | #[import(cc = "C")] fn rv_num_lanes() -> i32;
14 |
--------------------------------------------------------------------------------
/platforms/artic/intrinsics_spirv.impala:
--------------------------------------------------------------------------------
1 | #[import(cc = "device", name = "spirv.builtin")] fn spirv_get_builtin[T](i32) -> T; // generic accessor for a SPIR-V builtin variable, selected by its BuiltIn decoration id
--------------------------------------------------------------------------------
/platforms/artic/intrinsics_thorin.impala:
--------------------------------------------------------------------------------
1 | #[import(cc = "builtin")] fn undef[T]() -> T;
2 | #[import(cc = "builtin")] fn sizeof[_]() -> i64;
3 | #[import(cc = "builtin")] fn alignof[_]() -> i64;
4 | #[import(cc = "builtin")] fn bitcast[T, U](_src: U) -> T;
5 | #[import(cc = "builtin")] fn select[T, U](_cond: T, _true: U, _false: U) -> U;
6 | #[import(cc = "builtin")] fn insert[T, U](_tuple: T, _index: i32, _value: U) -> T;
7 |
8 | #[import(cc = "thorin")] fn atomic[T](_binop: u32, _addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> T; // Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
9 | #[import(cc = "thorin")] fn atomic_load[T](_addr: &T, _order: u32, _scope: &[u8]) -> T;
10 | #[import(cc = "thorin")] fn atomic_store[T](_addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> ();
11 | #[import(cc = "thorin")] fn cmpxchg[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types
12 | #[import(cc = "thorin")] fn cmpxchg_weak[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types
13 | #[import(cc = "thorin")] fn fence(_order: u32, _scope: &[u8]) -> ();
14 | #[import(cc = "thorin")] fn pe_info[T](_src: &[u8], _val: T) -> ();
15 | #[import(cc = "thorin")] fn cuda(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
16 | #[import(cc = "thorin")] fn nvvm(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
17 | #[import(cc = "thorin")] fn opencl(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
18 | #[import(cc = "thorin")] fn opencl_spirv(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
19 | #[import(cc = "thorin")] fn amdgpu_hsa(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
20 | #[import(cc = "thorin")] fn amdgpu_pal(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
21 | #[import(cc = "thorin")] fn levelzero(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
22 | #[import(cc = "thorin")] fn reserve_shared[T](_size: i32) -> &mut addrspace(3)[T];
23 | #[import(cc = "thorin")] fn hls(_dev: i32, _body: fn() -> ()) -> (); // note: HLS launch takes no grid/block, unlike the other device launches above
24 | #[import(cc = "thorin", name = "pipeline")] fn thorin_pipeline(_initiation_interval: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> (); // only for HLS/OpenCL backend
25 | #[import(cc = "thorin", name = "parallel")] fn thorin_parallel(_num_threads: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> ();
26 | #[import(cc = "thorin", name = "spawn")] fn thorin_spawn(_body: fn() -> ()) -> i32;
27 | #[import(cc = "thorin")] fn sync(_id: i32) -> (); // NOTE(review): presumably joins the id returned by thorin_spawn — confirm
28 | #[import(cc = "thorin")] fn vectorize(_vector_length: i32, _body: fn(i32) -> ()) -> ();
29 |
30 | #[import(cc = "thorin", name = "atomic")] fn atomic_p1[T](_binop: u32, _addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> T; // addrspace-qualified variants of the atomics above
31 | #[import(cc = "thorin", name = "atomic")] fn atomic_p3[T](_binop: u32, _addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> T;
32 | #[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p1[T](_addr: &addrspace(1)T, _order: u32, _scope: &[u8]) -> T;
33 | #[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p3[T](_addr: &addrspace(3)T, _order: u32, _scope: &[u8]) -> T;
34 | #[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p1[T](_addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> ();
35 | #[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p3[T](_addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> ();
36 | #[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
37 | #[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
38 | #[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
39 | #[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
40 |
41 | fn @pipeline(body: fn(i32) -> ()) = @|ii: i32, lo: i32, hi: i32| thorin_pipeline(ii, lo, hi, body); // curried wrapper: fix the loop body, then supply (initiation interval, bounds)
42 | fn @parallel(body: fn(i32) -> ()) = @|threads: i32, lo: i32, hi: i32| thorin_parallel(threads, lo, hi, body); // curried wrapper: fix the loop body, then supply (thread count, bounds)
43 | fn @spawn(body: fn() -> ()) = @|| thorin_spawn(body); // curried wrapper around thorin_spawn; returns the spawned thread id
44 |
--------------------------------------------------------------------------------
/platforms/artic/runtime.impala:
--------------------------------------------------------------------------------
1 | #[import(cc = "C", name = "anydsl_info")] fn runtime_info() -> ();
2 | #[import(cc = "C", name = "anydsl_device_name")] fn runtime_device_name(_device: i32) -> &[u8];
3 | #[import(cc = "C", name = "anydsl_device_check_feature_support")] fn runtime_device_check_feature_support(_device: i32, _feature: &[u8]) -> bool;
4 |
5 | #[import(cc = "C", name = "anydsl_alloc")] fn runtime_alloc(_device: i32, _size: i64) -> &mut [i8];
6 | #[import(cc = "C", name = "anydsl_alloc_host")] fn runtime_alloc_host(_device: i32, _size: i64) -> &mut [i8];
7 | #[import(cc = "C", name = "anydsl_alloc_unified")] fn runtime_alloc_unified(_device: i32, _size: i64) -> &mut [i8];
8 | #[import(cc = "C", name = "anydsl_copy")] fn runtime_copy(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> (); // byte-wise copy between (device, ptr, offset) triples; src and dst devices may differ
9 | #[import(cc = "C", name = "anydsl_get_device_ptr")] fn runtime_get_device_ptr(_device: i32, _ptr: &[i8]) -> &[i8];
10 | #[import(cc = "C", name = "anydsl_synchronize")] fn runtime_synchronize(_device: i32) -> ();
11 | #[import(cc = "C", name = "anydsl_release")] fn runtime_release(_device: i32, _ptr: &[i8]) -> ();
12 | #[import(cc = "C", name = "anydsl_release_host")] fn runtime_release_host(_device: i32, _ptr: &[i8]) -> ();
13 |
14 | #[import(cc = "C", name = "anydsl_random_seed")] fn random_seed(_: u32) -> ();
15 | #[import(cc = "C", name = "anydsl_random_val_f32")] fn random_val_f32() -> f32;
16 | #[import(cc = "C", name = "anydsl_random_val_u64")] fn random_val_u64() -> u64;
17 |
18 | #[import(cc = "C", name = "anydsl_get_micro_time")] fn get_micro_time() -> i64;
19 | #[import(cc = "C", name = "anydsl_get_nano_time")] fn get_nano_time() -> i64;
20 | #[import(cc = "C", name = "anydsl_get_kernel_time")] fn get_kernel_time() -> i64; // NOTE(review): presumably accumulated device-kernel time — confirm units against the runtime implementation
21 |
22 | #[import(cc = "C", name = "anydsl_print_i16")] fn print_i16(_: i16) -> ();
23 | #[import(cc = "C", name = "anydsl_print_i32")] fn print_i32(_: i32) -> ();
24 | #[import(cc = "C", name = "anydsl_print_i64")] fn print_i64(_: i64) -> ();
25 | #[import(cc = "C", name = "anydsl_print_u16")] fn print_u16(_: u16) -> ();
26 | #[import(cc = "C", name = "anydsl_print_u32")] fn print_u32(_: u32) -> ();
27 | #[import(cc = "C", name = "anydsl_print_u64")] fn print_u64(_: u64) -> ();
28 | #[import(cc = "C", name = "anydsl_print_f32")] fn print_f32(_: f32) -> ();
29 | #[import(cc = "C", name = "anydsl_print_f64")] fn print_f64(_: f64) -> ();
30 | #[import(cc = "C", name = "anydsl_print_char")] fn print_char(_: u8) -> ();
31 | #[import(cc = "C", name = "anydsl_print_string")] fn print_string(_: &[u8]) -> ();
32 | #[import(cc = "C", name = "anydsl_print_flush")] fn print_flush() -> ();
33 |
34 | // TODO
35 | //struct Buffer[T] {
36 | // data : &mut [T],
37 | // size : i64,
38 | // device : i32
39 | //}
40 | //
41 | //fn @alloc[T](device: i32, size: i64) = Buffer[T] {
42 | // data = runtime_alloc(device, size * sizeof[T]()) as &mut [T],
43 | // size = size,
44 | // device = device
45 | //};
46 | //fn @alloc_host[T](device: i32, size: i64) = Buffer[T] {
47 | // data = runtime_alloc_host(device, size * sizeof[T]()) as &mut [T],
48 | // size = size,
49 | // device = device
50 | //};
51 | //fn @alloc_unified[T](device: i32, size: i64) = Buffer[T] {
52 | // data = runtime_alloc_unified(device, size * sizeof[T]()) as &mut [T],
53 | // size = size,
54 | // device = device
55 | //};
56 | //
57 | //fn @release[T](buf: Buffer[T]) = runtime_release(buf.device, buf.data as &[i8]);
58 | //fn @alloc_cpu[T](size: i64) = alloc[T](0, size);
59 | //fn @alloc_cuda[T](dev: i32, size: i64) = alloc[T](runtime_device(1, dev), size);
60 | //fn @alloc_cuda_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(1, dev), size);
61 | //fn @alloc_cuda_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(1, dev), size);
62 | //fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev));
63 | //fn @alloc_opencl[T](dev: i32, size: i64) = alloc[T](runtime_device(2, dev), size);
64 | //fn @alloc_opencl_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(2, dev), size);
65 | //fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev));
66 | //fn @alloc_hsa[T](dev: i32, size: i64) = alloc[T](runtime_device(3, dev), size);
67 | //fn @alloc_hsa_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(3, dev), size);
68 | //fn @alloc_hsa_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(3, dev), size);
69 | //fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev));
70 | //fn @alloc_pal[T](dev: i32, size: i64) = alloc[T](runtime_device(4, dev), size);
71 | //fn @alloc_pal_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(4, dev), size);
72 | //fn @alloc_pal_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(4, dev), size);
73 | //fn @synchronize_pal(dev: i32) = runtime_synchronize(runtime_device(4, dev));
74 | //
75 | //fn @copy[T](src: Buffer[T], dst: Buffer[T]) = runtime_copy(src.device, src.data as &[i8], 0, dst.device, dst.data as &mut [i8], 0, src.size);
76 | //fn @copy_offset[T](src: Buffer[T], off_src: i64, dst: Buffer[T], off_dst: i64, size: i64) = runtime_copy(src.device, src.data as &[i8], off_src, dst.device, dst.data as &mut [i8], off_dst, size);
77 |
78 | struct Buffer { // untyped buffer handle: raw byte pointer + byte size + owning device id
79 |     data : &mut [i8],
80 |     size : i64,  // size in bytes
81 |     device : i32 // packed id, see runtime_device(): platform | (device << 4)
82 | }
83 |
84 | fn @alloc(device: i32, size: i64) = Buffer { // allocate `size` bytes of device memory on the packed `device` id
85 |     data = runtime_alloc(device, size),
86 |     size = size,
87 |     device = device
88 | };
89 | fn @alloc_host(device: i32, size: i64) = Buffer { // host-side allocation associated with `device` (via runtime_alloc_host)
90 |     data = runtime_alloc_host(device, size),
91 |     size = size,
92 |     device = device
93 | };
94 | fn @alloc_unified(device: i32, size: i64) = Buffer { // unified allocation -- presumably visible to both host and device; confirm in the C runtime
95 |     data = runtime_alloc_unified(device, size),
96 |     size = size,
97 |     device = device
98 | };
99 | fn @release(buf: Buffer) = runtime_release(buf.device, buf.data); // free a buffer on whichever device owns it
100 |
101 | fn @runtime_device(platform: i32, device: i32) -> i32 { (device << 4) | platform } // pack (platform, device) into one id: low 4 bits hold the platform, the remaining bits the device index
102 |
103 | fn @alloc_cpu(size: i64) = alloc(0, size); // platform 0 = host CPU
104 | fn @alloc_cuda(dev: i32, size: i64) = alloc(runtime_device(1, dev), size); // platform 1 = CUDA
105 | fn @alloc_cuda_host(dev: i32, size: i64) = alloc_host(runtime_device(1, dev), size);
106 | fn @alloc_cuda_unified(dev: i32, size: i64) = alloc_unified(runtime_device(1, dev), size);
107 | fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev));
108 | fn @alloc_opencl(dev: i32, size: i64) = alloc(runtime_device(2, dev), size); // platform 2 = OpenCL
109 | fn @alloc_opencl_unified(dev: i32, size: i64) = alloc_unified(runtime_device(2, dev), size);
110 | fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev));
111 | fn @alloc_hls(dev: i32, size: i64) = alloc(runtime_device(2, dev), size); // HLS deliberately shares platform id 2 with OpenCL; rewritten in `=` form for consistency with the other wrappers
112 | fn @alloc_hls_unified(dev: i32, size: i64) = alloc_unified(runtime_device(2, dev), size);
113 | fn @synchronize_hls(dev: i32) = runtime_synchronize(runtime_device(2, dev));
114 | fn @alloc_hsa(dev: i32, size: i64) = alloc(runtime_device(3, dev), size); // platform 3 = HSA
115 | fn @alloc_hsa_host(dev: i32, size: i64) = alloc_host(runtime_device(3, dev), size);
116 | fn @alloc_hsa_unified(dev: i32, size: i64) = alloc_unified(runtime_device(3, dev), size);
117 | fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev));
118 | fn @alloc_pal(dev: i32, size: i64) = alloc(runtime_device(4, dev), size); // platform 4 = PAL
119 | fn @alloc_pal_host(dev: i32, size: i64) = alloc_host(runtime_device(4, dev), size);
120 | fn @alloc_pal_unified(dev: i32, size: i64) = alloc_unified(runtime_device(4, dev), size);
121 | fn @synchronize_pal(dev: i32) = runtime_synchronize(runtime_device(4, dev));
122 | fn @alloc_levelzero(dev: i32, size: i64) = alloc(runtime_device(5, dev), size); // platform 5 = Level Zero
123 | fn @alloc_levelzero_host(dev: i32, size: i64) = alloc_host(runtime_device(5, dev), size);
124 | fn @alloc_levelzero_unified(dev: i32, size: i64) = alloc_unified(runtime_device(5, dev), size);
125 | fn @synchronize_levelzero(dev: i32) = runtime_synchronize(runtime_device(5, dev));
126 |
127 | fn @copy(src: Buffer, dst: Buffer) = runtime_copy(src.device, src.data, 0, dst.device, dst.data, 0, src.size); // copy src.size bytes; both device ids are passed, so source and destination may live on different devices
128 | fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) = runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size); // copy `size` bytes starting at the given byte offsets
129 |
130 |
131 | // range, range_step, unroll, unroll_step, etc.
132 | fn @unroll_step(body: fn(i32) -> ()) { // apply `body` to beg, beg+step, ... while < end (tail-recursive loop)
133 |     fn @(?beg & ?end & ?step) loop(beg: i32, end: i32, step: i32) -> () { // PE filter: specialize (unroll) only when all three bounds are statically known
134 |         if beg < end {
135 |             @body(beg);
136 |             loop(beg + step, end, step)
137 |         }
138 |     }
139 |     loop
140 | }
141 |
142 | fn @unroll_step_rev(body: fn(i32) -> ()) { // reverse counterpart: visits end, end-step, ... while > beg (note: `end` is included, `beg` is excluded)
143 |     fn @(?beg & ?end & ?step) loop(end: i32, beg: i32, step: i32) -> () {
144 |         if end > beg {
145 |             @body(end);
146 |             loop(end - step, beg, step)
147 |         }
148 |     }
149 |     loop
150 | }
151 |
152 | fn @range(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)($lower, $upper, 1); // `$` shields the bounds from the partial evaluator, so `range` is never unrolled
153 | fn @range_step(body: fn(i32) -> ()) = @|lower: i32, upper: i32, step: i32| unroll_step(body)($lower, $upper, step);
154 | fn @range_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)($upper, $lower, 1); // fix: bounds were unshielded, which made range_rev identical to unroll_rev and fully unrolled constant-bound loops
155 |
156 | fn @unroll(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)(lower, upper, 1); // bounds stay visible to the partial evaluator: constant trip counts are fully unrolled
157 | fn @unroll_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)(upper, lower, 1); // reverse order; `upper` inclusive, `lower` exclusive (see unroll_step_rev)
158 |
--------------------------------------------------------------------------------
/platforms/impala/intrinsics.impala:
--------------------------------------------------------------------------------
1 | struct WorkItem { // per-work-item index accessors; canonical mapping in opencl_accelerator: tid* = local id, bid* = group id, gid* = global id, bdim* = local size, gdim* = global size, nblk* = group count
2 |     tidx  : fn() -> i32,
3 |     tidy  : fn() -> i32,
4 |     tidz  : fn() -> i32,
5 |     bidx  : fn() -> i32,
6 |     bidy  : fn() -> i32,
7 |     bidz  : fn() -> i32,
8 |     gidx  : fn() -> i32,
9 |     gidy  : fn() -> i32,
10 |     gidz  : fn() -> i32,
11 |     bdimx : fn() -> i32,
12 |     bdimy : fn() -> i32,
13 |     bdimz : fn() -> i32,
14 |     gdimx : fn() -> i32,
15 |     gdimy : fn() -> i32,
16 |     gdimz : fn() -> i32,
17 |     nblkx : fn() -> i32,
18 |     nblky : fn() -> i32,
19 |     nblkz : fn() -> i32
20 | }
21 |
22 | struct Accelerator { // device abstraction: launch a kernel over a grid/block shape, synchronize, and allocate memory
23 |     exec          : fn((i32, i32, i32), // grid
24 |                        (i32, i32, i32), // block
25 |                        fn(WorkItem) -> ()) -> (),
26 |     sync          : fn() -> (), // device synchronization (runtime_synchronize in all instances below)
27 |     alloc         : fn(i64) -> Buffer,
28 |     alloc_unified : fn(i64) -> Buffer,
29 |     barrier       : fn() -> () // in-kernel barrier; a no-op on single-work-item backends such as HLS
30 | }
31 |
32 | struct Intrinsics { // backend-uniform math table; cpu_intrinsics / hls_intrinsics / opencl_intrinsics each provide an instance
33 |     expf      : fn(f32) -> f32,
34 |     exp2f     : fn(f32) -> f32,
35 |     logf      : fn(f32) -> f32,
36 |     log2f     : fn(f32) -> f32,
37 |     powf      : fn(f32, f32) -> f32,
38 |     rsqrtf    : fn(f32) -> f32,
39 |     sqrtf     : fn(f32) -> f32,
40 |     fabsf     : fn(f32) -> f32,
41 |     sinf      : fn(f32) -> f32,
42 |     cosf      : fn(f32) -> f32,
43 |     tanf      : fn(f32) -> f32,
44 |     asinf     : fn(f32) -> f32,
45 |     acosf     : fn(f32) -> f32,
46 |     atanf     : fn(f32) -> f32,
47 |     erff      : fn(f32) -> f32,
48 |     atan2f    : fn(f32, f32) -> f32,
49 |     copysignf : fn(f32, f32) -> f32,
50 |     fmaf      : fn(f32, f32, f32) -> f32,
51 |     fmaxf     : fn(f32, f32) -> f32,
52 |     fminf     : fn(f32, f32) -> f32,
53 |     fmodf     : fn(f32, f32) -> f32,
54 |     floorf    : fn(f32) -> f32,
55 |     isinff    : fn(f32) -> i32,
56 |     isnanf    : fn(f32) -> i32,
57 |     isfinitef : fn(f32) -> i32,
58 |     exp       : fn(f64) -> f64,
59 |     exp2      : fn(f64) -> f64,
60 |     log       : fn(f64) -> f64,
61 |     log2      : fn(f64) -> f64,
62 |     pow       : fn(f64, f64) -> f64,
63 |     rsqrt     : fn(f64) -> f64,
64 |     sqrt      : fn(f64) -> f64,
65 |     fabs      : fn(f64) -> f64,
66 |     sin       : fn(f64) -> f64,
67 |     cos       : fn(f64) -> f64,
68 |     tan       : fn(f64) -> f64,
69 |     asin      : fn(f64) -> f64,
70 |     acos      : fn(f64) -> f64,
71 |     atan      : fn(f64) -> f64,
72 |     erf       : fn(f64) -> f64,
73 |     atan2     : fn(f64, f64) -> f64,
74 |     copysign  : fn(f64, f64) -> f64,
75 |     fma       : fn(f64, f64, f64) -> f64,
76 |     fmax      : fn(f64, f64) -> f64,
77 |     fmin      : fn(f64, f64) -> f64,
78 |     fmod      : fn(f64, f64) -> f64,
79 |     floor     : fn(f64) -> f64,
80 |     isinf     : fn(f64) -> i32,
81 |     isnan     : fn(f64) -> i32,
82 |     isfinite  : fn(f64) -> i32,
83 |     min       : fn(i32, i32) -> i32,
84 |     max       : fn(i32, i32) -> i32,
85 | }
86 |
--------------------------------------------------------------------------------
/platforms/impala/intrinsics_cpu.impala:
--------------------------------------------------------------------------------
1 | extern "C" {
2 | //fn "sinf" cpu_sinf(f32) -> f32;
3 | //fn "cosf" cpu_cosf(f32) -> f32;
4 | fn "tanf" cpu_tanf(f32) -> f32;
5 | fn "asinf" cpu_asinf(f32) -> f32;
6 | fn "acosf" cpu_acosf(f32) -> f32;
7 | fn "atanf" cpu_atanf(f32) -> f32;
8 | fn "erff" cpu_erff(f32) -> f32;
9 | fn "fmodf" cpu_fmodf(f32, f32) -> f32;
10 | fn "atan2f" cpu_atan2f(f32, f32) -> f32;
11 | fn "anydsl_isinff" cpu_isinff(f32) -> i32;
12 | fn "anydsl_isnanf" cpu_isnanf(f32) -> i32;
13 | fn "anydsl_isfinitef" cpu_isfinitef(f32) -> i32;
14 | //fn "sin" cpu_sin(f64) -> f64;
15 | //fn "cos" cpu_cos(f64) -> f64;
16 | fn "tan" cpu_tan(f64) -> f64;
17 | fn "asin" cpu_asin(f64) -> f64;
18 | fn "acos" cpu_acos(f64) -> f64;
19 | fn "atan" cpu_atan(f64) -> f64;
20 | fn "erf" cpu_erf(f64) -> f64;
21 | fn "fmod" cpu_fmod(f64, f64) -> f64;
22 | fn "atan2" cpu_atan2(f64, f64) -> f64;
23 | fn "anydsl_isinf" cpu_isinf(f64) -> i32;
24 | fn "anydsl_isnan" cpu_isnan(f64) -> i32;
25 | fn "anydsl_isfinite" cpu_isfinite(f64) -> i32;
26 | }
27 |
28 | extern "device" {
29 | fn "llvm.exp.f32" cpu_expf(f32) -> f32;
30 | fn "llvm.exp2.f32" cpu_exp2f(f32) -> f32;
31 | fn "llvm.log.f32" cpu_logf(f32) -> f32;
32 | fn "llvm.log2.f32" cpu_log2f(f32) -> f32;
33 | fn "llvm.pow.f32" cpu_powf(f32, f32) -> f32;
34 | fn "llvm.sqrt.f32" cpu_sqrtf(f32) -> f32;
35 | fn "llvm.fabs.f32" cpu_fabsf(f32) -> f32;
36 | fn "llvm.sin.f32" cpu_sinf(f32) -> f32;
37 | fn "llvm.cos.f32" cpu_cosf(f32) -> f32;
38 | fn "llvm.floor.f32" cpu_floorf(f32) -> f32;
39 | fn "llvm.fma.f32" cpu_fmaf(f32, f32, f32) -> f32;
40 | fn "llvm.fmuladd.f32" cpu_madf(f32, f32, f32) -> f32;
41 | fn "llvm.copysign.f32" cpu_copysignf(f32, f32) -> f32;
42 | fn "llvm.minnum.f32" cpu_fminf(f32, f32) -> f32;
43 | fn "llvm.maxnum.f32" cpu_fmaxf(f32, f32) -> f32;
44 | fn "llvm.exp.f64" cpu_exp(f64) -> f64;
45 | fn "llvm.exp2.f64" cpu_exp2(f64) -> f64;
46 | fn "llvm.log.f64" cpu_log(f64) -> f64;
47 | fn "llvm.log2.f64" cpu_log2(f64) -> f64;
48 | fn "llvm.pow.f64" cpu_pow(f64, f64) -> f64;
49 | fn "llvm.sqrt.f64" cpu_sqrt(f64) -> f64;
50 | fn "llvm.fabs.f64" cpu_fabs(f64) -> f64;
51 | fn "llvm.sin.f64" cpu_sin(f64) -> f64;
52 | fn "llvm.cos.f64" cpu_cos(f64) -> f64;
53 | fn "llvm.floor.f64" cpu_floor(f64) -> f64;
54 | fn "llvm.fma.f64" cpu_fma(f64, f64, f64) -> f64;
55 | fn "llvm.fmuladd.f64" cpu_mad(f64, f64, f64) -> f64;
56 | fn "llvm.copysign.f64" cpu_copysign(f64, f64) -> f64;
57 | fn "llvm.minnum.f64" cpu_fmin(f64, f64) -> f64;
58 | fn "llvm.maxnum.f64" cpu_fmax(f64, f64) -> f64;
59 | fn "llvm.ctpop.i32" cpu_popcount32(i32) -> i32;
60 | fn "llvm.ctpop.i64" cpu_popcount64(i64) -> i64;
61 | fn "llvm.ctlz.i32" cpu_clz32(i32, bool) -> i32;
62 | fn "llvm.ctlz.i64" cpu_clz64(i64, bool) -> i64;
63 | fn "llvm.cttz.i32" cpu_ctz32(i32, bool) -> i32;
64 | fn "llvm.cttz.i64" cpu_ctz64(i64, bool) -> i64;
65 | fn "llvm.x86.bmi.pext.32" cpu_pext32(i32, i32) -> i32;
66 | fn "llvm.x86.bmi.pext.64" cpu_pext64(i64, i64) -> i64;
67 | fn "llvm.x86.bmi.pdep.32" cpu_pdep32(i32, i32) -> i32;
68 | fn "llvm.x86.bmi.pdep.64" cpu_pdep64(i64, i64) -> i64;
69 | fn "llvm.prefetch.p0" cpu_prefetch(&u8, i32, i32, i32) -> ();
70 | }
71 |
72 | //
73 | // atomics
74 | // 0 1 2 3 4 5 6 7 8 9 10 11 12
75 | // operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
76 | // 0 1 2 4 5 6 7
77 | // ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent
78 | // syncscope: singlethread "" (system)
79 | //
80 |
81 | fn @cpu_atomic_xchg(a: &mut i32, b: i32) -> i32 { atomic(0u32, a, b, 7u32, "") } // op 0 = Xchg; ordering 7 = SequentiallyConsistent; "" = system syncscope (see table above)
82 | fn @cpu_atomic_add(a: &mut i32, b: i32) -> i32 { atomic(1u32, a, b, 7u32, "") }
83 | fn @cpu_atomic_sub(a: &mut i32, b: i32) -> i32 { atomic(2u32, a, b, 7u32, "") }
84 | fn @cpu_atomic_max(a: &mut i32, b: i32) -> i32 { atomic(7u32, a, b, 7u32, "") } // ops 7/8 are the signed Max/Min variants
85 | fn @cpu_atomic_min(a: &mut i32, b: i32) -> i32 { atomic(8u32, a, b, 7u32, "") }
86 |
87 | static cpu_intrinsics = Intrinsics { // CPU backend instance of the Intrinsics table (LLVM intrinsics + libm)
88 |     expf      : cpu_expf,
89 |     exp2f     : cpu_exp2f,
90 |     logf      : cpu_logf,
91 |     log2f     : cpu_log2f,
92 |     powf      : cpu_powf,
93 |     rsqrtf    : @|a| { 1.0f / cpu_sqrtf(a) }, // no rsqrt intrinsic on CPU: derived from sqrt
94 |     sqrtf     : cpu_sqrtf,
95 |     fabsf     : cpu_fabsf,
96 |     sinf      : cpu_sinf,
97 |     cosf      : cpu_cosf,
98 |     tanf      : cpu_tanf,
99 |     asinf     : cpu_asinf,
100 |     acosf     : cpu_acosf,
101 |     atanf     : cpu_atanf,
102 |     erff      : cpu_erff,
103 |     atan2f    : cpu_atan2f,
104 |     copysignf : cpu_copysignf,
105 |     fmaf      : cpu_fmaf,
106 |     fmaxf     : cpu_fmaxf,
107 |     fminf     : cpu_fminf,
108 |     fmodf     : cpu_fmodf,
109 |     floorf    : cpu_floorf,
110 |     isinff    : cpu_isinff,
111 |     isnanf    : cpu_isnanf,
112 |     isfinitef : cpu_isfinitef,
113 |     exp       : cpu_exp,
114 |     exp2      : cpu_exp2,
115 |     log       : cpu_log,
116 |     log2      : cpu_log2,
117 |     pow       : cpu_pow,
118 |     rsqrt     : @|a| { 1.0 / cpu_sqrt(a) },
119 |     sqrt      : cpu_sqrt,
120 |     fabs      : cpu_fabs,
121 |     sin       : cpu_sin,
122 |     cos       : cpu_cos,
123 |     tan       : cpu_tan,
124 |     asin      : cpu_asin,
125 |     acos      : cpu_acos,
126 |     atan      : cpu_atan,
127 |     erf       : cpu_erf,
128 |     atan2     : cpu_atan2,
129 |     copysign  : cpu_copysign,
130 |     fma       : cpu_fma,
131 |     fmax      : cpu_fmax,
132 |     fmin      : cpu_fmin,
133 |     fmod      : cpu_fmod,
134 |     floor     : cpu_floor,
135 |     isinf     : cpu_isinf,
136 |     isnan     : cpu_isnan,
137 |     isfinite  : cpu_isfinite,
138 |     min       : @|a, b| { if a < b { a } else { b } },
139 |     max       : @|a, b| { if a > b { a } else { b } },
140 | };
141 |
--------------------------------------------------------------------------------
/platforms/impala/intrinsics_hls.impala:
--------------------------------------------------------------------------------
1 | extern "device" {
2 | // no declarations are emitted for "device" functions
3 | fn "exp" hls_expf(f32) -> f32;
4 | fn "exp2" hls_exp2f(f32) -> f32;
5 | fn "log" hls_logf(f32) -> f32;
6 | fn "log2" hls_log2f(f32) -> f32;
7 | fn "pow" hls_powf(f32, f32) -> f32;
8 | fn "rsqrt" hls_rsqrtf(f32) -> f32;
9 | fn "sqrt" hls_sqrtf(f32) -> f32;
10 | fn "fabs" hls_fabsf(f32) -> f32;
11 | fn "sin" hls_sinf(f32) -> f32;
12 | fn "cos" hls_cosf(f32) -> f32;
13 | fn "tan" hls_tanf(f32) -> f32;
14 | fn "asin" hls_asinf(f32) -> f32;
15 | fn "acos" hls_acosf(f32) -> f32;
16 | fn "atan" hls_atanf(f32) -> f32;
17 | fn "erf" hls_erff(f32) -> f32;
18 | fn "atan2" hls_atan2f(f32, f32) -> f32;
19 | fn "fmod" hls_fmodf(f32, f32) -> f32;
20 | fn "floor" hls_floorf(f32) -> f32;
21 | fn "isinf" hls_isinff(f32) -> i32;
22 | fn "isnan" hls_isnanf(f32) -> i32;
23 | fn "isfinite" hls_isfinitef(f32) -> i32;
24 | fn "fma" hls_fmaf(f32, f32, f32) -> f32;
25 | fn "mad" hls_madf(f32, f32, f32) -> f32;
26 | fn "copysign" hls_copysignf(f32, f32) -> f32;
27 | fn "exp" hls_exp(f64) -> f64;
28 | fn "exp2" hls_exp2(f64) -> f64;
29 | fn "log" hls_log(f64) -> f64;
30 | fn "log2" hls_log2(f64) -> f64;
31 | fn "pow" hls_pow(f64, f64) -> f64;
32 | fn "rsqrt" hls_rsqrt(f64) -> f64;
33 | fn "sqrt" hls_sqrt(f64) -> f64;
34 | fn "fabs" hls_fabs(f64) -> f64;
35 | fn "sin" hls_sin(f64) -> f64;
36 | fn "cos" hls_cos(f64) -> f64;
37 | fn "tan" hls_tan(f64) -> f64;
38 | fn "asin" hls_asin(f64) -> f64;
39 | fn "acos" hls_acos(f64) -> f64;
40 | fn "atan" hls_atan(f64) -> f64;
41 | fn "erf" hls_erf(f64) -> f64;
42 | fn "atan2" hls_atan2(f64, f64) -> f64;
43 | fn "fmod" hls_fmod(f64, f64) -> f64;
44 | fn "floor" hls_floor(f64) -> f64;
45 | fn "isinf" hls_isinf(f64) -> i32;
46 | fn "isnan" hls_isnan(f64) -> i32;
47 | fn "isfinite" hls_isfinite(f64) -> i32;
48 | fn "fma" hls_fma(f64, f64, f64) -> f64;
49 | fn "mad" hls_mad(f64, f64, f64) -> f64;
50 | fn "copysign" hls_copysign(f64, f64) -> f64;
51 | fn "fmin" hls_fminf(f32, f32) -> f32;
52 | fn "fmax" hls_fmaxf(f32, f32) -> f32;
53 | fn "fmin" hls_fmin(f64, f64) -> f64;
54 | fn "fmax" hls_fmax(f64, f64) -> f64;
55 | fn "min" hls_min(i32, i32) -> i32;
56 | fn "max" hls_max(i32, i32) -> i32;
57 | }
58 |
59 | // channel scalar types
60 | struct channel_u8 { data : u8 }; // wrapper structs give each channel payload a distinct nominal type (used to select the matching read/write import below)
61 | struct channel_i32 { data : i32 };
62 | struct channel_f32 { data : f32 };
63 |
64 | // channel array types
65 | struct channel1_u8 { data : [u8 * 1 ] };
66 | struct channel2_u8 { data : [u8 * 2 ] };
67 | struct channel4_u8 { data : [u8 * 4 ] };
68 | struct channel8_u8 { data : [u8 * 8 ] };
69 | struct channel16_u8 { data : [u8 * 16 ] };
70 | struct channel32_u8 { data : [u8 * 32 ] };
71 | struct channel64_u8 { data : [u8 * 64 ] };
72 | struct channel128_u8 { data : [u8 * 128] };
73 |
74 | struct channel1_i32 { data : [i32 * 1 ] };
75 | struct channel2_i32 { data : [i32 * 2 ] };
76 | struct channel4_i32 { data : [i32 * 4 ] };
77 | struct channel8_i32 { data : [i32 * 8 ] };
78 | struct channel16_i32 { data : [i32 * 16 ] };
79 | struct channel32_i32 { data : [i32 * 32 ] };
80 | struct channel64_i32 { data : [i32 * 64 ] };
81 | struct channel128_i32 { data : [i32 * 128] };
82 |
83 | struct channel1_f32 { data : [f32 * 1 ] };
84 | struct channel2_f32 { data : [f32 * 2 ] };
85 | struct channel4_f32 { data : [f32 * 4 ] };
86 | struct channel8_f32 { data : [f32 * 8 ] };
87 | struct channel16_f32 { data : [f32 * 16 ] };
88 | struct channel32_f32 { data : [f32 * 32 ] };
89 | struct channel64_f32 { data : [f32 * 64 ] };
90 | struct channel128_f32 { data : [f32 * 128] };
91 |
92 | extern "device" {
93 | fn print_pragma(&[u8]) -> ();
94 | // u8 scalar
95 | fn "read_channel" read_channel_u8 ( &mut channel_u8 ) -> u8 ;
96 | fn "write_channel" write_channel_u8 ( &mut channel_u8, u8) -> ();
97 |
98 | // u8 array
99 | fn "read_channel" read_channel1_u8 ( &mut channel1_u8 ) -> [u8 * 1 ];
100 | fn "read_channel" read_channel2_u8 ( &mut channel2_u8 ) -> [u8 * 2 ];
101 | fn "read_channel" read_channel4_u8 ( &mut channel4_u8 ) -> [u8 * 4 ];
102 | fn "read_channel" read_channel8_u8 ( &mut channel8_u8 ) -> [u8 * 8 ];
103 | fn "read_channel" read_channel16_u8 ( &mut channel16_u8 ) -> [u8 * 16 ];
104 | fn "read_channel" read_channel32_u8 ( &mut channel32_u8 ) -> [u8 * 32 ];
105 | fn "read_channel" read_channel64_u8 ( &mut channel64_u8 ) -> [u8 * 64 ];
106 | fn "read_channel" read_channel128_u8( &mut channel128_u8) -> [u8 * 128];
107 |
108 | fn "write_channel" write_channel1_u8 ( &mut channel1_u8, [u8 * 1 ] ) -> ();
109 | fn "write_channel" write_channel2_u8 ( &mut channel2_u8, [u8 * 2 ] ) -> ();
110 | fn "write_channel" write_channel4_u8 ( &mut channel4_u8, [u8 * 4 ] ) -> ();
111 | fn "write_channel" write_channel8_u8 ( &mut channel8_u8, [u8 * 8 ] ) -> ();
112 | fn "write_channel" write_channel16_u8 ( &mut channel16_u8, [u8 * 16 ] ) -> ();
113 | fn "write_channel" write_channel32_u8 ( &mut channel32_u8, [u8 * 32 ] ) -> ();
114 | fn "write_channel" write_channel64_u8 ( &mut channel64_u8, [u8 * 64 ] ) -> ();
115 | fn "write_channel" write_channel128_u8( &mut channel128_u8, [u8 * 128] ) -> ();
116 | fn " " bitcast_channel_u8 ( &mut channel1_u8) -> [u8 * 2 ];
117 |
118 | // i32 scalar
119 | fn "read_channel" read_channel_i32 ( &mut channel_i32 ) -> i32;
120 | fn "write_channel" write_channel_i32 ( &mut channel_i32, i32 ) -> ();
121 |
122 | // i32 array
123 | fn "read_channel" read_channel1_i32 ( &mut channel1_i32 ) -> [i32 * 1 ];
124 | fn "read_channel" read_channel2_i32 ( &mut channel2_i32 ) -> [i32 * 2 ];
125 | fn "read_channel" read_channel4_i32 ( &mut channel4_i32 ) -> [i32 * 4 ];
126 | fn "read_channel" read_channel8_i32 ( &mut channel8_i32 ) -> [i32 * 8 ];
127 | fn "read_channel" read_channel16_i32 ( &mut channel16_i32 ) -> [i32 * 16 ];
128 | fn "read_channel" read_channel32_i32 ( &mut channel32_i32 ) -> [i32 * 32 ];
129 | fn "read_channel" read_channel64_i32 ( &mut channel64_i32 ) -> [i32 * 64 ];
130 | fn "read_channel" read_channel128_i32( &mut channel128_i32) -> [i32 * 128];
131 |
132 | fn "write_channel" write_channel1_i32 ( &mut channel1_i32, [i32 * 1 ] )-> ();
133 | fn "write_channel" write_channel2_i32 ( &mut channel2_i32, [i32 * 2 ] ) -> ();
134 | fn "write_channel" write_channel4_i32 ( &mut channel4_i32, [i32 * 4 ] ) -> ();
135 | fn "write_channel" write_channel8_i32 ( &mut channel8_i32, [i32 * 8 ] ) -> ();
136 | fn "write_channel" write_channel16_i32 ( &mut channel16_i32, [i32 * 16 ] ) -> ();
137 | fn "write_channel" write_channel32_i32 ( &mut channel32_i32, [i32 * 32 ] ) -> ();
138 | fn "write_channel" write_channel64_i32 ( &mut channel64_i32, [i32 * 64 ] ) -> ();
139 | fn "write_channel" write_channel128_i32( &mut channel128_i32, [i32 * 128]) -> ();
140 | fn " " bitcast_channel_i32 ( &mut channel1_i32) -> [i32 * 2 ];
141 |
142 | // f32 scalar
143 | fn "read_channel" read_channel_f32 ( &mut channel_f32 ) -> f32;
144 | fn "write_channel" write_channel_f32 ( &mut channel_f32, f32 ) -> ();
145 |
146 | // f32 array
147 | fn "read_channel" read_channel1_f32 ( &mut channel1_f32 ) -> [f32 * 1 ];
148 | fn "read_channel" read_channel2_f32 ( &mut channel2_f32 ) -> [f32 * 2 ];
149 | fn "read_channel" read_channel4_f32 ( &mut channel4_f32 ) -> [f32 * 4 ];
150 | fn "read_channel" read_channel8_f32 ( &mut channel8_f32 ) -> [f32 * 8 ];
151 | fn "read_channel" read_channel16_f32 ( &mut channel16_f32 ) -> [f32 * 16 ];
152 | fn "read_channel" read_channel32_f32 ( &mut channel32_f32 ) -> [f32 * 32 ];
153 | fn "read_channel" read_channel64_f32 ( &mut channel64_f32 ) -> [f32 * 64 ];
154 | fn "read_channel" read_channel128_f32( &mut channel128_f32) -> [f32 * 128];
155 |
156 | fn "write_channel" write_channel1_f32 ( &mut channel1_f32, [f32 * 1 ]) -> ();
157 | fn "write_channel" write_channel2_f32 ( &mut channel2_f32, [f32 * 2 ]) -> ();
158 | fn "write_channel" write_channel4_f32 ( &mut channel4_f32, [f32 * 4 ]) -> ();
159 | fn "write_channel" write_channel8_f32 ( &mut channel8_f32, [f32 * 8 ]) -> ();
160 | fn "write_channel" write_channel16_f32 ( &mut channel16_f32, [f32 * 16 ]) -> ();
161 | fn "write_channel" write_channel32_f32 ( &mut channel32_f32, [f32 * 32 ]) -> ();
162 | fn "write_channel" write_channel64_f32 ( &mut channel64_f32, [f32 * 64 ]) -> ();
163 | fn "write_channel" write_channel128_f32( &mut channel128_f32, [f32 * 128]) -> ();
164 | fn " " bitcast_channel_f32 ( &mut channel1_f32) -> [f32 * 2 ];
165 | }
166 |
167 | fn @hls_accelerator(dev: i32) -> Accelerator { // Accelerator facade for HLS: the kernel runs as a single work item
168 |     Accelerator {
169 |         exec : @|grid, block, body| { // grid/block are ignored -- every index below is the constant 0 or 1
170 |             let work_item = WorkItem {
171 |                 tidx : @|| 0, tidy : @|| 0, tidz : @|| 0,
172 |                 bidx : @|| 0, bidy : @|| 0, bidz : @|| 0,
173 |                 gidx : @|| 0, gidy : @|| 0, gidz : @|| 0,
174 |                 bdimx : @|| 1, bdimy : @|| 1, bdimz : @|| 1,
175 |                 gdimx : @|| 1, gdimy : @|| 1, gdimz : @|| 1,
176 |                 nblkx : @|| 1, nblky : @|| 1, nblkz : @|| 1
177 |             };
178 |             hls(dev, || @@body(work_item)); // launch through the thorin `hls` intrinsic; @@ presumably forces inlining of the kernel body -- confirm
179 |         },
180 |         sync : @|| synchronize_hls(dev),
181 |         alloc : @|size| alloc_hls(dev, size),
182 |         alloc_unified : @|size| alloc_hls_unified(dev, size),
183 |         barrier : @|| () // no-op: a single work item has nothing to synchronize with
184 |     }
185 | };
186 |
187 | static hls_intrinsics = Intrinsics { // HLS backend instance of the Intrinsics table
188 |     expf      : hls_expf,
189 |     exp2f     : hls_exp2f,
190 |     logf      : hls_logf,
191 |     log2f     : hls_log2f,
192 |     powf      : hls_powf,
193 |     rsqrtf    : hls_rsqrtf,
194 |     sqrtf     : hls_sqrtf,
195 |     fabsf     : hls_fabsf,
196 |     sinf      : hls_sinf,
197 |     cosf      : hls_cosf,
198 |     tanf      : hls_tanf,
199 |     asinf     : hls_asinf,
200 |     acosf     : hls_acosf,
201 |     atanf     : hls_atanf,
202 |     erff      : hls_erff,
203 |     atan2f    : hls_atan2f,
204 |     copysignf : hls_copysignf,
205 |     fmaf      : hls_fmaf,
206 |     fmaxf     : hls_fmaxf,
207 |     fminf     : hls_fminf,
208 |     fmodf     : hls_fmodf,
209 |     floorf    : hls_floorf,
210 |     isinff    : hls_isinff,
211 |     isnanf    : hls_isnanf,
212 |     isfinitef : hls_isfinitef,
213 |     exp       : hls_exp,
214 |     exp2      : hls_exp2,
215 |     log       : hls_log,
216 |     log2      : hls_log2,
217 |     pow       : hls_pow,
218 |     rsqrt     : hls_rsqrt,
219 |     sqrt      : hls_sqrt,
220 |     fabs      : hls_fabs,
221 |     sin       : hls_sin,
222 |     cos       : hls_cos,
223 |     tan       : hls_tan,
224 |     asin      : hls_asin,
225 |     acos      : hls_acos,
226 |     atan      : hls_atan,
227 |     erf       : hls_erf,
228 |     atan2     : hls_atan2,
229 |     copysign  : hls_copysign,
230 |     fma       : hls_fma,
231 |     fmax      : hls_fmax,
232 |     fmin      : hls_fmin,
233 |     fmod      : hls_fmod,
234 |     floor     : hls_floor,
235 |     isinf     : hls_isinf,
236 |     isnan     : hls_isnan,
237 |     isfinite  : hls_isfinite,
238 |     min       : hls_min,
239 |     max       : hls_max,
240 | };
241 |
--------------------------------------------------------------------------------
/platforms/impala/intrinsics_opencl.impala:
--------------------------------------------------------------------------------
1 | extern "device" {
2 | // no declarations are emitted for "device" functions
3 | fn "barrier" opencl_barrier(u32) -> ();
4 | fn "exp" opencl_expf(f32) -> f32;
5 | fn "exp2" opencl_exp2f(f32) -> f32;
6 | fn "log" opencl_logf(f32) -> f32;
7 | fn "log2" opencl_log2f(f32) -> f32;
8 | fn "pow" opencl_powf(f32, f32) -> f32;
9 | fn "rsqrt" opencl_rsqrtf(f32) -> f32;
10 | fn "sqrt" opencl_sqrtf(f32) -> f32;
11 | fn "fabs" opencl_fabsf(f32) -> f32;
12 | fn "sin" opencl_sinf(f32) -> f32;
13 | fn "cos" opencl_cosf(f32) -> f32;
14 | fn "tan" opencl_tanf(f32) -> f32;
15 | fn "asin" opencl_asinf(f32) -> f32;
16 | fn "acos" opencl_acosf(f32) -> f32;
17 | fn "atan" opencl_atanf(f32) -> f32;
18 | fn "erf" opencl_erff(f32) -> f32;
19 | fn "atan2" opencl_atan2f(f32, f32) -> f32;
20 | fn "fmod" opencl_fmodf(f32, f32) -> f32;
21 | fn "floor" opencl_floorf(f32) -> f32;
22 | fn "isinf" opencl_isinff(f32) -> i32;
23 | fn "isnan" opencl_isnanf(f32) -> i32;
24 | fn "isfinite" opencl_isfinitef(f32) -> i32;
25 | fn "fma" opencl_fmaf(f32, f32, f32) -> f32;
26 | fn "mad" opencl_madf(f32, f32, f32) -> f32;
27 | fn "copysign" opencl_copysignf(f32, f32) -> f32;
28 | fn "exp" opencl_exp(f64) -> f64;
29 | fn "exp2" opencl_exp2(f64) -> f64;
30 | fn "log" opencl_log(f64) -> f64;
31 | fn "log2" opencl_log2(f64) -> f64;
32 | fn "pow" opencl_pow(f64, f64) -> f64;
33 | fn "rsqrt" opencl_rsqrt(f64) -> f64;
34 | fn "sqrt" opencl_sqrt(f64) -> f64;
35 | fn "fabs" opencl_fabs(f64) -> f64;
36 | fn "sin" opencl_sin(f64) -> f64;
37 | fn "cos" opencl_cos(f64) -> f64;
38 | fn "tan" opencl_tan(f64) -> f64;
39 | fn "asin" opencl_asin(f64) -> f64;
40 | fn "acos" opencl_acos(f64) -> f64;
41 | fn "atan" opencl_atan(f64) -> f64;
42 | fn "erf" opencl_erf(f64) -> f64;
43 | fn "atan2" opencl_atan2(f64, f64) -> f64;
44 | fn "fmod" opencl_fmod(f64, f64) -> f64;
45 | fn "floor" opencl_floor(f64) -> f64;
46 | fn "isinf" opencl_isinf(f64) -> i32;
47 | fn "isnan" opencl_isnan(f64) -> i32;
48 | fn "isfinite" opencl_isfinite(f64) -> i32;
49 | fn "fma" opencl_fma(f64, f64, f64) -> f64;
50 | fn "mad" opencl_mad(f64, f64, f64) -> f64;
51 | fn "copysign" opencl_copysign(f64, f64) -> f64;
52 | fn "fmin" opencl_fminf(f32, f32) -> f32;
53 | fn "fmax" opencl_fmaxf(f32, f32) -> f32;
54 | fn "fmin" opencl_fmin(f64, f64) -> f64;
55 | fn "fmax" opencl_fmax(f64, f64) -> f64;
56 | fn "min" opencl_min(i32, i32) -> i32;
57 | fn "max" opencl_max(i32, i32) -> i32;
58 | fn "atomic_add" opencl_atomic_add_global(&mut[1]i32, i32) -> i32;
59 | fn "atomic_add" opencl_atomic_add_shared(&mut[3]i32, i32) -> i32;
60 | fn "atomic_min" opencl_atomic_min_global(&mut[1]i32, i32) -> i32;
61 | fn "atomic_min" opencl_atomic_min_shared(&mut[3]i32, i32) -> i32;
62 | fn "get_work_dim" opencl_get_work_dim() -> u32;
63 | fn "get_global_size" opencl_get_global_size(u32) -> u64;
64 | fn "get_global_id" opencl_get_global_id(u32) -> u64;
65 | fn "get_local_size" opencl_get_local_size(u32) -> u64;
66 | fn "get_local_id" opencl_get_local_id(u32) -> u64;
67 | fn "get_num_groups" opencl_get_num_groups(u32) -> u64;
68 | fn "get_group_id" opencl_get_group_id(u32) -> u64;
69 | fn "get_global_offset" opencl_get_global_offset(u32) -> u64;
70 | }
71 |
72 | fn @opencl_accelerator(dev: i32) -> Accelerator { // Accelerator facade backed by the OpenCL runtime
73 |     Accelerator {
74 |         exec : @|grid, block, body| {
75 |             let work_item = WorkItem { // wire WorkItem accessors to the OpenCL work-item functions (dimension index as u32, results narrowed to i32)
76 |                 tidx : @|| opencl_get_local_id(0u32) as i32,
77 |                 tidy : @|| opencl_get_local_id(1u32) as i32,
78 |                 tidz : @|| opencl_get_local_id(2u32) as i32,
79 |                 bidx : @|| opencl_get_group_id(0u32) as i32,
80 |                 bidy : @|| opencl_get_group_id(1u32) as i32,
81 |                 bidz : @|| opencl_get_group_id(2u32) as i32,
82 |                 gidx : @|| opencl_get_global_id(0u32) as i32,
83 |                 gidy : @|| opencl_get_global_id(1u32) as i32,
84 |                 gidz : @|| opencl_get_global_id(2u32) as i32,
85 |                 bdimx : @|| opencl_get_local_size(0u32) as i32,
86 |                 bdimy : @|| opencl_get_local_size(1u32) as i32,
87 |                 bdimz : @|| opencl_get_local_size(2u32) as i32,
88 |                 gdimx : @|| opencl_get_global_size(0u32) as i32,
89 |                 gdimy : @|| opencl_get_global_size(1u32) as i32,
90 |                 gdimz : @|| opencl_get_global_size(2u32) as i32,
91 |                 nblkx : @|| opencl_get_num_groups(0u32) as i32,
92 |                 nblky : @|| opencl_get_num_groups(1u32) as i32,
93 |                 nblkz : @|| opencl_get_num_groups(2u32) as i32
94 |             };
95 |             opencl(dev, grid, block, || @@body(work_item)) // launch through the thorin `opencl` intrinsic
96 |         },
97 |         sync : @|| synchronize_opencl(dev),
98 |         alloc : @|size| alloc_opencl(dev, size),
99 |         alloc_unified : @|size| alloc_opencl_unified(dev, size),
100 |         barrier : @|| opencl_barrier(1u32), // CLK_LOCAL_MEM_FENCE -> 1 // CLK_GLOBAL_MEM_FENCE -> 2
101 |     }
102 | }
103 |
104 | static opencl_intrinsics = Intrinsics { // OpenCL backend instance of the Intrinsics table
105 |     expf      : opencl_expf,
106 |     exp2f     : opencl_exp2f,
107 |     logf      : opencl_logf,
108 |     log2f     : opencl_log2f,
109 |     powf      : opencl_powf,
110 |     rsqrtf    : opencl_rsqrtf,
111 |     sqrtf     : opencl_sqrtf,
112 |     fabsf     : opencl_fabsf,
113 |     sinf      : opencl_sinf,
114 |     cosf      : opencl_cosf,
115 |     tanf      : opencl_tanf,
116 |     asinf     : opencl_asinf,
117 |     acosf     : opencl_acosf,
118 |     atanf     : opencl_atanf,
119 |     erff      : opencl_erff,
120 |     atan2f    : opencl_atan2f,
121 |     copysignf : opencl_copysignf,
122 |     fmaf      : opencl_fmaf,
123 |     fmaxf     : opencl_fmaxf,
124 |     fminf     : opencl_fminf,
125 |     fmodf     : opencl_fmodf,
126 |     floorf    : opencl_floorf,
127 |     isinff    : opencl_isinff,
128 |     isnanf    : opencl_isnanf,
129 |     isfinitef : opencl_isfinitef,
130 |     exp       : opencl_exp,
131 |     exp2      : opencl_exp2,
132 |     log       : opencl_log,
133 |     log2      : opencl_log2,
134 |     pow       : opencl_pow,
135 |     rsqrt     : opencl_rsqrt,
136 |     sqrt      : opencl_sqrt,
137 |     fabs      : opencl_fabs,
138 |     sin       : opencl_sin,
139 |     cos       : opencl_cos,
140 |     tan       : opencl_tan,
141 |     asin      : opencl_asin,
142 |     acos      : opencl_acos,
143 |     atan      : opencl_atan,
144 |     erf       : opencl_erf,
145 |     atan2     : opencl_atan2,
146 |     copysign  : opencl_copysign,
147 |     fma       : opencl_fma,
148 |     fmax      : opencl_fmax,
149 |     fmin      : opencl_fmin,
150 |     fmod      : opencl_fmod,
151 |     floor     : opencl_floor,
152 |     isinf     : opencl_isinf,
153 |     isnan     : opencl_isnan,
154 |     isfinite  : opencl_isfinite,
155 |     min       : opencl_min,
156 |     max       : opencl_max,
157 | };
158 |
--------------------------------------------------------------------------------
/platforms/impala/intrinsics_rv.impala:
--------------------------------------------------------------------------------
1 | extern "C" { // RV (Region Vectorizer) intrinsics; resolved by the vectorizer — NOTE(review): exact lane semantics defined by RV, not visible here
2 |     fn rv_mask() -> bool;
3 |     fn rv_any(bool) -> bool;
4 |     fn rv_all(bool) -> bool;
5 |     fn rv_ballot(bool) -> i32;
6 |     fn rv_extract(f32, i32) -> f32; // read a value from a given lane
7 |     fn rv_insert(f32, i32, f32) -> f32; // write a value into a given lane
8 |     fn rv_load(&f32, i32) -> f32;
9 |     fn rv_store(&mut f32, i32, f32) -> ();
10 |     fn rv_shuffle(f32, i32) -> f32;
11 |     fn rv_align(&i8, i32)-> &i8;
12 |     fn rv_compact(f32, bool) -> f32;
13 |     fn rv_lane_id() -> i32;
14 |     fn rv_num_lanes() -> i32;
15 | }
16 |
--------------------------------------------------------------------------------
/platforms/impala/intrinsics_thorin.impala:
--------------------------------------------------------------------------------
1 | extern "thorin" { // compiler-known intrinsics lowered directly by the Thorin/AnyDSL compiler
2 |     fn pe_info[T](&[u8], T) -> (); // partial-evaluation diagnostics: print a labeled value at compile time
3 |
4 |     fn alignof[T]() -> i64;
5 |     fn sizeof[T]() -> i64;
6 |     fn undef[T]() -> T;
7 |
8 |     fn bitcast[D, S](S) -> D;
9 |     fn select[T, U](T, U, U) -> U;
10 |     fn insert[T, U](T, i32, U) -> T;
11 |     //fn shuffle[T](T, T, T) -> T;
12 |
13 |     // GPU launch intrinsics: (device, grid dims, block dims, kernel body)
14 |     fn cuda(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
15 |     fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
16 |     fn opencl(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
17 |     fn amdgpu_hsa(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
18 |     fn amdgpu_pal(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
19 |     fn reserve_shared[T](i32) -> &mut[3][T]; // allocate in GPU shared/local memory (address space 3)
20 |
21 |     fn hls(dev: i32, body: fn() -> ()) -> ();
22 |     fn pipeline(i32, i32, i32, fn(i32) -> ()) -> (); // only for HLS/OpenCL backend
23 |     fn parallel(num_threads: i32, lower: i32, upper: i32, body: fn(i32) -> ()) -> ();
24 |     fn spawn(body: fn() -> ()) -> i32;
25 |     fn sync(id: i32) -> ();
26 |
27 |     fn atomic[T](binop: u32, addr: &mut T, val: T, order: u32, scope: &[u8]) -> T; // Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
28 |     fn atomic_load[T](addr: &T, order: u32, scope: &[u8]) -> T;
29 |     fn atomic_store[T](addr: &mut T, val: T, order: u32, scope: &[u8]) -> ();
30 |     fn cmpxchg[T](addr: &mut T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); // only for integer data types
31 |     fn cmpxchg_weak[T](addr: &mut T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool); // only for integer data types
32 |     fn fence(order: u32, scope: &[u8]) -> ();
33 |
34 |     // address-space-qualified aliases of the atomics above (1 = global, 3 = shared)
35 |     fn "atomic" atomic_p1[T](binop: u32, addr: &mut [1]T, val: T, order: u32, scope: &[u8]) -> T;
36 |     fn "atomic" atomic_p3[T](binop: u32, addr: &mut [3]T, val: T, order: u32, scope: &[u8]) -> T;
37 |     fn "atomic_load" atomic_load_p1[T](addr: &[1]T, order: u32, scope: &[u8]) -> T;
38 |     fn "atomic_load" atomic_load_p3[T](addr: &[3]T, order: u32, scope: &[u8]) -> T;
39 |     fn "atomic_store" atomic_store_p1[T](addr: &mut [1]T, val: T, order: u32, scope: &[u8]) -> ();
40 |     fn "atomic_store" atomic_store_p3[T](addr: &mut [3]T, val: T, order: u32, scope: &[u8]) -> ();
41 |     fn "cmpxchg" cmpxchg_p1[T](addr: &mut [1]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool);
42 |     fn "cmpxchg" cmpxchg_p3[T](addr: &mut [3]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool);
43 |     fn "cmpxchg_weak" cmpxchg_weak_p1[T](addr: &mut [1]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool);
44 |     fn "cmpxchg_weak" cmpxchg_weak_p3[T](addr: &mut [3]T, cmp: T, new: T, success_order: u32, failure_order: u32, scope: &[u8]) -> (T, bool);
45 |
46 |     fn vectorize(vector_length: i32, body: fn(i32) -> ()) -> ();
47 | }
46 |
--------------------------------------------------------------------------------
/platforms/impala/runtime.impala:
--------------------------------------------------------------------------------
1 | extern "C" { // AnyDSL runtime C API; quoted names are the exported symbols (see src/anydsl_runtime.h)
2 |     fn "anydsl_info" runtime_info() -> ();
3 |     fn "anydsl_device_name" runtime_device_name(_device: i32) -> &[u8];
4 |     fn "anydsl_device_check_feature_support" runtime_device_check_feature_support(_device: i32, _feature: &[u8]) -> bool;
5 |
6 |     // allocation / copy: first i32 is the packed platform|device mask
7 |     fn "anydsl_alloc" runtime_alloc(i32, i64) -> &[i8];
8 |     fn "anydsl_alloc_host" runtime_alloc_host(i32, i64) -> &[i8];
9 |     fn "anydsl_alloc_unified" runtime_alloc_unified(i32, i64) -> &[i8];
10 |     fn "anydsl_copy" runtime_copy(i32, &[i8], i64, i32, &[i8], i64, i64) -> ();
11 |     fn "anydsl_get_device_ptr" runtime_get_device_ptr(i32, &[i8]) -> &[i8];
12 |     fn "anydsl_release" runtime_release(i32, &[i8]) -> ();
13 |     fn "anydsl_release_host" runtime_release_host(i32, &[i8]) -> ();
14 |     fn "anydsl_synchronize" runtime_synchronize(i32) -> ();
15 |
16 |     fn "anydsl_random_seed" random_seed(u32) -> ();
17 |     fn "anydsl_random_val_f32" random_val_f32() -> f32;
18 |     fn "anydsl_random_val_u64" random_val_u64() -> u64;
19 |
20 |     fn "anydsl_get_micro_time" get_micro_time() -> i64;
21 |     fn "anydsl_get_nano_time" get_nano_time() -> i64;
22 |     fn "anydsl_get_kernel_time" get_kernel_time() -> i64;
23 |
24 |     fn "anydsl_print_i16" print_i16(i16) -> ();
25 |     fn "anydsl_print_i32" print_i32(i32) -> ();
26 |     fn "anydsl_print_i64" print_i64(i64) -> ();
27 |     fn "anydsl_print_u16" print_u16(u16) -> ();
28 |     fn "anydsl_print_u32" print_u32(u32) -> ();
29 |     fn "anydsl_print_u64" print_u64(u64) -> ();
30 |     fn "anydsl_print_f32" print_f32(f32) -> ();
31 |     fn "anydsl_print_f64" print_f64(f64) -> ();
32 |     fn "anydsl_print_char" print_char(u8) -> ();
33 |     fn "anydsl_print_string" print_string(&[u8]) -> ();
34 |     fn "anydsl_print_flush" print_flush() -> ();
35 | }
35 |
36 | struct Buffer { // a runtime-managed allocation
37 |     data : &[i8], // pointer to the allocation
38 |     size : i64, // size in bytes
39 |     device : i32 // packed platform|device mask (see runtime_device below)
40 | }
41 |
42 | // allocate `size` bytes on the given packed platform|device mask
43 | fn @alloc(dev: i32, size: i64) -> Buffer {
44 |     Buffer {
45 |         device : dev,
46 |         data : runtime_alloc(dev, size),
47 |         size : size
48 |     }
49 | }
50 | // page-locked host memory accessible by device `dev`
51 | fn @alloc_host(dev: i32, size: i64) -> Buffer {
52 |     Buffer {
53 |         device : dev,
54 |         data : runtime_alloc_host(dev, size),
55 |         size : size
56 |     }
57 | }
58 | // memory in a unified host+device address space
59 | fn @alloc_unified(dev: i32, size: i64) -> Buffer {
60 |     Buffer {
61 |         device : dev,
62 |         data : runtime_alloc_unified(dev, size),
63 |         size : size
64 |     }
65 | }
66 | fn @release(buf: Buffer) -> () { runtime_release(buf.device, buf.data) }
67 |
68 | // pack platform id (low 4 bits) and device index (remaining bits) — must match ANYDSL_DEVICE in anydsl_runtime.h
69 | fn @runtime_device(platform: i32, device: i32) -> i32 { platform | (device << 4) }
66 |
67 | // convenience wrappers; platform ids: 0=host, 1=CUDA, 2=OpenCL, 3=HSA, 4=PAL
68 | fn @alloc_cpu(size: i64) -> Buffer { alloc(0, size) }
69 | fn @alloc_cuda(dev: i32, size: i64) -> Buffer { alloc(runtime_device(1, dev), size) }
70 | fn @alloc_cuda_host(dev: i32, size: i64) -> Buffer { alloc_host(runtime_device(1, dev), size) }
71 | fn @alloc_cuda_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(1, dev), size) }
72 | fn @synchronize_cuda(dev: i32) -> () { runtime_synchronize(runtime_device(1, dev)) }
73 | fn @alloc_opencl(dev: i32, size: i64) -> Buffer { alloc(runtime_device(2, dev), size) }
74 | fn @alloc_opencl_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(2, dev), size) }
75 | fn @synchronize_opencl(dev: i32) -> () { runtime_synchronize(runtime_device(2, dev)) }
76 | // NOTE(review): HLS deliberately reuses platform id 2 (same as OpenCL) — presumably served by the OpenCL platform; confirm
77 | fn @alloc_hls(dev: i32, size: i64) -> Buffer { alloc(runtime_device(2, dev), size) }
78 | fn @alloc_hls_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(2, dev), size) }
79 | fn @synchronize_hls(dev: i32) -> () { runtime_synchronize(runtime_device(2, dev)) }
80 | fn @alloc_hsa(dev: i32, size: i64) -> Buffer { alloc(runtime_device(3, dev), size) }
81 | fn @alloc_hsa_host(dev: i32, size: i64) -> Buffer { alloc_host(runtime_device(3, dev), size) }
82 | fn @alloc_hsa_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(3, dev), size) }
83 | fn @synchronize_hsa(dev: i32) -> () { runtime_synchronize(runtime_device(3, dev)) }
84 | fn @alloc_pal(dev: i32, size: i64) -> Buffer { alloc(runtime_device(4, dev), size) }
85 | fn @alloc_pal_host(dev: i32, size: i64) -> Buffer { alloc_host(runtime_device(4, dev), size) }
86 | fn @alloc_pal_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(4, dev), size) }
87 | fn @synchronize_pal(dev: i32) -> () { runtime_synchronize(runtime_device(4, dev)) }
86 |
87 | // copy the full extent of `src` into `dst` (dst must be at least src.size bytes)
88 | fn @copy(src: Buffer, dst: Buffer) -> () {
89 |     runtime_copy(src.device, src.data, 0i64, dst.device, dst.data, 0i64, src.size)
90 | }
91 |
92 | // copy `size` bytes from src+off_src to dst+off_dst (byte offsets)
93 | fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) -> () {
94 |     runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size)
95 | }
96 |
97 |
98 | // range, range_step, unroll, unroll_step, etc.
99 | // the @(?lower & ?upper & ?step) filter asks the partial evaluator to fully unroll when all bounds are known
100 | fn @(?lower & ?upper & ?step) unroll_step(lower: i32, upper: i32, @step: i32, body: fn(i32) -> ()) -> () {
101 |     if lower < upper {
102 |         @@body(lower);
103 |         unroll_step(lower+step, upper, step, body)
104 |     }
105 | }
106 |
107 | fn @(?upper & ?lower & ?step) unroll_step_rev(upper: i32, lower: i32, @step: i32, body: fn(i32) -> ()) -> () {
108 |     if upper > lower {
109 |         @@body(upper);
110 |         unroll_step_rev(upper-step, lower, step, body)
111 |     }
112 | }
113 |
114 | // range*: `$` keeps the bounds dynamic, so these stay ordinary loops (no unrolling)
115 | fn @range(lower: i32, upper: i32, body: fn(i32) -> ()) -> () { unroll_step($lower, $upper, 1, body) }
116 | fn @range_step(lower: i32, upper: i32, step: i32, body: fn(i32) -> ()) -> () { unroll_step($lower, $upper, step, body) }
117 | fn @range_rev(upper: i32, lower: i32, body: fn(i32) -> ()) -> () { unroll_step_rev($upper, $lower, 1, body) }
118 |
119 | fn @unroll(lower: i32, upper: i32, body: fn(i32) -> ()) -> () { unroll_step(lower, upper, 1, body) }
120 | fn @unroll_rev(upper: i32, lower: i32, body: fn(i32) -> ()) -> () { unroll_step_rev(upper, lower, 1, body) }
117 |
--------------------------------------------------------------------------------
/post-patcher.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys, re, os
3 | basename = sys.argv[1]
4 | def patch_llvmir(rttype):
5 |     # Rewrite `declare`d magic_*_id functions in the generated LLVM IR file
6 |     # (basename.<rttype>) into identity-function definitions, in place.
7 |     result = []
8 |     filename = basename+"."+rttype
9 |     if os.path.isfile(filename):
10 |         with open(filename) as f:
11 |             for line in f:
12 |                 if rttype=="amdgpu" or rttype=="nvvm" or rttype=="ll":
13 |                     # patch to opaque identity functions
14 |                     m = re.match(r'^declare (.*) @(magic_.*_id)\((.*)\) (?:local_)?unnamed_addr(?: #[0-9]+)?\n$', line)
15 |                     if m is not None:
16 |                         ty1, fname, ty2 = m.groups()
17 |                         assert ty1 == ty2, "Argument and return types of magic IDs must match"
18 |                         print("Patching magic ID {0} in {1}".format(fname, filename))
19 |                         # emit definition instead
20 |                         result.append('define {0} @{1}({0} %name) {{\n'.format(ty1, fname))
21 |                         result.append(' ret {0} %name\n'.format(ty1))
22 |                         result.append('}\n')
23 |                         continue
24 |
25 |                 result.append(line)
26 |         # we have the patched thing, write it
27 |         with open(filename, "w") as f:
28 |             for line in result:
29 |                 f.write(line)
30 |     return
30 |
31 | def patch_cfiles(rttype):
32 | # we need to patch
33 | channel_line = {}
34 | channel_type = {}
35 | result = []
36 | channel_decl_name = None
37 | channel_decl_type = None
38 | channel_decl_line = 0
39 | if rttype == "cuda":
40 | filename = basename+"."+"cu"
41 | elif rttype == "opencl":
42 | filename = basename+"."+"cl"
43 | elif rttype == "hls":
44 | filename = basename+"."+"hls"
45 |
46 | if os.path.isfile(filename):
47 | with open(filename) as f:
48 | for line in f:
49 | # patch to opaque identity functions
50 | m = re.match(r'^(.*) = (magic_.*_id)\((.*)\);\n$', line)
51 | if m is not None:
52 | lhs, fname, arg = m.groups()
53 | print("Patching magic ID {0} in {1}".format(fname, filename))
54 | # emit definition instead
55 | result.append('{0} = {1};\n'.format(lhs, arg))
56 | else:
57 | result.append(line)
58 |
59 | # we have the patched thing, write it
60 | with open(filename, "w") as f:
61 | for line in result:
62 | f.write(line)
63 | return
64 |
65 | def patch_defs(rttype):
66 |     # Replace `declare`d functions listed in nvvm_defs with hand-written
67 |     # definitions in basename.nvvm. The table is currently empty, so this
68 |     # is a no-op hook kept for future NVVM-specific patches.
69 |     nvvm_defs = {
70 |     }
71 |
72 |     if rttype == "nvvm":
73 |         result = []
74 |         filename = basename+".nvvm"
75 |         if os.path.isfile(filename):
76 |             with open(filename) as f:
77 |                 for line in f:
78 |                     matched = False
79 |
80 |                     for (func, code) in iter(nvvm_defs.items()):
81 |                         m = re.match(r'^declare (.*) (@' + func + r')\((.*)\)\n$', line)
82 |                         if m is not None:
83 |                             result.append(code)
84 |                             matched = True
85 |                             break
86 |
87 |                     if not matched:
88 |                         result.append(line)
89 |
90 |             with open(filename, "w") as f:
91 |                 for line in result:
92 |                     f.write(line)
93 |     return
94 |
95 | # driver: patch every backend artifact that may exist next to `basename`
96 | patch_llvmir("ll")
97 | patch_llvmir("amdgpu")
98 | patch_llvmir("nvvm")
99 | patch_cfiles("cuda")
100 | patch_cfiles("opencl")
101 | patch_cfiles("hls")
102 | patch_defs("nvvm")
99 |
--------------------------------------------------------------------------------
/src/anydsl_jit.h:
--------------------------------------------------------------------------------
1 | #ifndef ANYDSL_JIT_H
2 | #define ANYDSL_JIT_H
3 |
4 | #include <cstdint> // int32_t / uint32_t used by the JIT entry points below (restored: include name was lost in extraction)
5 |
6 | #include "anydsl_runtime_config.h"
7 |
8 | class Runtime;
9 |
10 | // access the process-wide runtime singleton (defined in anydsl_runtime.cpp)
11 | AnyDSL_runtime_API Runtime& runtime();
12 |
13 | #ifdef AnyDSL_runtime_HAS_JIT_SUPPORT
14 | AnyDSL_runtime_jit_API void anydsl_set_cache_directory(const char*);
15 | AnyDSL_runtime_jit_API const char* anydsl_get_cache_directory();
16 | AnyDSL_runtime_jit_API void anydsl_link(const char*);
17 | AnyDSL_runtime_jit_API int32_t anydsl_compile(const char*, uint32_t, uint32_t);
18 | AnyDSL_runtime_jit_API void *anydsl_lookup_function(int32_t, const char*);
19 | AnyDSL_runtime_jit_API void anydsl_set_log_level(uint32_t /* log level (4=error only, 3=warn, 2=info, 1=verbose, 0=debug) */);
20 | #endif
21 |
22 | #endif
22 |
--------------------------------------------------------------------------------
/src/anydsl_runtime.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | #include "anydsl_runtime.h"
8 | // Make sure the definition for runtime() matches
9 | // the declaration in anydsl_jit.h
10 | #include "anydsl_jit.h"
11 |
12 | #include "runtime.h"
13 | #include "platform.h"
14 | #include "dummy_platform.h"
15 | #include "cpu_platform.h"
16 |
17 | #ifdef AnyDSL_runtime_HAS_TBB_SUPPORT
18 | #define NOMINMAX
19 | #include
20 | #include
21 | #include
22 | #include
23 | #include
24 | #else
25 | #include
26 | #endif
27 |
28 | // Owns the single Runtime instance and registers all available platforms on construction.
29 | struct RuntimeSingleton {
30 |     Runtime runtime;
31 |
32 |     RuntimeSingleton()
33 |         : runtime(detect_profile_level())
34 |     {
35 |         runtime.register_platform(); // host/CPU platform
36 |         register_cuda_platform(&runtime);
37 |         register_opencl_platform(&runtime);
38 |         register_hsa_platform(&runtime);
39 |         register_pal_platform(&runtime);
40 |         register_levelzero_platform(&runtime);
41 |     }
42 |
43 |     // Parse the ANYDSL_PROFILE environment variable (case-insensitive, whitespace-separated
44 |     // tokens): "FULL" enables full profiling, "FPGA_DYNAMIC" enables FPGA dynamic profiling.
45 |     // NOTE(review): template arguments were stripped in extraction — return type is
46 |     // presumably std::pair<ProfileLevel, ProfileLevel> given the make_pair below; confirm.
47 |     static std::pair detect_profile_level() {
48 |         auto profile = std::make_pair(ProfileLevel::None, ProfileLevel::None);
49 |         const char* env_var = std::getenv("ANYDSL_PROFILE");
50 |         if (env_var) {
51 |             std::string env_str = env_var;
52 |             for (auto& c: env_str)
53 |                 c = std::toupper(c, std::locale());
54 |             std::stringstream profile_levels(env_str);
55 |             std::string level;
56 |             while (profile_levels >> level) {
57 |                 if (level == "FULL")
58 |                     profile.first = ProfileLevel::Full;
59 |                 else if (level == "FPGA_DYNAMIC")
60 |                     profile.second = ProfileLevel::Fpga_dynamic;
61 |             }
62 |         }
63 |         return profile;
64 |     }
65 | };
61 |
62 | // Meyers singleton: platforms are registered on first use, thread-safe since C++11.
63 | Runtime& runtime() {
64 |     static RuntimeSingleton singleton;
65 |     return singleton.runtime;
66 | }
67 |
68 | // device mask layout: low 4 bits = platform id, remaining bits = device index
69 | inline PlatformId to_platform(int32_t m) {
70 |     return PlatformId(m & 0x0F);
71 | }
72 |
73 | inline DeviceId to_device(int32_t m) {
74 |     return DeviceId(m >> 4);
75 | }
74 |
75 | // C API entry points: each decodes the packed platform|device mask and forwards to the runtime.
76 | void anydsl_info(void) {
77 |     runtime().display_info();
78 | }
79 |
80 | const char* anydsl_device_name(int32_t mask) {
81 |     return runtime().device_name(to_platform(mask), to_device(mask));
82 | }
83 |
84 | bool anydsl_device_check_feature_support(int32_t mask, const char* feature) {
85 |     return runtime().device_check_feature_support(to_platform(mask), to_device(mask), feature);
86 | }
87 |
88 | void* anydsl_alloc(int32_t mask, int64_t size) {
89 |     return runtime().alloc(to_platform(mask), to_device(mask), size);
90 | }
91 |
92 | void* anydsl_alloc_host(int32_t mask, int64_t size) {
93 |     return runtime().alloc_host(to_platform(mask), to_device(mask), size);
94 | }
95 |
96 | void* anydsl_alloc_unified(int32_t mask, int64_t size) {
97 |     return runtime().alloc_unified(to_platform(mask), to_device(mask), size);
98 | }
99 |
100 | void* anydsl_get_device_ptr(int32_t mask, void* ptr) {
101 |     return runtime().get_device_ptr(to_platform(mask), to_device(mask), ptr);
102 | }
103 |
104 | void anydsl_release(int32_t mask, void* ptr) {
105 |     runtime().release(to_platform(mask), to_device(mask), ptr);
106 | }
107 |
108 | void anydsl_release_host(int32_t mask, void* ptr) {
109 |     runtime().release_host(to_platform(mask), to_device(mask), ptr);
110 | }
111 |
112 | // cross-device copy: src and dst may live on different platforms/devices
113 | void anydsl_copy(
114 |     int32_t mask_src, const void* src, int64_t offset_src,
115 |     int32_t mask_dst, void* dst, int64_t offset_dst, int64_t size) {
116 |     runtime().copy(
117 |         to_platform(mask_src), to_device(mask_src), src, offset_src,
118 |         to_platform(mask_dst), to_device(mask_dst), dst, offset_dst, size);
119 | }
118 |
119 | // Bundle the raw argument arrays into a LaunchParams and dispatch to the target platform.
120 | void anydsl_launch_kernel(
121 |     int32_t mask, const char* file_name, const char* kernel_name,
122 |     const uint32_t* grid, const uint32_t* block,
123 |     void** arg_data,
124 |     const uint32_t* arg_sizes,
125 |     const uint32_t* arg_aligns,
126 |     const uint32_t* arg_alloc_sizes,
127 |     const uint8_t* arg_types,
128 |     uint32_t num_args) {
129 |     LaunchParams launch_params = {
130 |         file_name,
131 |         kernel_name,
132 |         grid,
133 |         block,
134 |         {
135 |             arg_data,
136 |             arg_sizes,
137 |             arg_aligns,
138 |             arg_alloc_sizes,
139 |             // NOTE(review): reinterpret_cast's template argument was stripped in extraction;
140 |             // target type is whatever LaunchParams declares for its types field — confirm.
141 |             reinterpret_cast(arg_types),
142 |         },
143 |         num_args
144 |     };
145 |     runtime().launch_kernel(to_platform(mask), to_device(mask), launch_params);
146 | }
147 |
148 | void anydsl_synchronize(int32_t mask) {
149 |     runtime().synchronize(to_platform(mask), to_device(mask));
150 | }
148 |
149 | // Monotonic timestamps (steady_clock, not wall time).
150 | // NOTE(review): duration_cast's template arguments were stripped in extraction —
151 | // presumably microseconds / nanoseconds respectively; confirm.
152 | uint64_t anydsl_get_micro_time() {
153 |     using namespace std::chrono;
154 |     return duration_cast(steady_clock::now().time_since_epoch()).count();
155 | }
156 |
157 | uint64_t anydsl_get_nano_time() {
158 |     using namespace std::chrono;
159 |     return duration_cast(steady_clock::now().time_since_epoch()).count();
160 | }
161 |
162 | uint64_t anydsl_get_kernel_time() {
163 |     return runtime().kernel_time().load(); // atomic accumulator maintained by the runtime
164 | }
165 |
166 | // FP classification exported with C linkage; returns int32_t (0/non-zero) rather than bool
167 | int32_t anydsl_isinff(float x) { return std::isinf(x); }
168 | int32_t anydsl_isnanf(float x) { return std::isnan(x); }
169 | int32_t anydsl_isfinitef(float x) { return std::isfinite(x); }
170 | int32_t anydsl_isinf(double x) { return std::isinf(x); }
171 | int32_t anydsl_isnan(double x) { return std::isnan(x); }
172 | int32_t anydsl_isfinite(double x) { return std::isfinite(x); }
169 |
170 | // printing helpers backing the impala print_* intrinsics; all write to stdout unflushed
171 | void anydsl_print_i16(int16_t s) { std::cout << s; }
172 | void anydsl_print_i32(int32_t i) { std::cout << i; }
173 | void anydsl_print_i64(int64_t l) { std::cout << l; }
174 | void anydsl_print_u16(uint16_t s) { std::cout << s; }
175 | void anydsl_print_u32(uint32_t i) { std::cout << i; }
176 | void anydsl_print_u64(uint64_t l) { std::cout << l; }
177 | void anydsl_print_f32(float f) { std::cout << f; }
178 | void anydsl_print_f64(double d) { std::cout << d; }
179 | void anydsl_print_char(char c) { std::cout << c; }
180 | void anydsl_print_string(char* s) { std::cout << s; }
181 | void anydsl_print_flush() { std::cout << std::flush; }
182 |
183 | // aligned allocation delegated to the Runtime's platform-specific implementation
184 | void* anydsl_aligned_malloc(size_t size, size_t align) {
185 |     return Runtime::aligned_malloc(size, align);
186 | }
187 |
188 | void anydsl_aligned_free(void* ptr) {
189 |     return Runtime::aligned_free(ptr);
190 | }
189 |
190 | #ifndef __has_feature
191 | #define __has_feature(x) 0
192 | #endif
193 | // clang without thread_local support falls back to a shared (non-thread-safe) generator
194 | #if (defined (__clang__) && !__has_feature(cxx_thread_local))
195 | #pragma message("Runtime random function is not thread-safe")
196 | static std::mt19937 std_gen;
197 | #else
198 | static thread_local std::mt19937 std_gen;
199 | #endif
200 | // NOTE(review): distribution template arguments were stripped in extraction —
201 | // presumably <float> and <uint64_t> given the functions below; confirm.
202 | static std::uniform_real_distribution std_dist_f32;
203 | static std::uniform_int_distribution std_dist_u64;
204 |
205 | // seeds only the current thread's generator when thread_local is in effect
206 | void anydsl_random_seed(uint32_t seed) {
207 |     std_gen.seed(seed);
208 | }
209 |
210 | float anydsl_random_val_f32() {
211 |     return std_dist_f32(std_gen);
212 | }
213 |
214 | uint64_t anydsl_random_val_u64() {
215 |     return std_dist_u64(std_gen);
216 | }
213 |
214 | #ifndef AnyDSL_runtime_HAS_TBB_SUPPORT // C++11 threads version
215 | // NOTE(review): container template arguments were stripped in extraction throughout
216 | // this branch (unordered_map, vector, reinterpret_cast, lock_guard) — confirm originals.
217 | static std::unordered_map thread_pool; // id -> joinable std::thread
218 | static std::vector free_ids; // recycled thread ids
219 | static std::mutex thread_lock; // guards thread_pool and free_ids
220 |
221 | // Split [lower, upper) into num_threads chunks and run fun(args, begin, end) on each.
222 | // The last thread takes the remainder, so the full range is always covered.
223 | void anydsl_parallel_for(int32_t num_threads, int32_t lower, int32_t upper, void* args, void* fun) {
224 |     // Get number of available hardware threads
225 |     if (num_threads == 0) {
226 |         num_threads = std::thread::hardware_concurrency();
227 |         // hardware_concurrency is implementation defined, may return 0
228 |         num_threads = (num_threads == 0) ? 1 : num_threads;
229 |     }
230 |
231 |     void (*fun_ptr) (void*, int32_t, int32_t) = reinterpret_cast(fun);
232 |     const int32_t linear = (upper - lower) / num_threads;
233 |
234 |     // Create a pool of threads to execute the task
235 |     std::vector pool(num_threads);
236 |
237 |     for (int i = 0, a = lower, b = lower + linear; i < num_threads - 1; a = b, b += linear, i++) {
238 |         pool[i] = std::thread([=]() {
239 |             fun_ptr(args, a, b);
240 |         });
241 |     }
242 |
243 |     pool[num_threads - 1] = std::thread([=]() {
244 |         fun_ptr(args, lower + (num_threads - 1) * linear, upper);
245 |     });
246 |
247 |     // Wait for all the threads to finish
248 |     for (int i = 0; i < num_threads; i++)
249 |         pool[i].join();
250 | }
251 |
252 | // Start fun(args) on a new thread and return an id usable with anydsl_sync_thread.
253 | int32_t anydsl_spawn_thread(void* args, void* fun) {
254 |     std::lock_guard lock(thread_lock);
255 |
256 |     int32_t (*fun_ptr) (void*) = reinterpret_cast(fun);
257 |
258 |     int32_t id;
259 |     if (free_ids.size()) {
260 |         id = free_ids.back();
261 |         free_ids.pop_back();
262 |     } else {
263 |         id = static_cast(thread_pool.size()); // NOTE(review): cast target stripped; presumably int32_t
264 |     }
265 |
266 |     auto spawned = std::make_pair(id, std::thread([=](){ fun_ptr(args); }));
267 |     thread_pool.emplace(std::move(spawned));
268 |     return id;
269 | }
270 |
271 | // Join the thread with the given id and recycle the id; asserts on unknown ids.
272 | // join() runs outside the lock so other spawns/syncs are not blocked while waiting.
273 | void anydsl_sync_thread(int32_t id) {
274 |     auto thread = thread_pool.end();
275 |     {
276 |         std::lock_guard lock(thread_lock);
277 |         thread = thread_pool.find(id);
278 |     }
279 |     if (thread != thread_pool.end()) {
280 |         thread->second.join();
281 |         {
282 |             std::lock_guard lock(thread_lock);
283 |             free_ids.push_back(thread->first);
284 |             thread_pool.erase(thread);
285 |         }
286 |     } else {
287 |         assert(0 && "Trying to synchronize on invalid thread id");
288 |     }
289 | }
283 | #else // TBB version
284 | // Same contract as the C++11-threads branch, implemented with TBB task arenas/groups.
285 | // NOTE(review): template arguments were stripped in extraction throughout this branch.
286 | void anydsl_parallel_for(int32_t num_threads, int32_t lower, int32_t upper, void* args, void* fun) {
287 |     tbb::task_arena limited((num_threads == 0) ? tbb::task_arena::automatic : num_threads);
288 |     tbb::task_group tg;
289 |
290 |     void (*fun_ptr) (void*, int32_t, int32_t) = reinterpret_cast(fun);
291 |
292 |     limited.execute([&] {
293 |         tg.run([&] {
294 |             tbb::parallel_for(tbb::blocked_range(lower, upper),
295 |                 [=] (const tbb::blocked_range& range) {
296 |                     fun_ptr(args, range.begin(), range.end());
297 |                 });
298 |         });
299 |     });
300 |
301 |     limited.execute([&] { tg.wait(); });
302 | }
303 |
304 | typedef tbb::concurrent_unordered_map task_group_map; // NOTE(review): <int32_t, tbb::task_group>? confirm
305 | typedef std::pair task_group_node_ref;
306 | static task_group_map task_pool; // id -> task_group (entries are never erased; ids are recycled)
307 | static tbb::concurrent_queue free_ids;
308 | static std::mutex thread_lock; // serializes id allocation in spawn/sync
309 |
310 | // Start fun(args) as a TBB task and return an id usable with anydsl_sync_thread.
311 | int32_t anydsl_spawn_thread(void* args, void* fun) {
312 |     std::lock_guard lock(thread_lock);
313 |     int32_t id = -1;
314 |     if (!free_ids.try_pop(id)) {
315 |         id = int32_t(task_pool.size());
316 |     }
317 |
318 |     int32_t(*fun_ptr) (void*) = reinterpret_cast(fun);
319 |
320 |     assert(id >= 0);
321 |
322 |     // emplace is a no-op if the id already has a task_group (recycled id)
323 |     task_group_node_ref p = task_pool.emplace(std::piecewise_construct, std::forward_as_tuple(id), std::forward_as_tuple());
324 |     tbb::task_group& tg = p.first->second;
325 |
326 |     tg.run([=] { fun_ptr(args); });
327 |
328 |     return id;
329 | }
330 |
331 | // Wait for the task group with the given id and recycle the id; asserts on unknown ids.
332 | void anydsl_sync_thread(int32_t id) {
333 |     auto task = task_pool.end();
334 |     {
335 |         std::lock_guard lock(thread_lock);
336 |         task = task_pool.find(id);
337 |     }
338 |     if (task != task_pool.end()) {
339 |         task->second.wait(); // wait outside the lock so other spawns are not blocked
340 |         {
341 |             std::lock_guard lock(thread_lock);
342 |             free_ids.push(task->first);
343 |         }
344 |     } else {
345 |         assert(0 && "Trying to synchronize on invalid task id");
346 |     }
347 | }
348 | #endif
344 |
--------------------------------------------------------------------------------
/src/anydsl_runtime.h:
--------------------------------------------------------------------------------
1 | #ifndef ANYDSL_RUNTIME_H
2 | #define ANYDSL_RUNTIME_H
3 |
4 | #include <stdint.h> /* fixed-width integer types used by the API below */
5 | #include <stddef.h> /* size_t; NOTE(review): original include names were lost in extraction — restored from usage, confirm */
6 |
7 | #include "anydsl_runtime_config.h"
8 |
9 | #ifdef __cplusplus
10 | extern "C" {
11 | #endif
12 |
13 | /* pack platform id (low 4 bits) and device index into one mask */
14 | #define ANYDSL_DEVICE(p, d) ((p) | ((d) << 4))
15 |
16 | enum {
17 |     ANYDSL_HOST = 0,
18 |     ANYDSL_CUDA = 1,
19 |     ANYDSL_OPENCL = 2,
20 |     ANYDSL_HSA = 3,
21 |     ANYDSL_PAL = 4,
22 |     ANYDSL_LEVELZERO = 5
23 | };
24 |
25 | AnyDSL_runtime_API void anydsl_info(void);
26 |
27 | AnyDSL_runtime_API const char* anydsl_device_name(int32_t);
28 | AnyDSL_runtime_API bool anydsl_device_check_feature_support(int32_t, const char*);
29 |
30 | /* first int32_t argument is always the packed platform|device mask */
31 | AnyDSL_runtime_API void* anydsl_alloc(int32_t, int64_t);
32 | AnyDSL_runtime_API void* anydsl_alloc_host(int32_t, int64_t);
33 | AnyDSL_runtime_API void* anydsl_alloc_unified(int32_t, int64_t);
34 | AnyDSL_runtime_API void* anydsl_get_device_ptr(int32_t, void*);
35 | AnyDSL_runtime_API void anydsl_release(int32_t, void*);
36 | AnyDSL_runtime_API void anydsl_release_host(int32_t, void*);
37 |
38 | /* (src mask, src ptr, src offset, dst mask, dst ptr, dst offset, size) */
39 | AnyDSL_runtime_API void anydsl_copy(int32_t, const void*, int64_t, int32_t, void*, int64_t, int64_t);
40 |
41 | AnyDSL_runtime_API void anydsl_launch_kernel(
42 |     int32_t, const char*, const char*,
43 |     const uint32_t*, const uint32_t*,
44 |     void**, const uint32_t*, const uint32_t*, const uint32_t*, const uint8_t*,
45 |     uint32_t);
46 | AnyDSL_runtime_API void anydsl_synchronize(int32_t);
47 |
48 | AnyDSL_runtime_API void anydsl_random_seed(uint32_t);
49 | AnyDSL_runtime_API float anydsl_random_val_f32();
50 | AnyDSL_runtime_API uint64_t anydsl_random_val_u64();
51 |
52 | AnyDSL_runtime_API uint64_t anydsl_get_micro_time();
53 | AnyDSL_runtime_API uint64_t anydsl_get_nano_time();
54 | AnyDSL_runtime_API uint64_t anydsl_get_kernel_time();
55 |
56 | AnyDSL_runtime_API int32_t anydsl_isinff(float);
57 | AnyDSL_runtime_API int32_t anydsl_isnanf(float);
58 | AnyDSL_runtime_API int32_t anydsl_isfinitef(float);
59 | AnyDSL_runtime_API int32_t anydsl_isinf(double);
60 | AnyDSL_runtime_API int32_t anydsl_isnan(double);
61 | AnyDSL_runtime_API int32_t anydsl_isfinite(double);
62 |
63 | AnyDSL_runtime_API void anydsl_print_i16(int16_t);
64 | AnyDSL_runtime_API void anydsl_print_i32(int32_t);
65 | AnyDSL_runtime_API void anydsl_print_i64(int64_t);
66 | AnyDSL_runtime_API void anydsl_print_u16(uint16_t);
67 | AnyDSL_runtime_API void anydsl_print_u32(uint32_t);
68 | AnyDSL_runtime_API void anydsl_print_u64(uint64_t);
69 | AnyDSL_runtime_API void anydsl_print_f32(float);
70 | AnyDSL_runtime_API void anydsl_print_f64(double);
71 | AnyDSL_runtime_API void anydsl_print_char(char);
72 | AnyDSL_runtime_API void anydsl_print_string(char*);
73 | AnyDSL_runtime_API void anydsl_print_flush();
74 |
75 | AnyDSL_runtime_API void* anydsl_aligned_malloc(size_t, size_t);
76 | AnyDSL_runtime_API void anydsl_aligned_free(void*);
77 |
78 | /* (num_threads, lower, upper, args, fn) */
79 | AnyDSL_runtime_API void anydsl_parallel_for(int32_t, int32_t, int32_t, void*, void*);
80 | AnyDSL_runtime_API int32_t anydsl_spawn_thread(void*, void*);
81 | AnyDSL_runtime_API void anydsl_sync_thread(int32_t);
82 |
83 | struct AnyDSL_runtime_API Closure {
84 |     void (*fn)(uint64_t);
85 |     uint64_t payload;
86 | };
87 |
88 | AnyDSL_runtime_API int32_t anydsl_create_graph();
89 | AnyDSL_runtime_API int32_t anydsl_create_task(int32_t, Closure);
90 | AnyDSL_runtime_API void anydsl_create_edge(int32_t, int32_t);
91 | AnyDSL_runtime_API void anydsl_execute_graph(int32_t, int32_t);
92 |
93 | #ifdef __cplusplus
94 | }
95 | #include "anydsl_runtime.hpp"
96 | #endif
97 |
98 | #endif
95 |
--------------------------------------------------------------------------------
/src/anydsl_runtime.hpp:
--------------------------------------------------------------------------------
1 | #ifndef ANYDSL_RUNTIME_HPP
2 | #define ANYDSL_RUNTIME_HPP
3 |
4 | #ifndef ANYDSL_RUNTIME_H
5 | #include "anydsl_runtime.h"
6 | #endif
7 |
8 | namespace anydsl {
9 |
10 | // typed mirror of the ANYDSL_* platform ids from the C header
11 | enum class Platform : int32_t {
12 |     Host = ANYDSL_HOST,
13 |     Cuda = ANYDSL_CUDA,
14 |     OpenCL = ANYDSL_OPENCL,
15 |     HSA = ANYDSL_HSA,
16 |     PAL = ANYDSL_PAL,
17 |     LevelZero = ANYDSL_LEVELZERO
18 | };
19 |
20 | // thin wrapper around a device index on some platform
21 | struct Device {
22 |     Device(int32_t id) : id(id) {}
23 |     int32_t id;
24 | };
25 |
26 | // pack platform + device into the mask expected by the anydsl_* C API
27 | inline int32_t make_device(Platform p, Device d) {
28 |     return ANYDSL_DEVICE((int32_t)p, d.id);
29 | }
27 |
28 | // RAII wrapper for a runtime allocation of `size` elements on a given device.
29 | // Move-only; the destructor releases the allocation via anydsl_release.
30 | // NOTE(review): template parameter list was stripped in extraction — presumably
31 | // template<typename T> given the element usage below; confirm.
32 | template
33 | class Array {
34 | public:
35 |     Array()
36 |         : data_(nullptr), size_(0), dev_(0)
37 |     {}
38 |
39 |     // host allocation of `size` elements
40 |     Array(int64_t size)
41 |         : Array(Platform::Host, Device(0), size)
42 |     {}
43 |
44 |     // adopt an existing pointer; takes ownership (released in the destructor)
45 |     Array(int32_t dev, T* ptr, int64_t size)
46 |         : data_(ptr), size_(size), dev_(dev)
47 |     {}
48 |
49 |     Array(Platform p, Device d, int64_t size)
50 |         : dev_(make_device(p, d)) {
51 |         allocate(size);
52 |     }
53 |
54 |     Array(Array&& other)
55 |         : data_(other.data_),
56 |           size_(other.size_),
57 |           dev_(other.dev_) {
58 |         other.data_ = nullptr; // moved-from array must not release the buffer
59 |     }
60 |
61 |     Array& operator = (Array&& other) {
62 |         deallocate();
63 |         dev_ = other.dev_;
64 |         size_ = other.size_;
65 |         data_ = other.data_;
66 |         other.data_ = nullptr;
67 |         return *this;
68 |     }
69 |
70 |     Array(const Array&) = delete;
71 |     Array& operator = (const Array&) = delete;
72 |
73 |     ~Array() { deallocate(); }
74 |
75 |     T* begin() { return data_; }
76 |     const T* begin() const { return data_; }
77 |
78 |     T* end() { return data_ + size_; }
79 |     const T* end() const { return data_ + size_; }
80 |
81 |     T* data() { return data_; }
82 |     const T* data() const { return data_; }
83 |
84 |     int64_t size() const { return size_; } // element count, not bytes
85 |     int32_t device() const { return dev_; } // packed platform|device mask
86 |
87 |     const T& operator [] (int i) const { return data_[i]; }
88 |     T& operator [] (int i) { return data_[i]; }
89 |
90 |     // relinquish ownership; caller becomes responsible for anydsl_release
91 |     T* release() {
92 |         T* ptr = data_;
93 |         data_ = nullptr;
94 |         size_ = 0;
95 |         dev_ = 0;
96 |         return ptr;
97 |     }
98 |
99 | protected:
100 |     void allocate(int64_t size) {
101 |         size_ = size;
102 |         data_ = (T*)anydsl_alloc(dev_, sizeof(T) * size);
103 |     }
104 |
105 |     void deallocate() {
106 |         if (data_) anydsl_release(dev_, (void*)data_);
107 |     }
108 |
109 |     T* data_; // element storage (may live on a device; only dereference on host for host arrays)
110 |     int64_t size_; // element count
111 |     int32_t dev_; // packed platform|device mask
112 | };
106 |
107 | // copy helpers over anydsl_copy; offsets/sizes are in elements, converted to bytes here.
108 | // NOTE(review): template parameter lists were stripped in extraction — presumably
109 | // template<typename T> on each overload; confirm.
110 | template
111 | void copy(const Array& a, Array& b) {
112 |     anydsl_copy(a.device(), (const void*)a.data(), 0,
113 |                 b.device(), (void*)b.data(), 0,
114 |                 a.size() * sizeof(T));
115 | }
116 |
117 | template
118 | void copy(const Array& a, Array& b, int64_t size) {
119 |     anydsl_copy(a.device(), (const void*)a.data(), 0,
120 |                 b.device(), (void*)b.data(), 0,
121 |                 size * sizeof(T));
122 | }
123 |
124 | template
125 | void copy(const Array& a, int64_t offset_a, Array& b, int64_t offset_b, int64_t size) {
126 |     anydsl_copy(a.device(), (const void*)a.data(), offset_a * sizeof(T),
127 |                 b.device(), (void*)b.data(), offset_b * sizeof(T),
128 |                 size * sizeof(T));
129 | }
127 |
128 | } // namespace anydsl
129 |
130 | #endif
131 |
--------------------------------------------------------------------------------
/src/anydsl_runtime_config.h.in:
--------------------------------------------------------------------------------
1 | #ifndef ANYDSL_RUNTIME_CONFIG_H
2 | #define ANYDSL_RUNTIME_CONFIG_H
3 |
4 | // AnyDSL runtime feature support
5 | // (template processed by CMake configure_file: #cmakedefine and @VAR@ are substituted)
6 |
7 | #cmakedefine AnyDSL_runtime_BUILD_SHARED
8 | #cmakedefine AnyDSL_runtime_HAS_LLVM_SUPPORT
9 | #cmakedefine AnyDSL_runtime_HAS_JIT_SUPPORT
10 | #cmakedefine AnyDSL_runtime_HAS_CUDA_SUPPORT
11 | #cmakedefine AnyDSL_runtime_HAS_OPENCL_SUPPORT
12 | #cmakedefine AnyDSL_runtime_HAS_LEVELZERO_SUPPORT
13 | #cmakedefine AnyDSL_runtime_HAS_HSA_SUPPORT
14 | #cmakedefine AnyDSL_runtime_HAS_PAL_SUPPORT
15 | #cmakedefine AnyDSL_runtime_HAS_TBB_SUPPORT
16 |
17 |
18 | // symbol import/export macros for shared builds (MSVC dllimport/dllexport, ELF visibility otherwise)
19 | #if defined(AnyDSL_runtime_BUILD_SHARED)
20 | #  ifdef _MSC_VER
21 | #    define __dll_import __declspec(dllimport)
22 | #    define __dll_export __declspec(dllexport)
23 | #  else // _MSC_VER
24 | #    define __dll_import __attribute__((visibility("default")))
25 | #    define __dll_export __attribute__((visibility("default")))
26 | #  endif // _MSC_VER
27 | #  ifdef AnyDSL_runtime_EXPORTS
28 | #    define AnyDSL_runtime_API __dll_export
29 | #  else // AnyDSL_runtime_EXPORTS
30 | #    define AnyDSL_runtime_API __dll_import
31 | #  endif // AnyDSL_runtime_EXPORTS
32 | #  ifdef AnyDSL_runtime_jit_EXPORTS
33 | #    define AnyDSL_runtime_jit_API __dll_export
34 | #  else // AnyDSL_runtime_jit_EXPORTS
35 | #    define AnyDSL_runtime_jit_API __dll_import
36 | #  endif // AnyDSL_runtime_jit_EXPORTS
37 | #else // AnyDSL_runtime_BUILD_SHARED
38 | #  define AnyDSL_runtime_API
39 | #  define AnyDSL_runtime_jit_API
40 | #endif // AnyDSL_runtime_BUILD_SHARED
41 |
42 |
43 | // CUDA support
44 |
45 | #cmakedefine AnyDSL_runtime_CUDA_CXX_STANDARD @AnyDSL_runtime_CUDA_CXX_STANDARD@
46 | #define AnyDSL_runtime_LIBDEVICE_LIB "@AnyDSL_runtime_LIBDEVICE_LIB@"
47 | #define AnyDSL_runtime_NVCC_INC "@AnyDSL_runtime_NVCC_INC@"
48 |
49 | // HSA support
50 |
51 | #define AnyDSL_runtime_HSA_BITCODE_PATH "@AnyDSL_runtime_HSA_BITCODE_PATH@/"
52 | #define AnyDSL_runtime_HSA_BITCODE_SUFFIX "@AnyDSL_runtime_HSA_BITCODE_SUFFIX@"
53 |
54 | // PAL support
55 |
56 | #define AnyDSL_runtime_PAL_BITCODE_PATH "@AnyDSL_runtime_PAL_BITCODE_PATH@/"
57 | #define AnyDSL_runtime_PAL_BITCODE_SUFFIX "@AnyDSL_runtime_PAL_BITCODE_SUFFIX@"
58 |
59 | // jit support
60 |
61 | #define AnyDSL_runtime_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@"
62 |
63 | // debug output
64 |
65 | #cmakedefine AnyDSL_runtime_ENABLE_DEBUG_OUTPUT
66 |
67 |
68 | #endif // ANYDSL_RUNTIME_CONFIG_H
67 |
--------------------------------------------------------------------------------
/src/cpu_platform.cpp:
--------------------------------------------------------------------------------
1 | #include "cpu_platform.h"
2 | #include "runtime.h"
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | #if defined(__APPLE__)
11 | #include
12 | #include
13 | #elif defined(_WIN32)
14 | #define WIN32_LEAN_AND_MEAN
15 | #define NOMINMAX
16 | #include
17 | #endif
18 |
// Constructor: queries a human-readable CPU model name for device_name_,
// using the platform-specific facility available on macOS, Windows, or Linux.
CpuPlatform::CpuPlatform(Runtime* runtime)
    : Platform(runtime)
{
#if defined(__APPLE__)
    // macOS: first sysctl call with a null buffer yields the required length
    // (including the trailing NUL); the second call fills the buffer.
    size_t buf_len;
    sysctlbyname("machdep.cpu.brand_string", nullptr, &buf_len, nullptr, 0);
    device_name_.resize(buf_len, '\0');
    sysctlbyname("machdep.cpu.brand_string", device_name_.data(), &buf_len, nullptr, 0);
#elif defined(_WIN32)
    // Windows: read the processor name from the registry as a wide string,
    // then convert it to UTF-8 for storage.
    HKEY key;
    if (RegOpenKeyExW(HKEY_LOCAL_MACHINE, L"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0U, KEY_QUERY_VALUE, &key) != ERROR_SUCCESS)
        error("failed to open processor information registry key");

    // First query with a null buffer to obtain the value's size and type.
    DWORD cpu_name_type, cpu_name_size;
    if (RegQueryValueExW(key, L"ProcessorNameString", nullptr, &cpu_name_type, nullptr, &cpu_name_size) != ERROR_SUCCESS)
        error("failed to query processor name string length");

    if (cpu_name_type != REG_SZ)
        error("unexpected type for processor name string");

    // Registry size is in bytes; convert to a wchar_t count.
    int cpu_name_length = cpu_name_size / sizeof(wchar_t);

    std::wstring buffer(cpu_name_length, '\0');
    if (RegQueryValueExW(key, L"ProcessorNameString", nullptr, &cpu_name_type, reinterpret_cast(buffer.data()), &cpu_name_size) != ERROR_SUCCESS)
        error("failed to query processor name string");

    RegCloseKey(key);

    // Two-step UTF-16 -> UTF-8 conversion: measure first, then convert.
    int u8_cpu_name_length = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, buffer.data(), cpu_name_length, nullptr, 0, nullptr, nullptr);

    if (u8_cpu_name_length <= 0)
        error("failed to compute converted UTF-8 CPU name string length");

    device_name_.resize(u8_cpu_name_length, '\0');

    if (WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, buffer.data(), cpu_name_length, device_name_.data(), u8_cpu_name_length, nullptr, nullptr) <= 0)
        error("failed to convert CPU name string to UTF-8");
#else
    // Linux and others: scan /proc/cpuinfo for the model-name line.
    std::ifstream cpuinfo("/proc/cpuinfo");

    if (!cpuinfo)
        error("failed to open /proc/cpuinfo");

#if defined __arm__ || __aarch64__
    std::string model_string = "CPU part\t: ";
#else // x86, x86_64
    std::string model_string = "model name\t: ";
#endif

    // NOTE(review): std::search's return value is discarded; this relies on
    // the istreambuf_iterator pass consuming the stream up to the match so
    // that the getline below reads the model name -- verify this behavior.
    std::search(std::istreambuf_iterator(cpuinfo), {}, model_string.begin(), model_string.end());
    std::getline(cpuinfo >> std::ws, device_name_);
#endif
}
72 |
--------------------------------------------------------------------------------
/src/cpu_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef CPU_PLATFORM_H
2 | #define CPU_PLATFORM_H
3 |
4 | #include "platform.h"
5 |
6 | #ifndef PAGE_SIZE
7 | #define PAGE_SIZE 4096
8 | #endif
9 |
10 | #include
11 |
/// CPU platform (always available, exactly one device). Host and unified
/// allocations are aligned to PAGE_SIZE (4096 bytes); plain alloc() uses
/// 32-byte alignment.
class CpuPlatform : public Platform {
public:
    CpuPlatform(Runtime* runtime);

protected:
    // Device allocation: 32-byte aligned host memory.
    void* alloc(DeviceId, int64_t size) override {
        return Runtime::aligned_malloc(size, 32);
    }

    // Page-aligned host allocation.
    void* alloc_host(DeviceId, int64_t size) override {
        return Runtime::aligned_malloc(size, PAGE_SIZE);
    }

    // On the CPU, "unified" memory is just page-aligned host memory.
    void* alloc_unified(DeviceId, int64_t size) override {
        return Runtime::aligned_malloc(size, PAGE_SIZE);
    }

    // Host and device address spaces coincide.
    void* get_device_ptr(DeviceId, void* ptr) override {
        return ptr;
    }

    void release(DeviceId, void* ptr) override {
        Runtime::aligned_free(ptr);
    }

    void release_host(DeviceId dev, void* ptr) override {
        release(dev, ptr);
    }

    // Shared failure path for the kernel-launch entry points below.
    void no_kernel() {
        error("Kernels are not supported on the CPU");
    }

    void launch_kernel(DeviceId, const LaunchParams&) override { no_kernel(); }
    void synchronize(DeviceId) override { no_kernel(); }

    // All copies are plain memcpy within the host address space;
    // offsets are in bytes.
    void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) {
        memcpy((char*)dst + offset_dst, (char*)src + offset_src, size);
    }

    void copy(DeviceId, const void* src, int64_t offset_src,
              DeviceId, void* dst, int64_t offset_dst, int64_t size) override {
        copy(src, offset_src, dst, offset_dst, size);
    }
    void copy_from_host(const void* src, int64_t offset_src, DeviceId,
                        void* dst, int64_t offset_dst, int64_t size) override {
        copy(src, offset_src, dst, offset_dst, size);
    }
    void copy_to_host(DeviceId, const void* src, int64_t offset_src,
                      void* dst, int64_t offset_dst, int64_t size) override {
        copy(src, offset_src, dst, offset_dst, size);
    }

    // CPU model name, filled in by the constructor.
    std::string device_name_;
    size_t dev_count() const override { return 1; }
    std::string name() const override { return "CPU"; }
    const char* device_name(DeviceId) const override { return device_name_.c_str(); }
    bool device_check_feature_support(DeviceId, const char*) const override { return false; }
};
72 |
73 | #endif
74 |
--------------------------------------------------------------------------------
/src/cuda_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef CUDA_PLATFORM_H
2 | #define CUDA_PLATFORM_H
3 |
4 | #include "platform.h"
5 | #include "runtime.h"
6 |
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 |
14 | #define CUDA_API_PER_THREAD_DEFAULT_STREAM
15 | #include
16 | #include
17 | #include
18 |
19 | #if CUDA_VERSION < 10000
20 | #error "CUDA 10.0 or higher required!"
21 | #endif
22 |
23 | /// CUDA platform. Has the same number of devices as that of the CUDA implementation.
24 | class CudaPlatform : public Platform {
25 | public:
26 | CudaPlatform(Runtime* runtime);
27 | ~CudaPlatform();
28 |
29 | protected:
30 | void* alloc(DeviceId dev, int64_t size) override;
31 | void* alloc_host(DeviceId dev, int64_t size) override;
32 | void* alloc_unified(DeviceId dev, int64_t size) override;
33 | void* get_device_ptr(DeviceId, void* ptr) override;
34 | void release(DeviceId dev, void* ptr) override;
35 | void release_host(DeviceId dev, void* ptr) override;
36 |
37 | void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
38 | void synchronize(DeviceId dev) override;
39 |
40 | void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
41 | void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
42 | void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
43 |
44 | size_t dev_count() const override { return devices_.size(); }
45 | std::string name() const override { return "CUDA"; }
46 | const char* device_name(DeviceId dev) const override;
47 | bool device_check_feature_support(DeviceId dev, const char* feature) const override;
48 |
49 | typedef std::unordered_map FunctionMap;
50 |
51 | struct DeviceData {
52 | CUdevice dev;
53 | CUcontext ctx;
54 | CUjit_target compute_capability;
55 | std::atomic_flag locked = ATOMIC_FLAG_INIT;
56 | std::unordered_map modules;
57 | std::unordered_map functions;
58 | std::string name;
59 |
60 | DeviceData() {}
61 | DeviceData(const DeviceData&) = delete;
62 | DeviceData(DeviceData&& data)
63 | : dev(data.dev)
64 | , ctx(data.ctx)
65 | , compute_capability(data.compute_capability)
66 | , modules(std::move(data.modules))
67 | , functions(std::move(data.functions))
68 | , name(std::move(name))
69 | {}
70 |
71 | void lock() {
72 | while (locked.test_and_set(std::memory_order_acquire)) ;
73 | }
74 |
75 | void unlock() {
76 | locked.clear(std::memory_order_release);
77 | }
78 | };
79 |
80 | std::vector devices_;
81 |
82 | bool dump_binaries = false;
83 |
84 | struct ProfileData {
85 | CudaPlatform* platform;
86 | CUcontext ctx;
87 | CUevent start;
88 | CUevent end;
89 | };
90 |
91 | std::mutex profile_lock_;
92 | std::forward_list profiles_;
93 | void erase_profiles(bool);
94 |
95 | CUfunction load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
96 |
97 | std::string compile_nvptx(DeviceId dev, const std::string& filename, const std::string& program_string) const;
98 | std::string compile_nvvm(DeviceId dev, const std::string& filename, const std::string& program_string) const;
99 | std::string compile_cuda(DeviceId dev, const std::string& filename, const std::string& program_string) const;
100 | CUmodule create_module(DeviceId dev, const std::string& filename, const std::string& ptx_string) const;
101 | };
102 |
103 | #endif
104 |
--------------------------------------------------------------------------------
/src/dummy_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef DUMMY_PLATFORM_H
2 | #define DUMMY_PLATFORM_H
3 |
4 | #include "platform.h"
5 | #include "runtime.h"
6 |
7 | #include
8 |
9 | /// Dummy platform, implemented
10 | class DummyPlatform : public Platform {
11 | public:
12 | DummyPlatform(Runtime* runtime, const std::string& name)
13 | : Platform(runtime), name_(name)
14 | {}
15 |
16 | protected:
17 | void* alloc(DeviceId, int64_t) override { platform_error(); }
18 | void* alloc_host(DeviceId, int64_t) override { platform_error(); }
19 | void* alloc_unified(DeviceId, int64_t) override { platform_error(); }
20 | void* get_device_ptr(DeviceId, void*) override { platform_error(); }
21 | void release(DeviceId, void*) override { platform_error(); }
22 | void release_host(DeviceId, void*) override { platform_error(); }
23 |
24 | void launch_kernel(DeviceId, const LaunchParams&) override { platform_error(); }
25 | void synchronize(DeviceId) override { platform_error(); }
26 |
27 | void copy(DeviceId, const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); }
28 | void copy_from_host(const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); }
29 | void copy_to_host(DeviceId, const void*, int64_t, void*, int64_t, int64_t) override { platform_error(); }
30 |
31 | size_t dev_count() const override { return 0; }
32 | std::string name() const override { return name_; }
33 | const char* device_name(DeviceId) const override { return "Dummy"; }
34 | bool device_check_feature_support(DeviceId, const char*) const override { return false; }
35 |
36 | std::string name_;
37 | };
38 |
39 | #endif
40 |
--------------------------------------------------------------------------------
/src/extract_runtime_srcs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 |
def main():
    """Dump the bytes of every file named on the command line as a
    comma-separated list of decimal values (10 per row), suitable for
    inclusion in a C/C++ char-array initializer (see runtime_srcs.inc).
    """
    col, maxcols = 0, 10
    for f in sys.argv[1:]:
        # Fix: read in binary mode so raw byte values (0-255) are emitted.
        # The previous text-mode read + ord() emitted Unicode code points,
        # which can exceed 255 (or fail to decode) for non-ASCII input.
        with open(f, "rb") as fd:
            for b in fd.read():
                sys.stdout.write("{:3}, ".format(b))
                col += 1
                if col == maxcols:
                    sys.stdout.write("\n")
                    col = 0
    # Fix: terminate the final, possibly partial row with a newline.
    if col:
        sys.stdout.write("\n")

if __name__ == "__main__":
    main()
17 |
--------------------------------------------------------------------------------
/src/hsa_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef HSA_PLATFORM_H
2 | #define HSA_PLATFORM_H
3 |
4 | #include "platform.h"
5 | #include "runtime.h"
6 |
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include
13 | #include
14 |
15 | namespace llvm {
16 | class OptimizationLevel;
17 | }
18 |
19 | /// HSA platform. Has the same number of devices as that of the HSA implementation.
20 | class HSAPlatform : public Platform {
21 | public:
22 | HSAPlatform(Runtime* runtime);
23 | ~HSAPlatform();
24 |
25 | protected:
26 | void* alloc(DeviceId dev, int64_t size) override { return alloc_hsa(size, devices_[dev].amd_coarsegrained_pool); }
27 | void* alloc_host(DeviceId dev, int64_t size) override { return alloc_hsa(size, devices_[dev].amd_coarsegrained_pool); }
28 | void* alloc_unified(DeviceId dev, int64_t size) override { return alloc_hsa(size, devices_[dev].finegrained_region); }
29 | void* get_device_ptr(DeviceId, void* ptr) override { return ptr; }
30 | void release(DeviceId dev, void* ptr) override;
31 | void release_host(DeviceId dev, void* ptr) override { release(dev, ptr); }
32 |
33 | void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
34 | void synchronize(DeviceId dev) override;
35 |
36 | void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size);
37 | void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
38 | void copy_from_host(const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
39 | void copy_to_host(DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
40 |
41 | size_t dev_count() const override { return devices_.size(); }
42 | std::string name() const override { return "HSA"; }
43 | const char* device_name(DeviceId dev) const override;
44 | bool device_check_feature_support(DeviceId, const char*) const override { return false; }
45 |
46 | struct KernelInfo {
47 | uint64_t kernel;
48 | uint32_t kernarg_segment_size;
49 | uint32_t group_segment_size;
50 | uint32_t private_segment_size;
51 | void* kernarg_segment;
52 | };
53 |
54 | typedef std::unordered_map KernelMap;
55 |
56 | struct DeviceData {
57 | hsa_agent_t agent;
58 | hsa_profile_t profile;
59 | hsa_default_float_rounding_mode_t float_mode;
60 | std::string isa;
61 | hsa_queue_t* queue;
62 | hsa_signal_t signal;
63 | hsa_region_t kernarg_region, finegrained_region, coarsegrained_region;
64 | hsa_amd_memory_pool_t amd_kernarg_pool, amd_finegrained_pool, amd_coarsegrained_pool;
65 | std::atomic_flag locked = ATOMIC_FLAG_INIT;
66 | std::unordered_map programs;
67 | std::unordered_map kernels;
68 | std::string name;
69 |
70 | DeviceData() {}
71 | DeviceData(const DeviceData&) = delete;
72 | DeviceData(DeviceData&& data)
73 | : agent(data.agent)
74 | , profile(data.profile)
75 | , float_mode(data.float_mode)
76 | , isa(data.isa)
77 | , queue(data.queue)
78 | , signal(data.signal)
79 | , kernarg_region(data.kernarg_region)
80 | , finegrained_region(data.finegrained_region)
81 | , coarsegrained_region(data.coarsegrained_region)
82 | , amd_kernarg_pool(data.amd_kernarg_pool)
83 | , amd_finegrained_pool(data.amd_finegrained_pool)
84 | , amd_coarsegrained_pool(data.amd_finegrained_pool)
85 | , programs(std::move(data.programs))
86 | , kernels(std::move(data.kernels))
87 | , name(data.name)
88 | {}
89 |
90 | void lock() {
91 | while (locked.test_and_set(std::memory_order_acquire)) ;
92 | }
93 |
94 | void unlock() {
95 | locked.clear(std::memory_order_release);
96 | }
97 | };
98 |
99 | uint64_t frequency_;
100 | std::vector devices_;
101 |
102 | void* alloc_hsa(int64_t, hsa_region_t);
103 | void* alloc_hsa(int64_t, hsa_amd_memory_pool_t);
104 | static hsa_status_t iterate_agents_callback(hsa_agent_t, void*);
105 | static hsa_status_t iterate_regions_callback(hsa_region_t, void*);
106 | static hsa_status_t iterate_memory_pools_callback(hsa_amd_memory_pool_t, void*);
107 | KernelInfo& load_kernel(DeviceId, const std::string&, const std::string&);
108 | std::string compile_gcn(DeviceId, const std::string&, const std::string&) const;
109 | std::string emit_gcn(const std::string&, const std::string&, const std::string&, llvm::OptimizationLevel) const;
110 | };
111 |
112 | #endif
113 |
--------------------------------------------------------------------------------
/src/jit.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 |
16 | #include
17 | #include
18 | #include
19 |
20 | #include "anydsl_jit.h"
21 | #include "log.h"
22 | #include "runtime.h"
23 |
24 | bool compile(
25 | const std::vector& file_names,
26 | const std::vector& file_data,
27 | thorin::World& world,
28 | std::ostream& error_stream);
29 |
// Embedded runtime sources: the .inc file is generated at build time by
// extract_runtime_srcs.py as a comma-separated byte list; the trailing 0
// NUL-terminates the array so it can be used as a C string.
static const char runtime_srcs[] = {
#include "runtime_srcs.inc"
0
};
34 |
/// JIT compilation state: owns one llvm::ExecutionEngine per compiled
/// program, addressed by the index returned from compile().
struct JIT {
    struct Program {
        Program(llvm::ExecutionEngine* engine) : engine(engine) {}
        llvm::ExecutionEngine* engine;
    };

    std::vector programs;
    Runtime* runtime;
    // Log level forwarded to the Thorin world on each compile.
    thorin::LogLevel log_level;

    JIT(Runtime* runtime) : runtime(runtime), log_level(thorin::LogLevel::Warn) {
        llvm::InitializeNativeTarget();
        llvm::InitializeNativeTargetAsmPrinter();
    }

    // Compiles `program_src` (length `size`) at optimization level `opt`
    // (0..3) and returns an index into `programs`, or -1 on engine-creation
    // failure. Results are cached via the runtime's on-disk cache.
    int32_t compile(const char* program_src, uint32_t size, uint32_t opt) {
        // The LLVM context and module have to be alive for the duration of this function
        std::unique_ptr llvm_context;
        std::unique_ptr llvm_module;

        // NOTE(review): the hash's template argument was lost in this copy of
        // the file; hashing the raw pointer (rather than the source text)
        // would make the module name unstable across runs -- verify upstream.
        size_t prog_key = std::hash{}(program_src);
        std::stringstream hex_stream;
        hex_stream << std::hex << prog_key;
        std::string program_str = std::string(program_src, size);
        std::string cached_llvm = runtime->load_from_cache(program_str, ".llvm");
        std::string module_name = "jit_" + hex_stream.str();
        if (cached_llvm.empty()) {
            // Cache miss: run the full Thorin -> LLVM pipeline.
            bool debug = false;
            assert(opt <= 3);

            thorin::Thorin thorin(module_name);
            thorin.world().set(log_level);
            thorin.world().set(std::make_shared(std::cerr));
            // Compile the embedded runtime sources together with the program.
            if (!::compile(
                    { "runtime", module_name },
                    { std::string(runtime_srcs), program_str },
                    thorin.world(), std::cerr))
                error("JIT: error while compiling sources");

            thorin.opt();

            std::string host_triple, host_cpu, host_attr, hls_flags;
            thorin::DeviceBackends backends(thorin.world(), opt, debug, hls_flags);

            thorin::llvm::CPUCodeGen cg(thorin, opt, debug, host_triple, host_cpu, host_attr);
            std::tie(llvm_context, llvm_module) = cg.emit_module();
            // Cache the textual LLVM IR for subsequent runs.
            std::stringstream stream;
            llvm::raw_os_ostream llvm_stream(stream);
            llvm_module->print(llvm_stream, nullptr);
            runtime->store_to_cache(program_str, stream.str(), ".llvm");

            // Emit, cache, and register each generated device-backend source.
            for (auto& cg : backends.cgs) {
                if (cg) {
                    if (std::string(cg->file_ext()) == ".hls")
                        error("JIT compilation of hls not supported!");
                    std::ostringstream stream;
                    cg->emit_stream(stream);
                    runtime->store_to_cache(cg->file_ext() + program_str, stream.str(), cg->file_ext());
                    runtime->register_file(module_name + cg->file_ext(), stream.str());
                }
            }
        } else {
            // Cache hit: parse the cached LLVM IR and re-register any cached
            // device-backend sources.
            llvm::SMDiagnostic diagnostic_err;
            llvm_context = std::make_unique();
            llvm_module = llvm::parseIR(llvm::MemoryBuffer::getMemBuffer(cached_llvm)->getMemBufferRef(), diagnostic_err, *llvm_context);

            auto load_backend_src = [&](std::string ext) {
                std::string cached_src = runtime->load_from_cache(ext + program_str, ext);
                if (!cached_src.empty())
                    runtime->register_file(module_name + ext, cached_src);
            };
            load_backend_src(".cl");
            load_backend_src(".cu");
            load_backend_src(".nvvm");
            load_backend_src(".amdgpu");
        }

        llvm::TargetOptions options;
        options.AllowFPOpFusion = llvm::FPOpFusion::Fast;

        // Build an MCJIT engine for the host CPU at the requested opt level.
        auto engine = llvm::EngineBuilder(std::move(llvm_module))
            .setEngineKind(llvm::EngineKind::JIT)
            .setMCPU(llvm::sys::getHostCPUName())
            .setTargetOptions(options)
            .setOptLevel( opt == 0  ? llvm::CodeGenOptLevel::None    :
                          opt == 1  ? llvm::CodeGenOptLevel::Less    :
                          opt == 2  ? llvm::CodeGenOptLevel::Default :
                       /* opt == 3 */ llvm::CodeGenOptLevel::Aggressive)
            .create();
        if (!engine)
            return -1;

        engine->finalizeObject();
        programs.push_back(Program(engine));

        return (int32_t)programs.size() - 1;
    }

    // Returns the address of `fn_name` in the program identified by `key`
    // (as returned from compile()), or null when key is the failure value -1.
    void* lookup_function(int32_t key, const char* fn_name) {
        if (key == -1)
            return nullptr;

        return (void *)programs[key].engine->getFunctionAddress(fn_name);
    }

    // Makes the symbols of `lib` available to subsequently JIT-ed code.
    void link(const char* lib) {
        llvm::sys::DynamicLibrary::LoadLibraryPermanently(lib);
    }
};
144 |
// Lazily-constructed global JIT instance, shared by all C API entry points.
// NOTE(review): the unique_ptr's template argument (presumably JIT) was lost
// in this copy of the file.
JIT& jit() {
    static std::unique_ptr jit(new JIT(&runtime()));
    return *jit;
}
149 |
150 | void anydsl_set_cache_directory(const char* dir) {
151 | jit().runtime->set_cache_directory(dir == nullptr ? std::string() : dir);
152 | }
153 |
154 | const char* anydsl_get_cache_directory() {
155 | static std::string dir;
156 | dir = jit().runtime->get_cache_directory();
157 | return dir.c_str();
158 | }
159 |
160 | void anydsl_link(const char* lib) {
161 | jit().link(lib);
162 | }
163 |
164 | int32_t anydsl_compile(const char* program, uint32_t size, uint32_t opt) {
165 | return jit().compile(program, size, opt);
166 | }
167 |
168 | void anydsl_set_log_level(uint32_t log_level) {
169 | jit().log_level = log_level <= 4 ? static_cast(log_level) : thorin::LogLevel::Warn;
170 | }
171 |
172 | void* anydsl_lookup_function(int32_t key, const char* fn_name) {
173 | return jit().lookup_function(key, fn_name);
174 | }
175 |
--------------------------------------------------------------------------------
/src/levelzero_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef LEVEL_ZERO_PLATFORM_H
2 | #define LEVEL_ZERO_PLATFORM_H
3 |
4 | #include "platform.h"
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include
12 |
13 |
/// oneAPI Level Zero platform
class LevelZeroPlatform : public Platform {
public:
    LevelZeroPlatform(Runtime* runtime);
    ~LevelZeroPlatform();

protected:
    // Memory management entry points (see Platform for contracts).
    void* alloc(DeviceId dev, int64_t size) override;
    void* alloc_host(DeviceId, int64_t) override;
    void* alloc_unified(DeviceId, int64_t) override;
    // Not supported by this backend.
    void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); }
    void release(DeviceId dev, void* ptr) override;
    void release_host(DeviceId, void*) override;

    void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
    void synchronize(DeviceId dev) override;

    void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
    void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
    void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;

    size_t dev_count() const override { return devices_.size(); }
    std::string name() const override { return "oneAPI Level Zero"; }
    const char* device_name(DeviceId dev) const override;
    bool device_check_feature_support(DeviceId, const char*) const override { return false; }

    // NOTE(review): map template arguments were lost in this copy of the
    // file; presumably kernel name -> ze_kernel_handle_t -- verify upstream.
    typedef std::unordered_map KernelMap;

    /// Per-device state: driver/device handles, lazily created command list
    /// and context, plus module/kernel caches.
    struct DeviceData {
        LevelZeroPlatform* parent;
        ze_driver_handle_t driver;
        ze_device_handle_t device;
        std::string device_name;
        // Created lazily; null until first use.
        ze_command_list_handle_t queue = nullptr;
        ze_context_handle_t ctx = nullptr;
        std::unordered_map modules;
        std::unordered_map kernels;
        // Device timer resolution, used to convert timestamps.
        double timerResolution;

        DeviceData(
            LevelZeroPlatform* parent,
            ze_driver_handle_t driver,
            ze_device_handle_t device,
            const std::string& device_name)
            : parent(parent)
            , driver(driver)
            , device(device)
            , device_name(device_name)
        {}
        DeviceData(DeviceData&&) = default;
        DeviceData(const DeviceData&) = delete;
    };

    std::vector devices_;
    std::vector contexts_;

    ze_kernel_handle_t load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
    friend void determineDeviceCapabilities(ze_device_handle_t hDevice, LevelZeroPlatform::DeviceData& device);
};
73 |
74 | #endif
75 |
--------------------------------------------------------------------------------
/src/log.h:
--------------------------------------------------------------------------------
1 | #ifndef LOG_H
2 | #define LOG_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 |
/// Sink helpers: consume any number of arguments without side effects to
/// suppress "unused variable/parameter" warnings.
/// Fix: the template parameter list was lost in this copy of the file
/// (the signature uses T and Args...); restored.
inline void unused() {}
template <typename T, typename... Args>
inline void unused(const T& t, Args... args) { (void)t; unused(args...); }
12 |
/// Base case of the recursive formatter: writes fmt plus a newline.
/// Asserts that no '%' placeholder remains (i.e. no missing arguments).
/// Fix: the template parameter list of the recursive overload was lost in
/// this copy of the file (the signature uses T and Args...); restored.
inline void print(std::ostream& os, const char* fmt) {
    assert(!strchr(fmt, '%') && "Not enough arguments to print");
    os << fmt << std::endl;
}

/// Recursive case: writes fmt up to the next unescaped '%' (a "%%" pair is
/// skipped and emitted verbatim, not unescaped), streams t in its place,
/// then recurses on the rest of the format string and arguments.
template <typename T, typename... Args>
void print(std::ostream& os, const char* fmt, const T& t, Args... args) {
    auto ptr = strchr(fmt, '%');
    while (ptr && ptr[1] == '%') ptr = strchr(ptr + 2, '%');
    assert(ptr && "Too many arguments to print");
    os.write(fmt, ptr - fmt);
    os << t;
    print(os, ptr + 1, args...);
}
27 |
/// Prints a formatted message to stderr and aborts the process; never
/// returns.
/// Fix: the template parameter list was lost in this copy of the file
/// (the signature uses Args...); restored.
template <typename... Args>
[[noreturn]] void error(Args... args) {
    print(std::cerr, args...);
    std::abort();
}
33 |
/// Prints a formatted informational message to stdout.
/// Fix: the template parameter list was lost in this copy of the file
/// (the signature uses Args...); restored.
template <typename... Args>
void info(Args... args) {
    print(std::cout, args...);
}
38 |
/// Prints a formatted debug message to stdout when the runtime was built
/// with AnyDSL_runtime_ENABLE_DEBUG_OUTPUT; otherwise only silences the
/// unused-argument warnings.
/// Fix: the template parameter list was lost in this copy of the file
/// (the signature uses Args...); restored.
template <typename... Args>
void debug(Args... args) {
#ifdef AnyDSL_runtime_ENABLE_DEBUG_OUTPUT
    print(std::cout, args...);
#else
    unused(args...);
#endif
}
47 |
48 | #endif
49 |
--------------------------------------------------------------------------------
/src/opencl_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef OPENCL_PLATFORM_H
2 | #define OPENCL_PLATFORM_H
3 |
4 | #include "platform.h"
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #ifdef __APPLE__
12 | #include
13 | #include
14 | #else
15 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
16 | #include
17 | #include
18 | #endif
19 |
/// OpenCL platform. Has the same number of devices as that of the OpenCL implementation.
class OpenCLPlatform : public Platform {
public:
    OpenCLPlatform(Runtime* runtime);
    ~OpenCLPlatform();

protected:
    // Memory management entry points (see Platform for contracts).
    void* alloc(DeviceId dev, int64_t size) override;
    // Not supported by this backend.
    void* alloc_host(DeviceId, int64_t) override { command_unavailable("alloc_host"); }
    void* alloc_unified(DeviceId, int64_t) override;
    // Not supported by this backend.
    void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); }
    void release(DeviceId dev, void* ptr) override;
    // Not supported by this backend.
    void release_host(DeviceId, void*) override { command_unavailable("release_host"); }

    void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
    void synchronize(DeviceId dev) override;

    void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
    void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
    void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
    // Copy between shared-virtual-memory pointers.
    void copy_svm(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size);
    void dynamic_profile(DeviceId dev, const std::string& filename);

    size_t dev_count() const override { return devices_.size(); }
    std::string name() const override { return "OpenCL"; }
    const char* device_name(DeviceId dev) const override;
    bool device_check_feature_support(DeviceId, const char*) const override { return false; }

    // NOTE(review): map template arguments were lost in this copy of the
    // file; presumably kernel name -> cl_kernel -- verify upstream.
    typedef std::unordered_map KernelMap;

    /// Per-device state: platform/device ids, OpenCL version, context and
    /// queue handles, and program/kernel caches guarded by a spin lock.
    struct DeviceData {
        OpenCLPlatform* parent;
        cl_platform_id platform;
        cl_device_id dev;
        cl_uint version_major;
        cl_uint version_minor;
        std::string platform_name;
        std::string device_name;
        // Created lazily; null until first use.
        cl_command_queue queue = nullptr;
        cl_context ctx = nullptr;
#ifdef CL_VERSION_2_0
        // Shared-virtual-memory capabilities (OpenCL 2.0+ only).
        cl_device_svm_capabilities svm_caps;
#endif
        // Vendor-specific FPGA handling flags.
        bool is_intel_fpga = false;
        bool is_xilinx_fpga = false;

        std::unordered_map programs;
        std::unordered_map kernels;
        std::unordered_map kernels_queue;

        // Atomics do not have a move constructor. This structure introduces one.
        // (Moving yields freshly default-initialized atomics, not the source's
        // values.)
        struct AtomicData {
            std::atomic_int timings_counter {};
            std::atomic_flag lock = ATOMIC_FLAG_INIT;
            AtomicData() = default;
            AtomicData(AtomicData&&) {}
        } atomic_data;

        DeviceData(
            OpenCLPlatform* parent,
            cl_platform_id platform,
            cl_device_id dev,
            cl_uint version_major,
            cl_uint version_minor,
            const std::string& platform_name,
            const std::string& device_name)
            : parent(parent)
            , platform(platform)
            , dev(dev)
            , version_major(version_major)
            , version_minor(version_minor)
            , platform_name(platform_name)
            , device_name(device_name)
        {}
        DeviceData(DeviceData&&) = default;
        DeviceData(const DeviceData&) = delete;

        // Busy-wait spin lock guarding this device's caches.
        void lock() {
            while (atomic_data.lock.test_and_set(std::memory_order_acquire)) ;
        }

        void unlock() {
            atomic_data.lock.clear(std::memory_order_release);
        }
    };

    std::vector devices_;

    cl_kernel load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
    cl_program load_program_binary(DeviceId dev, const std::string& filename, const std::string& program_string) const;
    cl_program load_program_il(DeviceId dev, const std::string& filename, const std::string& program_string) const;
    cl_program load_program_source(DeviceId dev, const std::string& filename, const std::string& program_string) const;
    cl_program compile_program(DeviceId dev, cl_program program, const std::string& filename) const;

    friend void time_kernel_callback(cl_event, cl_int, void*);
};
116 |
117 | #endif
118 |
--------------------------------------------------------------------------------
/src/pal/pal_device.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_DEVICE_DATA_H
2 | #define PAL_DEVICE_DATA_H
3 |
4 | #include "../runtime.h"
5 | #include "pal_utils.h"
6 |
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 |
18 | class PALPlatform;
19 |
20 | class PalDevice {
21 | public:
22 | typedef Pal::gpusize GpuVirtAddr_t;
23 | typedef std::unordered_map KernelMap;
24 |
25 | enum class queue_and_cmd_buffer_type { Compute, Universal, Dma };
26 |
27 | PalDevice(){};
28 | PalDevice(Pal::IDevice* base_device, Runtime* runtime);
29 | PalDevice(const PalDevice&) = delete;
30 | PalDevice(PalDevice&& other)
31 | : runtime_(other.runtime_)
32 | , device_(other.device_)
33 | , cmd_allocator_(other.cmd_allocator_)
34 | , queue_(other.queue_)
35 | , cmd_buffer_(other.cmd_buffer_)
36 | , profiling_timestamps_(other.profiling_timestamps_)
37 | , timestamps_frequency_(other.timestamps_frequency_)
38 | , programs_(std::move(other.programs_))
39 | , kernels_(std::move(other.kernels_))
40 | , memory_objects_(std::move(other.memory_objects_))
41 | , gfx_level(other.gfx_level)
42 | , isa(std::move(other.isa))
43 | , name(std::move(other.name)) {}
44 |
45 | ~PalDevice();
46 |
47 | void lock() {
48 | while (locked_.test_and_set(std::memory_order_acquire))
49 | ;
50 | }
51 |
52 | void unlock() { locked_.clear(std::memory_order_release); }
53 |
54 | Pal::IPipeline* create_pipeline(const void* elf_data, size_t elf_data_size);
55 |
56 | // Allocates memory of the requested size on the requested gpu heap (controls visibility).
57 | // Returns the virtual gpu address of the allocated memory.
58 | GpuVirtAddr_t allocate_gpu_memory(Pal::gpusize size_in_bytes, Pal::GpuHeap heap);
59 |
60 | GpuVirtAddr_t allocate_shared_virtual_memory(Pal::gpusize sizeInBytes);
61 |
62 | void release_gpu_memory(GpuVirtAddr_t virtual_address);
63 | void release_gpu_memory(void* virtual_address) {
64 | release_gpu_memory(reinterpret_cast(virtual_address));
65 | }
66 |
67 | void copy_gpu_data(
68 | const GpuVirtAddr_t source, GpuVirtAddr_t destination, const Pal::MemoryCopyRegion& copy_region);
69 | void copy_gpu_data(const void* source, void* destination, const Pal::MemoryCopyRegion& copy_region) {
70 | copy_gpu_data(reinterpret_cast(source),
71 | reinterpret_cast(destination), copy_region);
72 | }
73 |
74 | void dispatch(const Pal::CmdBufferBuildInfo& cmd_buffer_build_info,
75 | const Pal::PipelineBindParams& pipeline_bind_params, const Pal::BarrierInfo& barrier_info,
76 | const LaunchParams& launch_params);
77 |
78 | void WaitIdle();
79 |
80 | private:
81 | friend PALPlatform;
82 |
83 | Pal::Result init();
84 |
85 | // Creates a PAL queue object and corresponding command buffer object into the given pointers.
86 | Pal::Result init_queue_and_cmd_buffer(queue_and_cmd_buffer_type type, Pal::IQueue*& queue, Pal::ICmdBuffer*& cmd_buffer);
87 |
88 | // Creates a PAL command allocator which is needed to allocate memory for all
89 | // command buffer objects.
90 | Pal::Result init_cmd_allocator();
91 |
92 | Pal::Result allocate_memory(Pal::gpusize size_in_bytes, Pal::GpuHeap heap,
93 | Pal::IGpuMemory** gpu_memory_pp, Pal::gpusize alignment = 256 * 1024);
94 |
95 | // Returns the key to the new map entry. This key is the virtual gpu memory address.
96 | GpuVirtAddr_t track_memory(Pal::IGpuMemory* memory);
97 | void forget_memory(GpuVirtAddr_t gpu_address);
98 | // Returns nullptr if key is not present in memory_objects map.
99 | Pal::IGpuMemory* get_memory_object(const GpuVirtAddr_t gpu_address) const;
100 | Pal::IGpuMemory* get_memory_object(const void* gpu_address) const {
101 | return get_memory_object(reinterpret_cast(gpu_address));
102 | }
103 |
104 | // Build a buffer holding the kernel arguments and upload to the GPU.
105 | // Returns the address of the buffer on the gpu.
106 | GpuVirtAddr_t build_kernargs_buffer(const ParamsArgs& params_args, int num_args, const char* kernel_name);
107 |
108 | // Helper function that allocates a gpu-only buffer of the given size and uploads the data written by the
109 | // write_callback
110 | PalDevice::GpuVirtAddr_t write_data_to_gpu(
111 | Pal::gpusize byte_size, std::function write_callback);
112 |
113 | uint32_t calculate_launch_params_size(const ParamsArgs& params_args, uint32_t num_args);
114 | // Write kernel arguments to memory. Returns the number of bytes occupied by the passed in kernel arguments.
115 | size_t write_launch_params(const ParamsArgs& params_args, uint32_t num_args, void* memory, size_t memory_size);
116 |
117 | private:
118 | Runtime* runtime_ = nullptr;
119 |
120 | Pal::IDevice* device_ = nullptr;
121 | Pal::ICmdAllocator* cmd_allocator_ = nullptr;
122 | Pal::IQueue* queue_ = nullptr;
123 | Pal::ICmdBuffer* cmd_buffer_ = nullptr;
124 |
125 | Pal::IQueue* dma_queue_ = nullptr;
126 | Pal::ICmdBuffer* dma_cmd_buffer_ = nullptr;
127 |
128 | struct ProfilingTimestamps {
129 | uint64_t start;
130 | uint64_t end;
131 | };
132 | Pal::IGpuMemory* profiling_timestamps_ = nullptr;
133 | uint64_t timestamps_frequency_ = 0;
134 |
135 | std::atomic_flag locked_ = ATOMIC_FLAG_INIT;
136 |
137 | std::unordered_map programs_;
138 | std::unordered_map kernels_;
139 |
140 | // Map virtual addresses on the GPU to the PAL objects representing the memory.
141 | // This is needed because AnyDSL assumes it deals with gpu-legal addresses in its API.
142 | // However, to interact with PAL we need to have the wrapper objects at hand.
143 | // The IGpuMemory objects should not be used outside of this class.
144 | std::unordered_map memory_objects_;
145 |
146 | public:
147 | Pal::GfxIpLevel gfx_level;
148 | std::string isa;
149 | std::string name;
150 | };
151 |
152 | #endif
--------------------------------------------------------------------------------
/src/pal/pal_fix_calling_convention_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_fix_calling_convention_pass.h"
2 | #include "pal_utils.h"
3 |
4 | #include
5 |
6 | #include
7 | #include
8 | #include
9 |
10 | using namespace llvm;
11 |
// Recursively forces the AMDGPU_Gfx calling convention on every call reachable
// from `f` and marks each visited function AlwaysInline (per the pass header,
// this avoids the LLVM AMDGPU backend throwing errors on non-inlined calls).
// `traversed_functions` records visited functions so call cycles terminate.
// NOTE(review): `m` is currently unused; indirect calls
// (getCalledFunction() == nullptr) get their convention rewritten but are not
// recursed into; intrinsic calls are rewritten as well — confirm intended.
// NOTE(review): the dyn_cast template argument was lost in this dump
// (presumably dyn_cast<CallInst>).
void fix_calling_conv(Module* m, Function* f, std::unordered_set& traversed_functions) {
    if (traversed_functions.find(f) != traversed_functions.end()) {
        // already visited this function -> prevent recursive loop
        return;
    }

    traversed_functions.insert(f);
    f->addFnAttr(llvm::Attribute::AlwaysInline);

    // Find and inspect all function calls inside of this function
    for (auto& bb : *f) {
        for (auto& instruction : bb) {
            if (CallInst* call_inst = dyn_cast(&instruction)) {
                // Only touch calls that do not already use the target convention.
                if (call_inst->getCallingConv() != CallingConv::AMDGPU_Gfx) {
                    call_inst->setCallingConv(CallingConv::AMDGPU_Gfx);
                }

                // Recurse into direct callees so their calls are fixed up too.
                if (Function* called_function = call_inst->getCalledFunction()) {
                    fix_calling_conv(m, called_function, traversed_functions);
                }
            }
        }
    }
}
36 |
37 | PreservedAnalyses PalPlatformFixCallingConventionPass::run(Module& M, ModuleAnalysisManager&) {
38 | std::unordered_set traversed_functions = {};
39 | for (Function& entrypoint_fn : M) {
40 | fix_calling_conv(&M, &entrypoint_fn, traversed_functions);
41 | }
42 | return PreservedAnalyses::all();
43 | }
44 |
--------------------------------------------------------------------------------
/src/pal/pal_fix_calling_convention_pass.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_PLATFORM_FIX_CALLING_CONVENTION_H
2 | #define PAL_PLATFORM_FIX_CALLING_CONVENTION_H
3 |
4 | #include
5 |
/// This pass sets the calling convention to AMDGPU_Gfx for all calls in the given module and sets the AlwaysInline
/// Attribute on every called function in the module to avoid the LLVM AMDGPU backend throwing errors.
/// NOTE(review): the PassInfoMixin template argument was lost in this dump
/// (presumably PassInfoMixin<PalPlatformFixCallingConventionPass>).
struct PalPlatformFixCallingConventionPass : llvm::PassInfoMixin {
    /// Module pass entry point; see the .cpp for the traversal details.
    llvm::PreservedAnalyses run(llvm::Module& M, llvm::ModuleAnalysisManager&);
};
11 |
12 | #endif // PAL_PLATFORM_FIX_CALLING_CONVENTION_H
--------------------------------------------------------------------------------
/src/pal/pal_insert_halt_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_insert_halt_pass.h"
2 | #include "pal_utils.h"
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include
10 |
11 | using namespace llvm;
12 |
// Inserts "s_sethalt 1" as the very first instruction of the compute-shader
// entry point, but only when the environment variable HALT_IMMEDIATELY equals
// exactly "ON". All other functions (and the default configuration) are untouched.
PreservedAnalyses PalPlatformInsertHaltPass::run(Function& F, FunctionAnalysisManager&) {
    char* halt_immediately = std::getenv("HALT_IMMEDIATELY");
    // Only patch the known entry point, and only when explicitly requested.
    if (F.getName() != pal_utils::ComputeShaderMainFnName || !halt_immediately
        || strcmp(halt_immediately, "ON") != 0) {
        return PreservedAnalyses::all();
    }
    assert(F.getCallingConv() == CallingConv::AMDGPU_CS);
    LLVMContext& Ctx = F.getParent()->getContext();
    BasicBlock& EntryBlock = *F.begin();
    IRBuilder<> Builder(&(*EntryBlock.getFirstInsertionPt()));
    // Empty argument list for the asm call below.
    // NOTE(review): the ArrayRef element type was lost in this dump.
    ArrayRef inline_asm_args;
    // hasSideEffects=true (4th argument) keeps the halt from being optimized away.
    InlineAsm* inline_assembly = InlineAsm::get(
        FunctionType::get(Type::getVoidTy(Ctx), false), "s_sethalt 1", "", true, false, InlineAsm::AD_ATT);
    Builder.CreateCall(inline_assembly, inline_asm_args);
    return PreservedAnalyses::none();
}
--------------------------------------------------------------------------------
/src/pal/pal_insert_halt_pass.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_PLATFORM_INSERT_HALT_PASS_H
2 | #define PAL_PLATFORM_INSERT_HALT_PASS_H
3 |
4 | #include
5 | #include
6 |
/// Pass that inserts RDNA specific assembly to halt a shader as soon as it starts if the environment variable
/// "HALT_IMMEDIATELY" is set to the value "ON"
/// NOTE(review): the PassInfoMixin template argument was lost in this dump
/// (presumably PassInfoMixin<PalPlatformInsertHaltPass>).
struct PalPlatformInsertHaltPass : llvm::PassInfoMixin {
    /// Function pass; only acts on the compute-shader entry point.
    llvm::PreservedAnalyses run(llvm::Function& F, llvm::FunctionAnalysisManager& FAM);
};
12 |
13 | #endif // PAL_PLATFORM_INSERT_HALT_PASS_H
--------------------------------------------------------------------------------
/src/pal/pal_lower_builtins_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_lower_builtins_pass.h"
2 | #include "pal_utils.h"
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 |
17 | using namespace llvm;
18 |
19 | namespace {
20 | // anonymous namespace to avoid name clashes
21 |
// Identifiers for the PAL compute-shader builtins this pass lowers.
// The order MUST match BuiltinNames below (and, for the entries up to nblk_z,
// BuiltinAssemblyInfos); `count` doubles as the "not a builtin" sentinel.
enum Builtins : int8_t {
    workitem_id_x = 0,
    workitem_id_y,
    workitem_id_z,
    workgroup_id_x,
    workgroup_id_y,
    workgroup_id_z,
    nblk_x,
    nblk_y,
    nblk_z,
    // Dynamic builtins (i.e., inlined code based on supplied metadata)
    workgroup_size_x,
    workgroup_size_y,
    workgroup_size_z,
    count
};

// Names of the builtin placeholder functions the frontend emits; indexed by
// the Builtins enum above.
constexpr const char* BuiltinNames[] = {
    "anydsl.amdpal.workitem.id.x",
    "anydsl.amdpal.workitem.id.y",
    "anydsl.amdpal.workitem.id.z",
    "anydsl.amdpal.workgroup.id.x",
    "anydsl.amdpal.workgroup.id.y",
    "anydsl.amdpal.workgroup.id.z",
    "anydsl.amdpal.nblk.x",
    "anydsl.amdpal.nblk.y",
    "anydsl.amdpal.nblk.z",
    // Dynamic builtins (i.e., inlined code based on supplied metadata)
    "anydsl.amdpal.workgroup.size.x",
    "anydsl.amdpal.workgroup.size.y",
    "anydsl.amdpal.workgroup.size.z",
};

// ROBUSTNESS: keep the name table in lockstep with the enum — a mismatch would
// silently mis-identify builtins in GetBuiltinID.
static_assert(sizeof(BuiltinNames) / sizeof(BuiltinNames[0]) == Builtins::count,
    "BuiltinNames must have exactly one entry per Builtins enumerator");
54 |
// Inline-assembly template used to materialize one builtin's value:
// the asm text plus the LLVM inline-asm constraint string describing where
// the result lives (e.g. "={v0}" for a specific VGPR, "=s" for any SGPR).
struct BuiltinAssemblyInfo {
    const char* asmString;
    const char* asmConstraints;
};
59 |
60 | // PAL SGPR layout:
61 | // s0-1: PAL reserved data -> set up by PAL because of pipeline register configuration in PALPlatform
62 | // s2-3: pointer to pal kernel args (for compute shader)
63 | // -> set up by AnyDSL PALPlatform
64 | // s4-5: pointer to NumWorkGroups struct (i.e., nblk)
65 | // s6-12: reserved for future use
66 | // s13-15: work group id x, y, and z -> set up by AnyDSL PALPlatform by supplying pgm_rsrc2
67 | // ENABLE_SGPR_WORKGROUP_ID_ to PAL pipeline setup
68 |
69 | const BuiltinAssemblyInfo BuiltinAssemblyInfos[]{
70 | // workitem_id_x
71 | {
72 | .asmString = "; local thread id x is in v0",
73 | .asmConstraints = "={v0}",
74 | },
75 | // workitem_id_y
76 | {
77 | .asmString = "; local thread id y is in v1",
78 | .asmConstraints = "={v1}",
79 | },
80 | // workitem_id_z
81 | {
82 | .asmString = "; local thread id z is in v2",
83 | .asmConstraints = "={v2}",
84 | },
85 | // workgroup_id_x
86 | {
87 | .asmString = "; workgroup id x is in s13",
88 | .asmConstraints = "={s13}",
89 | },
90 | // workgroup_id_y
91 | {
92 | .asmString = "; workgroup id y is in s14",
93 | .asmConstraints = "={s14}",
94 | },
95 | // workgroup_id_z
96 | {
97 | .asmString = "; workgroup id z is in s15",
98 | .asmConstraints = "={s15}",
99 | },
100 | // nblk_x
101 | {
102 | .asmString = "s_load_dword $0, s[4:5], 0x00",
103 | .asmConstraints = "=s",
104 | },
105 | // nblk_y
106 | {
107 | .asmString = "s_load_dword $0, s[4:5], 0x04",
108 | .asmConstraints = "=s",
109 | },
110 | // nblk_z
111 | {
112 | .asmString = "s_load_dword $0, s[4:5], 0x08",
113 | .asmConstraints = "=s",
114 | },
115 | };
116 |
117 | typedef std::array, static_cast(Builtins::count)> BuiltinsCallInstMap;
118 |
119 | Builtins GetBuiltinID(Function* f) {
120 | const StringRef f_name = f->getName();
121 | for (int8_t i = 0; i < Builtins::count; ++i) {
122 | if (f_name == BuiltinNames[i]) {
123 | return Builtins(i);
124 | }
125 | }
126 | return Builtins::count;
127 | }
128 |
// Looks up the asm template for a builtin.
// NOTE(review): only valid for builtins below workgroup_size_x — the dynamic
// workgroup-size builtins have no table entry and are lowered directly in run()
// (its switch handles them before reaching this function).
// NOTE(review): the static_cast template argument was lost in this dump.
const BuiltinAssemblyInfo& GetAssemblyInfo(Builtins builtinID) {
    return BuiltinAssemblyInfos[static_cast(builtinID)];
}

// True if `f` is one of the known builtin placeholder functions.
bool IsBuiltin(Function* f) { return GetBuiltinID(f) < Builtins::count; }
134 |
// Records into builtins_call_instances (bucketed by builtin id) every call to
// a builtin placeholder reachable from `f` through direct calls to functions
// defined in module `m`. `traversed_functions` terminates traversal on call
// cycles. Indirect calls (no static callee) are not followed.
// NOTE(review): dyn_cast/static_cast template arguments were lost in this dump.
void find_builtins_calls(Module* m, Function* f, BuiltinsCallInstMap& builtins_call_instances,
    std::unordered_set& traversed_functions) {
    if (traversed_functions.find(f) != traversed_functions.end()) {
        // already visited this function -> prevent recursive loop
        return;
    }

    traversed_functions.insert(f);

    // Find and inspect all function calls inside of this function
    for (auto& bb : *f) {
        for (auto& instruction : bb) {
            CallInst* callInst = dyn_cast(&instruction);
            if (!callInst) {
                continue;
            }

            Function* calledFunction = callInst->getCalledFunction();
            if (!calledFunction) {
                continue;
            }

            if (IsBuiltin(calledFunction)) {
                // If the call we found is calling a builtin, record the builtins usage
                Builtins builtinID = GetBuiltinID(calledFunction);
                builtins_call_instances[static_cast(builtinID)].push_back(callInst);
            } else if (calledFunction->getParent() == m) {
                // If the called function is within this module, recursively search it for
                // builtins used
                find_builtins_calls(m, calledFunction, builtins_call_instances, traversed_functions);
            }
        }
    }
}
169 |
170 | Function* find_entrypoint(Module& M) {
171 | for (Function& F : M) {
172 | const auto name = F.getName();
173 | if (name.equals(pal_utils::ComputeShaderMainFnName))
174 | return &F;
175 | }
176 |
177 | return nullptr;
178 | }
179 |
// Function taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
// Returns the first position in BB past the leading run of static allocas, so
// inserted code lands after the alloca block.
// NOTE(review): duplicated verbatim in pal_lower_kernel_arguments_pass.cpp —
// consider hoisting into pal_utils. The dyn_cast template argument was lost in
// this dump (presumably dyn_cast<AllocaInst>).
BasicBlock::iterator getInsertPt(BasicBlock& BB) {
    BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
    for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
        AllocaInst* AI = dyn_cast(&*InsPt);

        // If this is a dynamic alloca, the value may depend on the loaded kernargs,
        // so loads will need to be inserted before it.
        if (!AI || !AI->isStaticAlloca())
            break;
    }

    return InsPt;
}
194 |
// Emits a call to inline assembly producing a single i32 whose location is
// described by `asm_constraint` (e.g. "={s13}" for a fixed SGPR, "=s" for any).
// hasSideEffects=true (4th InlineAsm::get argument) keeps the asm call alive.
CallInst* insert_asm(
    IRBuilder<>& Builder, LLVMContext& Ctx, const char* asm_string, const char* asm_constraint) {
    // Empty argument list for the asm call.
    // NOTE(review): the ArrayRef element type was lost in this dump.
    ArrayRef inline_asm_args;
    InlineAsm* inline_assembly = InlineAsm::get(FunctionType::get(Type::getInt32Ty(Ctx), false), asm_string,
        asm_constraint, true, false, InlineAsm::AD_ATT);
    return Builder.CreateCall(inline_assembly, inline_asm_args);
}
202 |
// Inserts assembly code to split the local thread id from v0 into v0(x), v1(y) and v2(z).
// This is only applicable for GPUs >= gfx 11.
// Emission order matters: v0 holds the packed ids, so x — which overwrites v0 —
// must be extracted last. Each unpack is only emitted if that builtin is used.
void insert_asm_to_split_local_thread_id(IRBuilder<>& Builder, LLVMContext& Ctx,
    const BuiltinsCallInstMap& builtins_call_instances, Pal::GfxIpLevel gfx_level) {
    assert(gfx_level >= Pal::GfxIpLevel::GfxIp11_0);
    // Write local thread id z into v2.
    if (!builtins_call_instances[Builtins::workitem_id_z].empty()) {
        insert_asm(Builder, Ctx,
            "; def v2 local thread id z is in v0[29:20] (v0[31:30] set to 0 by hardware)\n\t"
            "V_LSHRREV_B32 v2 20 v0",
            "={v2}");
    }
    // Write local thread id y into v1.
    if (!builtins_call_instances[Builtins::workitem_id_y].empty()) {
        insert_asm(Builder, Ctx,
            "; def v1 local thread id y is in v0[19:10]\n\t"
            "V_LSHRREV_B32 v1 10 v0\n\t"
            "V_AND_B32 v1 v1 0x3FF",
            "={v1}");
    }
    // Write local thread id x into v0 last to make sure v0 is not overwritten yet.
    if (!builtins_call_instances[Builtins::workitem_id_x].empty()) {
        insert_asm(Builder, Ctx,
            "; def v0 local thread id x is in v0[9:0]\n\t"
            "V_AND_B32 v0 v0 0x3FF",
            "={v0}");
    }
}
231 |
232 | } // namespace
233 |
234 | PreservedAnalyses PalPlatformLowerBuiltinsPass::run(Module& M, ModuleAnalysisManager&) {
235 | Function* entrypoint_fn = find_entrypoint(M);
236 | assert(entrypoint_fn);
237 |
238 | /*
239 | Find all calls to builtins and unique them
240 | -> i.e. every builtin is only called exactly once right at the beginning of the shader.
241 |
242 | for each instruction in entrypoint:
243 | if call to builtin:
244 | record builtin (unique set of used_builtins + all separate calls to them!)
245 | elif call to another function inside this module:
246 | recursively find all calls of used built_ins
247 | else: don't care
248 |
249 | for each used_builtin:
250 | Value* real_builtin = insert inline_asm at beginning of entrypoint
251 | for each call instance of the builtin:
252 | replace all uses of call instance with real_builtin
253 | remove old call instance
254 | */
255 |
256 | BuiltinsCallInstMap builtins_call_instances;
257 | std::unordered_set traversed_functions = {};
258 | find_builtins_calls(&M, entrypoint_fn, builtins_call_instances, traversed_functions);
259 |
260 | LLVMContext& Ctx = M.getContext();
261 | BasicBlock& EntryBlock = *entrypoint_fn->begin();
262 | IRBuilder<> Builder(&*getInsertPt(EntryBlock));
263 |
264 | if (gfx_level_ >= Pal::GfxIpLevel::GfxIp11_0) {
265 | insert_asm_to_split_local_thread_id(Builder, Ctx, builtins_call_instances, gfx_level_);
266 | }
267 |
268 | int builtins_count = static_cast(Builtins::count);
269 | for (int i = 0; i < builtins_count; ++i) {
270 | const Builtins builtin_id = Builtins(i);
271 | const std::vector builtin_call_instances = builtins_call_instances[i];
272 | if (builtin_call_instances.empty()) {
273 | continue;
274 | }
275 |
276 | CallInst* lowered_unique_builtin = nullptr;
277 | switch (builtin_id) {
278 | case Builtins::workgroup_size_x:
279 | lowered_unique_builtin = insert_asm(Builder, Ctx,
280 | ("s_mov_b32 $0, " + std::to_string(tg_dims_[0]) + "; workgroup size x").c_str(), "=s");
281 | break;
282 | case Builtins::workgroup_size_y:
283 | lowered_unique_builtin = insert_asm(Builder, Ctx,
284 | ("s_mov_b32 $0, " + std::to_string(tg_dims_[1]) + "; workgroup size y").c_str(), "=s");
285 | break;
286 | case Builtins::workgroup_size_z:
287 | lowered_unique_builtin = insert_asm(Builder, Ctx,
288 | ("s_mov_b32 $0, " + std::to_string(tg_dims_[2]) + "; workgroup size z").c_str(), "=s");
289 | break;
290 | default:
291 | const auto& assemblyInfo = GetAssemblyInfo(builtin_id);
292 | lowered_unique_builtin =
293 | insert_asm(Builder, Ctx, assemblyInfo.asmString, assemblyInfo.asmConstraints);
294 | }
295 |
296 | for (CallInst* call_to_builtin : builtin_call_instances) {
297 | call_to_builtin->replaceAllUsesWith(lowered_unique_builtin);
298 | }
299 | }
300 |
301 | for (int i = 0; i < static_cast(builtins_count); ++i) {
302 | const std::vector builtin_call_instances = builtins_call_instances[i];
303 | for (CallInst* call_to_builtin : builtin_call_instances) {
304 | call_to_builtin->eraseFromParent();
305 | }
306 | }
307 | // All uncalled functions from the module have to be removed because any kernels other than the one
308 | // marked as entrypoint may contain calls to builtins which have not been resolved by this pass but
309 | // may trip up linkers/relocations. Therefore we set all functions to internal linkage, except the
310 | // known entrypoint. This way, the global dead code elimination pass can remove them for us.
311 | for (Function& F : M) {
312 | if (F.getName().startswith("llvm")) {
313 | // Don't mark llvm intrinsics as internal linkage, otherwise they get
314 | // altered/removed which breaks backend codegen.
315 | continue;
316 | }
317 | F.setLinkage(GlobalValue::LinkageTypes::InternalLinkage);
318 | }
319 | entrypoint_fn->setLinkage(GlobalValue::LinkageTypes::ExternalLinkage);
320 |
321 | return PreservedAnalyses::none();
322 | }
--------------------------------------------------------------------------------
/src/pal/pal_lower_builtins_pass.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_PLATFORM_LOWER_BUILTINS_H
2 | #define PAL_PLATFORM_LOWER_BUILTINS_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | #include
11 | #include
12 |
/// This pass takes care of replacing calls to so-called "builtins" (i.e. local/global thread indices, and similar
/// compute shader builtin values) with the appropriate amdgpu inline assembly that extracts the values from
/// prepopulated SGPRs according to the RDNA2 or RDNA3 ABI. This pass only supports gfx-levels 10 and 11.
/// NOTE(review): the PassInfoMixin and std::array template arguments were lost
/// in this dump (presumably PassInfoMixin<PalPlatformLowerBuiltinsPass> and an
/// array of three workgroup dimensions) — confirm against the original header.
struct PalPlatformLowerBuiltinsPass : llvm::PassInfoMixin {
    /// gfx_level selects the lowering strategy (gfx11 packs the local thread ids
    /// into v0 and needs an unpack step); tg_dims supplies the compile-time
    /// workgroup sizes baked into the workgroup_size_* builtins.
    PalPlatformLowerBuiltinsPass(
        Pal::GfxIpLevel gfx_level, std::array tg_dims)
        : gfx_level_(gfx_level)
        , tg_dims_(tg_dims) {}

    // Explicitly defaulted special members (rule-of-zero would also suffice).
    PalPlatformLowerBuiltinsPass(const PalPlatformLowerBuiltinsPass& other) = default;
    PalPlatformLowerBuiltinsPass& operator=(const PalPlatformLowerBuiltinsPass& other) = default;
    PalPlatformLowerBuiltinsPass(PalPlatformLowerBuiltinsPass&& other) = default;
    PalPlatformLowerBuiltinsPass& operator=(PalPlatformLowerBuiltinsPass&& other) = default;
    ~PalPlatformLowerBuiltinsPass() = default;

    /// Module pass entry point; see the .cpp for details.
    llvm::PreservedAnalyses run(llvm::Module& M, llvm::ModuleAnalysisManager&);

private:
    Pal::GfxIpLevel gfx_level_;
    std::array tg_dims_;
};
34 |
35 | #endif // PAL_PLATFORM_LOWER_BUILTINS_H
--------------------------------------------------------------------------------
/src/pal/pal_lower_kernel_arguments_pass.cpp:
--------------------------------------------------------------------------------
1 | #include "pal_lower_kernel_arguments_pass.h"
2 | #include "pal_utils.h"
3 |
4 | #include
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 |
14 | using namespace llvm;
15 |
16 | namespace {
17 | // anonymous namespace to avoid name clashes
18 |
// Function taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
// Returns the first position in BB past the leading run of static allocas, so
// inserted code lands after the alloca block.
// NOTE(review): duplicated verbatim in pal_lower_builtins_pass.cpp — consider
// hoisting into pal_utils. The dyn_cast template argument was lost in this
// dump (presumably dyn_cast<AllocaInst>).
BasicBlock::iterator getInsertPt(BasicBlock& BB) {
    BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
    for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
        AllocaInst* AI = dyn_cast(&*InsPt);

        // If this is a dynamic alloca, the value may depend on the loaded kernargs,
        // so loads will need to be inserted before it.
        if (!AI || !AI->isStaticAlloca())
            break;
    }

    return InsPt;
}
33 |
34 | // Function based on AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, Align &MaxAlign)
35 | // Taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
36 | uint64_t getExplicitKernArgSize(const Function& F, Align& MaxAlign) {
37 | assert(F.getCallingConv() == CallingConv::AMDGPU_CS);
38 |
39 | const DataLayout& DL = F.getParent()->getDataLayout();
40 | uint64_t ExplicitArgBytes = 0;
41 | MaxAlign = Align(1);
42 |
43 | for (const Argument& Arg : F.args()) {
44 | const bool IsByRef = Arg.hasByRefAttr();
45 | Type* ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
46 | MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
47 | Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
48 |
49 | uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
50 | ExplicitArgBytes = alignTo(ExplicitArgBytes, ABITypeAlign) + AllocSize;
51 | MaxAlign = std::max(MaxAlign, ABITypeAlign);
52 | }
53 |
54 | return ExplicitArgBytes;
55 | }
56 |
57 | // Function based on AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, Align &MaxAlign)
58 | // Taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
59 | unsigned getKernArgSegmentSize(const Function& F, Align& MaxAlign) {
60 | uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
61 | unsigned ExplicitOffset = 0;
62 | // Being able to dereference past the end is useful for emitting scalar loads.
63 | return alignTo(ExplicitOffset + ExplicitArgBytes, 4);
64 | }
65 | } // namespace
66 |
// Largely based on the function AMDGPULowerKernelArguments::runOnFunction(Function &F)
// taken from llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
// Minor adaptations added to satisfy the AnyDSL PALPlatform requirements.
//
// Replaces every use of the entry point's formal arguments with loads from the
// kernel-argument buffer whose base pointer is materialized via inline asm
// from s[2:3] (prepopulated by the PALPlatform). Non-entry functions and
// argument-less entry points are left untouched.
// NOTE(review): several template arguments (dyn_cast<...>, isa<...>,
// ArrayRef<...>) were lost in this dump; the code below is kept byte-identical.
PreservedAnalyses PalPlatformLowerKernelArgumentsPass::run(Function& F, FunctionAnalysisManager&) {
    const auto& funcname = F.getName();
    if (funcname != pal_utils::ComputeShaderMainFnName || F.arg_empty()) {
        // Only the entry point function's parameters are kernel arguments that need to be lowered.
        return PreservedAnalyses::all();
    }
    assert(F.getCallingConv() == CallingConv::AMDGPU_CS);

    LLVMContext& Ctx = F.getParent()->getContext();
    const DataLayout& DL = F.getParent()->getDataLayout();
    BasicBlock& EntryBlock = *F.begin();
    IRBuilder<> Builder(&*getInsertPt(EntryBlock));

    const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
    const uint64_t BaseOffset = 0; // We don't have any data preceding the kernel arguments

    Align MaxAlign;
    // TODO: We have to extract that from the Function arguments ourselves!
    const uint64_t TotalKernArgSize = getKernArgSegmentSize(F, MaxAlign);
    if (TotalKernArgSize == 0)
        return PreservedAnalyses::all();

    // Generate Our own ISA to get the pointer to the buffer containing the kernel arguments
    // PALPlatform ensures that registers s[2:3] contain this address when the kernel starts execution
    std::string asmString = std::string("; def $0 pointer to buffer containing the kernel args is set up in s[2:3]");
    // Constraints reference: https://llvm.org/docs/LangRef.html#inline-asm-constraint-string
    // This constraint states that our inline assembly returns ("="-prefix indicates constraint for output)
    // its result in sgprs 2-3
    StringRef constraints = "={s[2:3]}";
    ArrayRef inline_asm_args = std::nullopt;

    // Value taken from AMDGPU.h (namespace AMDGPUAS)
    // global address space pointing to memory that won't change during execution
    unsigned CONSTANT_ADDRESS = 4;
    InlineAsm* inline_assembly =
        InlineAsm::get(FunctionType::get(Type::getInt8PtrTy(Ctx, CONSTANT_ADDRESS), false), asmString.c_str(),
            constraints, true, false, InlineAsm::AD_ATT);
    CallInst* KernArgSegment = Builder.CreateCall(inline_assembly, inline_asm_args);

    // Tell optimizers the segment pointer is non-null and fully dereferenceable.
    KernArgSegment->addRetAttr(Attribute::NonNull);
    KernArgSegment->addRetAttr(Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
    unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();

    uint64_t ExplicitArgOffset = 0;

    // Walk the formal arguments, computing each one's offset within the
    // kernarg buffer (mirroring getExplicitKernArgSize's layout) and replacing
    // its uses with a load from that offset.
    for (Argument& Arg : F.args()) {
        const bool IsByRef = Arg.hasByRefAttr();
        Type* ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
        MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
        Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

        uint64_t Size = DL.getTypeSizeInBits(ArgTy);
        uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

        uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
        ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

        // Offset bookkeeping above must still run for unused args; only the
        // load emission is skipped.
        if (Arg.use_empty())
            continue;

        // If this is byval, the loads are already explicit in the function. We just
        // need to rewrite the pointer values.
        if (IsByRef) {
            Value* ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
                Builder.getInt8Ty(), KernArgSegment, EltOffset, Arg.getName() + ".byval.kernarg.offset");

            Value* CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(ArgOffsetPtr, Arg.getType());
            Arg.replaceAllUsesWith(CastOffsetPtr);
            continue;
        }

        if (PointerType* PT = dyn_cast(ArgTy)) {
            // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
            // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
            // can't represent this with range metadata because it's only allowed for
            // integer types.

            // Values taken from AMDGPU.h (namespace AMDGPUAS)
            const unsigned REGION_ADDRESS = 2; ///< Address space for region memory. (GDS)
            const unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory.
            if ((PT->getAddressSpace() == LOCAL_ADDRESS || PT->getAddressSpace() == REGION_ADDRESS))
                continue;

            // FIXME: We can replace this with equivalent alias.scope/noalias
            // metadata, but this appears to be a lot of work.
            if (Arg.hasNoAliasAttr())
                continue;
        }

        auto* VT = dyn_cast(ArgTy);
        bool IsV3 = VT && VT->getNumElements() == 3;
        bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

        VectorType* V4Ty = nullptr;

        int64_t AlignDownOffset = alignDown(EltOffset, 4);
        int64_t OffsetDiff = EltOffset - AlignDownOffset;
        Align AdjustedAlign = commonAlignment(KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);

        Value* ArgPtr;
        Type* AdjustedArgTy;
        if (DoShiftOpt) { // FIXME: Handle aggregate types
            // Since we don't have sub-dword scalar loads, avoid doing an extload by
            // loading earlier than the argument address, and extracting the relevant
            // bits.
            //
            // Additionally widen any sub-dword load to i32 even if suitably aligned,
            // so that CSE between different argument loads works easily.
            ArgPtr = Builder.CreateConstInBoundsGEP1_64(Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
                Arg.getName() + ".kernarg.offset.align.down");
            AdjustedArgTy = Builder.getInt32Ty();
        } else {
            ArgPtr = Builder.CreateConstInBoundsGEP1_64(
                Builder.getInt8Ty(), KernArgSegment, EltOffset, Arg.getName() + ".kernarg.offset");
            AdjustedArgTy = ArgTy;
        }

        if (IsV3 && Size >= 32) {
            V4Ty = FixedVectorType::get(VT->getElementType(), 4);
            // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
            AdjustedArgTy = V4Ty;
        }

        ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS), ArgPtr->getName() + ".cast");
        LoadInst* Load = Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
        Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

        MDBuilder MDB(Ctx);

        // Propagate pointer-argument attributes (nonnull, dereferenceable,
        // alignment) onto the load as metadata.
        if (isa(ArgTy)) {
            if (Arg.hasNonNullAttr())
                Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

            uint64_t DerefBytes = Arg.getDereferenceableBytes();
            if (DerefBytes != 0) {
                Load->setMetadata(LLVMContext::MD_dereferenceable,
                    MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
            }

            uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
            if (DerefOrNullBytes != 0) {
                Load->setMetadata(LLVMContext::MD_dereferenceable_or_null,
                    MDNode::get(
                        Ctx, MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), DerefOrNullBytes))));
            }

            auto ParamMaybeAlign = Arg.getParamAlign();
            if (ParamMaybeAlign.has_value()) {
                Load->setMetadata(LLVMContext::MD_align,
                    MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                         Builder.getInt64Ty(), ParamMaybeAlign.valueOrOne().value()))));
            }
        }

        // TODO: Convert noalias arg to !noalias

        if (DoShiftOpt) {
            // Shift the over-wide i32 load right and truncate to recover the
            // sub-dword value, then bitcast back to the declared argument type.
            Value* ExtractBits = OffsetDiff == 0 ? Load : Builder.CreateLShr(Load, OffsetDiff * 8);

            IntegerType* ArgIntTy = Builder.getIntNTy(Size);
            Value* Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
            Value* NewVal = Builder.CreateBitCast(Trunc, ArgTy, Arg.getName() + ".load");
            Arg.replaceAllUsesWith(NewVal);
        } else if (IsV3) {
            // Shrink the padded v4 load back down to the declared v3 value.
            Value* Shuf = Builder.CreateShuffleVector(Load, ArrayRef{0, 1, 2}, Arg.getName() + ".load");
            Arg.replaceAllUsesWith(Shuf);
        } else {
            Load->setName(Arg.getName() + ".load");
            Arg.replaceAllUsesWith(Load);
        }
    }

    KernArgSegment->addRetAttr(Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

    return PreservedAnalyses::none();
}
--------------------------------------------------------------------------------
/src/pal/pal_lower_kernel_arguments_pass.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_PLATFORM_LOWER_KERNEL_ARGUMENTS_H
2 | #define PAL_PLATFORM_LOWER_KERNEL_ARGUMENTS_H
3 |
4 | #include
5 |
6 | /// This pass replaces accesses to kernel arguments with loads from offsets from a manually supplied buffer
7 | /// containing these arguments. The pointer to this buffer is expected to be prepopulated into specific sgprs
8 | /// by the PALPlatform.
9 | ///
10 | /// This pass is an almost 1:1 replicate of the AMDGPULowerKernelArguments pass
11 | /// (llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp)
12 | struct PalPlatformLowerKernelArgumentsPass : llvm::PassInfoMixin {
13 | PalPlatformLowerKernelArgumentsPass(){}
14 |
15 | PalPlatformLowerKernelArgumentsPass(const PalPlatformLowerKernelArgumentsPass& other) = default;
16 | PalPlatformLowerKernelArgumentsPass& operator=(const PalPlatformLowerKernelArgumentsPass& other) = default;
17 | PalPlatformLowerKernelArgumentsPass(PalPlatformLowerKernelArgumentsPass&& other) = default;
18 | PalPlatformLowerKernelArgumentsPass& operator=(PalPlatformLowerKernelArgumentsPass&& other) = default;
19 | ~PalPlatformLowerKernelArgumentsPass() = default;
20 |
21 | llvm::PreservedAnalyses run(llvm::Function& F, llvm::FunctionAnalysisManager& FAM);
22 | };
23 |
24 | #endif // PAL_PLATFORM_LOWER_KERNEL_ARGUMENTS_H
--------------------------------------------------------------------------------
/src/pal/pal_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_UTILS_H
2 | #define PAL_UTILS_H
3 |
4 | #include
5 | #include
6 |
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 |
14 | namespace pal_utils {
15 |
16 | std::string llvm_diagnostic_to_string(const llvm::SMDiagnostic& diagnostic_err);
17 |
18 | struct ShaderSrc {
19 | const std::string kernelname;
20 | const std::string src_code;
21 | const std::string filename;
22 | const llvm::Function* function;
23 | llvm::LLVMContext llvm_context;
24 | std::unique_ptr llvm_module;
25 | llvm::SMDiagnostic diagnostic_err;
26 |
27 | ShaderSrc(const std::string& filename, const std::string& src_code, const std::string& kernelname);
28 | bool rename_entry_point();
29 | };
30 |
31 | // Create the metadata that PAL expects to be attached to a kernel/shader binary.
32 | llvm::msgpack::Document build_metadata(const ShaderSrc& shader_src, Pal::GfxIpLevel gfx_level,
33 | const std::array& thread_group_dimensions, uint32_t wavefront_size);
34 |
35 | const char* get_gpu_name(const Pal::AsicRevision asic_revision);
36 |
37 | const char* get_gfx_isa_id(const Pal::GfxIpLevel gfxip_level);
38 |
39 | bool isAMDGPUEntryFunctionCC(llvm::CallingConv::ID CC);
40 |
41 | void write_to_memory(
42 | Pal::IGpuMemory* dst_memory, int64_t dst_memory_offset, const void* src_data, int64_t size);
43 | void read_from_memory(void* dst_buffer, Pal::IGpuMemory* src_memory, int64_t src_memory_offset, int64_t size);
44 |
45 | // Returns a gpu-local memory heap that fits memory_size.
46 | // Order of importance: 1.GpuHeapInvisible, 2.GpuHeapLocal
47 | // Returns Pal::GpuHeap::GpuHeapCount if no appropriate heap can be found.
48 | Pal::GpuHeap find_gpu_local_heap(const Pal::IDevice* device, Pal::gpusize memory_size);
49 |
50 | bool allocation_is_host_visible(Pal::IGpuMemory* gpu_allocation);
51 |
52 | llvm::MDNode* get_metadata_mdnode(const llvm::Function* func, const char* key, int index = 0);
53 | llvm::StringRef get_metadata_string(const llvm::Function* func, const char* key);
54 | uint64_t get_metadata_uint(const llvm::Function* func, const char* key, int index = 0);
55 |
56 | extern const char* ComputeShaderMainFnName;
57 |
58 | } // namespace pal_utils
59 |
60 | #define CHECK_PAL(err, name) { if (err != Pal::Result::Success) { error("PAL API function % [file %, line %]: %", name, __FILE__, __LINE__, static_cast(err)); } }
61 |
62 | #endif
63 |
--------------------------------------------------------------------------------
/src/pal_platform.h:
--------------------------------------------------------------------------------
1 | #ifndef PAL_PLATFORM_H
2 | #define PAL_PLATFORM_H
3 |
4 | #include "pal/pal_device.h"
5 | #include "pal/pal_utils.h"
6 | #include "platform.h"
7 | #include "runtime.h"
8 |
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 |
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include
20 |
21 | #ifdef AnyDSL_runtime_HAS_LLVM_SUPPORT
22 | #include
23 | #endif
24 |
25 | class PALPlatform : public Platform {
26 | public:
27 | PALPlatform(Runtime* runtime);
28 | ~PALPlatform();
29 |
30 | protected:
31 | void* alloc(DeviceId dev, int64_t size) override;
32 | void* alloc_host(DeviceId dev, int64_t size) override;
33 | void* alloc_unified(DeviceId dev, int64_t size) override;
34 | void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); }
35 | void release(DeviceId dev, void* ptr) override;
36 | void release_host(DeviceId dev, void* ptr) override;
37 |
38 | void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
39 |
40 | void synchronize(DeviceId dev) override;
41 |
42 | void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst,
43 | int64_t size) override;
44 | void copy_from_host(
45 | const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override;
46 | void copy_to_host(
47 | DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
48 |
49 | size_t dev_count() const override { return devices_.size(); }
50 | std::string name() const override { return "PAL"; }
51 | const char* device_name(DeviceId dev) const override;
52 | bool device_check_feature_support(DeviceId, const char*) const override { return false; }
53 |
54 | Pal::IPipeline* load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);
55 | std::string compile_gcn(DeviceId dev, pal_utils::ShaderSrc&& shader_src) const;
56 | std::string emit_gcn(pal_utils::ShaderSrc&& shader_src, const std::string& cpu,
57 | Pal::GfxIpLevel gfx_level, llvm::OptimizationLevel opt) const;
58 |
59 | protected:
60 | Pal::IPlatform* platform_;
61 | std::vector