├── CMakeLists.txt ├── CONTRIBUTORS.md ├── INSTALL.md ├── LICENSE ├── README.md ├── cmake └── common.cmake ├── env.sh ├── resource ├── Makefile ├── README.md ├── alexnet │ ├── alexnet.be.cu │ ├── alexnet.cu │ ├── alexnet.json │ ├── alexnet.profile.json │ └── alexnet.trans.cu ├── bert │ ├── bert.be.cu │ ├── bert.cu │ ├── bert.json │ ├── bert.profile.json │ └── bert.trans.cu ├── densenet │ ├── densenet.be.cu │ ├── densenet.cu │ ├── densenet.json │ ├── densenet.profile.json │ └── densenet.trans.cu ├── inception │ ├── inception.be.cu │ ├── inception.cu │ ├── inception.json │ ├── inception.profile.json │ └── inception.trans.cu ├── mobilenet │ ├── mobilenet.be.cu │ ├── mobilenet.cu │ ├── mobilenet.json │ ├── mobilenet.profile.json │ └── mobilenet.trans.cu ├── mocked_kernel │ ├── mocked_kernel.be.cu │ ├── mocked_kernel.cu │ ├── mocked_kernel.json │ ├── mocked_kernel.profile.json │ └── mocked_kernel.trans.cu ├── resnet │ ├── resnet.be.cu │ ├── resnet.cu │ ├── resnet.json │ ├── resnet.profile.json │ └── resnet.trans.cu ├── resnet152 │ ├── resnet152.be.cu │ ├── resnet152.cu │ ├── resnet152.json │ ├── resnet152.profile.json │ └── resnet152.trans.cu ├── resnet18 │ ├── resnet18.be.cu │ ├── resnet18.cu │ ├── resnet18.json │ ├── resnet18.param │ ├── resnet18.profile.json │ └── resnet18.trans.cu └── vgg │ ├── vgg.be.cu │ ├── vgg.cu │ ├── vgg.json │ ├── vgg.profile.json │ └── vgg.trans.cu ├── script ├── best_effort_kernel.py ├── estimate_max_throughput.py ├── estimate_resource_usage.py ├── generate_asm_loop.py ├── generate_final_schedule.py ├── generate_register_hint.py ├── generate_shared_memory_usage.py ├── get_kernel_descriptor.py ├── get_kernel_occupancy.py ├── replace_raw_occupancy.py ├── replace_register_usage.py ├── transform_kernel.py └── tvm_generate_model.py └── src ├── example ├── rpc_client.cpp ├── rpc_client_cont.cpp └── rpc_server.cpp └── reef ├── client ├── client.cpp └── client.h ├── executor ├── executor_base.cpp ├── executor_base.h ├── hip │ ├── hip_impl.cpp │ └── hip_impl.h ├── hybrid_executor.cpp ├── hybrid_executor.h ├── model.cpp ├── model.h ├── trans_executor.cpp └── trans_executor.h ├── protos └── reef.proto ├── rpc └── placeholder ├── server ├── scheduler.cpp ├── scheduler.h ├── server.cpp └── server.h ├── test └── test.cpp └── util ├── common.h ├── json.cpp ├── json.h ├── shared_memory.cpp ├── shared_memory.h └── threadsafe_queue.h /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(reef) 2 | cmake_minimum_required(VERSION 3.2) 3 | 4 | add_compile_options(-std=c++11) 5 | add_definitions(-D__REEF_HIP_GPU__) 6 | add_definitions(-DRESOURCE_DIR="${CMAKE_CURRENT_LIST_DIR}/resource") 7 | 8 | 9 | # GRPC and Protocol Buffers libraries location 10 | list(APPEND CMAKE_PREFIX_PATH "/opt/rocm") 11 | 12 | # Cmake find modules 13 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") 14 | 15 | # For grpc 16 | 17 | include("${CMAKE_CURRENT_LIST_DIR}/cmake/common.cmake") 18 | 19 | get_filename_component(reef_proto "${CMAKE_CURRENT_LIST_DIR}/src/reef/protos/reef.proto" ABSOLUTE) 20 | get_filename_component(reef_proto_path "${reef_proto}" PATH) 21 | set(GRPC_GENERATE_DIR "${PROJECT_SOURCE_DIR}/src/reef/rpc" ) 22 | set(reef_proto_srcs "${GRPC_GENERATE_DIR}/reef.pb.cc") 23 | set(reef_proto_hdrs "${GRPC_GENERATE_DIR}/reef.pb.h") 24 | set(reef_grpc_srcs "${GRPC_GENERATE_DIR}/reef.grpc.pb.cc") 25 | set(reef_grpc_hdrs "${GRPC_GENERATE_DIR}/reef.grpc.pb.h") 26 | add_custom_command( 27 | OUTPUT "${reef_proto_srcs}" "${reef_proto_hdrs}" 
"${reef_grpc_srcs}" "${reef_grpc_hdrs}" 28 | COMMAND ${_PROTOBUF_PROTOC} 29 | ARGS --grpc_out "${GRPC_GENERATE_DIR}" 30 | --cpp_out "${GRPC_GENERATE_DIR}" 31 | -I "${reef_proto_path}" 32 | --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}" 33 | "${reef_proto}" 34 | DEPENDS "${reef_proto}") 35 | 36 | include_directories("${GRPC_GENERATE_DIR}") 37 | 38 | # reef_grpc_proto 39 | add_library(reef_grpc_proto 40 | ${reef_grpc_srcs} 41 | ${reef_grpc_hdrs} 42 | ${reef_proto_srcs} 43 | ${reef_proto_hdrs}) 44 | target_link_libraries(reef_grpc_proto 45 | ${_REFLECTION} 46 | ${_GRPC_GRPCPP} 47 | ${_PROTOBUF_LIBPROTOBUF}) 48 | 49 | 50 | # REEF codes 51 | 52 | find_package(hip REQUIRED) 53 | 54 | find_package(GTest REQUIRED) 55 | 56 | find_package(glog REQUIRED) 57 | 58 | SET(CMAKE_CXX_COMPILER "/opt/rocm/bin/hipcc") 59 | 60 | 61 | set(CMAKE_CXX_FLAGS "-g -O0 ${CMAKE_CXX_FLAGS}") 62 | 63 | include_directories(${HIP_INCLUDE_DIRS}) 64 | 65 | include_directories("${PROJECT_SOURCE_DIR}/src") 66 | 67 | 68 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/client" client) 69 | 70 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/executor" executor) 71 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/executor/hip" hip_impl) 72 | 73 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/server" server) 74 | 75 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/util" util) 76 | 77 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/test" test) 78 | 79 | add_library(reef_util 80 | ${util} 81 | ) 82 | 83 | add_library(reef_server_lib 84 | ${server} 85 | ${executor} 86 | ${hip_impl} 87 | ) 88 | 89 | add_library(reef_client_lib 90 | ${client} 91 | ) 92 | 93 | add_executable(unit_test 94 | ${test} 95 | ) 96 | 97 | target_link_libraries(unit_test 98 | reef_util 99 | reef_server_lib 100 | reef_client_lib 101 | reef_grpc_proto 102 | glog::glog 103 | ${GTEST_BOTH_LIBRARIES} 104 | ${_REFLECTION} 105 | ${_GRPC_GRPCPP} 106 | ${_PROTOBUF_LIBPROTOBUF} 107 | pthread 108 | ) 109 | 110 | function (add_executable_app app_name app_path) 111 | add_executable(${app_name} 112 | ${app_path} 113 | ) 114 | target_link_libraries(${app_name} 115 | reef_util 116 | reef_server_lib 117 | reef_client_lib 118 | reef_grpc_proto 119 | glog::glog 120 | ${GTEST_BOTH_LIBRARIES} 121 | ${_REFLECTION} 122 | ${_GRPC_GRPCPP} 123 | ${_PROTOBUF_LIBPROTOBUF} 124 | pthread 125 | ) 126 | endfunction() 127 | 128 | add_executable_app(reef_client "${PROJECT_SOURCE_DIR}/src/example/rpc_client.cpp") 129 | add_executable_app(reef_client_cont "${PROJECT_SOURCE_DIR}/src/example/rpc_client_cont.cpp") 130 | add_executable_app(reef_server "${PROJECT_SOURCE_DIR}/src/example/rpc_server.cpp") 131 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # REEF Contributors 2 | 3 | **Mingcong Han**, mingconghan@sjtu.edu.cn, *Institute of Parallel and Distributed Systems, SEIEE, Shanghai Jiao Tong University; Shanghai AI Laboratory;* 4 | 5 | **Hanze Zhang**, hanzezhang@sjtu.edu.cn, *Institute of Parallel and Distributed Systems, SEIEE, Shanghai Jiao Tong University; Shanghai AI Laboratory;* 6 | 7 | **Rong Chen**, rongchen@sjtu.edu.cn, *Institute of Parallel and Distributed Systems, SEIEE, Shanghai Jiao Tong University; Shanghai AI Laboratory;* 8 | 9 | -------------------------------------------------------------------------------- /INSTALL.md: 
-------------------------------------------------------------------------------- 1 | # REEF Installation 2 | 3 | ## Software Version 4 | * Ubuntu 18.04 5 | * ROCm 4.3.0 6 | * CMake > 3.18 7 | * grpc 1.45 8 | * glog 0.6.0 9 | * googletest 1.11.0 10 | 11 | ## Installation Overview 12 | 13 | The installation has six major steps: 14 | 1. Install ROCm-4.3 15 | 2. Install the customized GPU kernel driver (for reset-based preemption), and reboot 16 | 3. (Recommended, but optional) Create the ROCm docker container 17 | 4. Install the customized GPU runtime (hip, rocclr) 18 | 5. Install other software dependencies (e.g., grpc) 19 | 6. Build REEF 20 | 21 | The customized GPU kernel driver and GPU runtime can be found [here](https://github.com/SJTU-IPADS/reef-artifacts/tree/master/reef-env). 22 | 23 | ## Install Dependencies 24 | 25 | ### Install ROCm-4.3 26 | ```sh 27 | # Ensure the system is up to date. 28 | $ sudo apt update 29 | $ sudo apt dist-upgrade 30 | $ sudo apt install libnuma-dev 31 | $ sudo reboot 32 | 33 | # Add the ROCm apt repository. 34 | $ sudo apt install wget gnupg2 35 | $ wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add - 36 | $ echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.3/ ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list 37 | $ sudo apt update 38 | 39 | # Install the ROCm package and reboot. 40 | $ sudo apt install rocm-dkms && sudo reboot 41 | 42 | # Add ROCm binaries to PATH. 43 | $ echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/rocprofiler/bin:/opt/rocm/opencl/bin' | sudo tee -a /etc/profile.d/rocm.sh 44 | ``` 45 | 46 | 47 | ### Build & Install the Customized Kernel Driver 48 | ```sh 49 | $ git clone https://github.com/SJTU-IPADS/reef-artifacts.git 50 | $ cd reef-artifacts/reef-env/amdgpu-dkms 51 | # Notice: The script will reboot the machine. 52 | $ ./update-kern-module.sh 53 | ``` 54 | 55 | ### Build & Install rocclr 56 | ```sh 57 | # in reef-artifacts/reef-env 58 | $ export REEF_ENV_ROOT=`pwd` 59 | $ cd rocclr 60 | $ mkdir build 61 | $ cd build 62 | $ cmake -DOPENCL_DIR="${REEF_ENV_ROOT}/ROCm-OpenCL-Runtime" -DCMAKE_INSTALL_PREFIX=/opt/rocm/rocclr .. 63 | $ sudo make install 64 | ``` 65 | 66 | ### Build & Install hip 67 | ```sh 68 | # in reef-artifacts/reef-env 69 | $ export REEF_ENV_ROOT=`pwd` 70 | $ cd hip 71 | $ mkdir build 72 | $ cd build 73 | $ cmake -DCMAKE_PREFIX_PATH="${REEF_ENV_ROOT}/rocclr/build;/opt/rocm/hip" .. 74 | $ sudo make install 75 | ``` 76 | 77 | ### Install CMake 78 | ```sh 79 | $ wget https://github.com/Kitware/CMake/releases/download/v3.22.4/cmake-3.22.4-linux-x86_64.sh 80 | $ sh cmake-3.22.4-linux-x86_64.sh 81 | # You can also add this CMake version to your PATH in ~/.bashrc 82 | $ export PATH=~/cmake-3.22.4-linux-x86_64/bin:$PATH 83 | $ cmake --version 84 | cmake version 3.22.4 85 | ``` 86 | 87 | ### Install glog 88 | ```sh 89 | $ git clone https://github.com/google/glog 90 | $ cd glog 91 | $ mkdir build; cd build 92 | $ cmake .. 93 | $ sudo make install 94 | ``` 95 | 96 | ### Install gtest 97 | ```sh 98 | $ git clone -b release-1.11.0 https://github.com/google/googletest 99 | $ cd googletest 100 | $ mkdir build; cd build 101 | $ cmake .. 102 | $ sudo make install 103 | ``` 104 | 105 | ### Install grpc + protobuf 106 | ```sh 107 | $ git clone -b v1.45.0 https://github.com/grpc/grpc 108 | $ cd grpc 109 | $ git submodule update --init 110 | $ mkdir -p cmake/build; cd cmake/build 111 | $ cmake ../..
112 | $ sudo make install 113 | ``` 114 | 115 | 116 | ## Build REEF 117 | 118 | ### Build Resource 119 | This step compiles the DNN models' device code. 120 | ```sh 121 | $ cd resource 122 | $ make 123 | ``` 124 | 125 | ### Build REEF 126 | ```sh 127 | $ mkdir build; cd build 128 | $ cmake .. 129 | $ make -j4 130 | ``` 131 | 132 | ### Run tests 133 | ```sh 134 | # in ./build 135 | $ ./unit_test 136 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # REEF - Real-time GPU-accelerated DNN Inference Scheduling System 2 | 3 | REEF is a real-time GPU-accelerated DNN inference scheduling system that supports instant kernel preemption and controlled concurrent execution in GPU scheduling. 
4 | 5 | ## Table of Contents 6 | 7 | - [Introduction](#introduction) 8 | - [Paper](#paper) 9 | - [REEF Example](#reef-example) 10 | - [Project Structure](#project-structure) 11 | - [Hardware Requirement](#hardware-requirement) 12 | - [Installation](#installation) 13 | - [Artifact Evaluation](#artifact-evaluation) 14 | 15 | 16 | ## Introduction 17 | 18 | REEF is a real-time GPU-accelerated DNN inference scheduling system. 19 | REEF divides DNN inference tasks into two priorities: *real-time tasks (RT tasks)* and *best-effort tasks (BE tasks)*. 20 | The scheduling goal of REEF is to minimize the latency of RT tasks while improving overall throughput as much as possible. 21 | 22 | REEF achieves this goal with two key techniques: 23 | 24 | * *Reset-based Preemption:* BE tasks can be preempted within a few microseconds once an RT task arrives. The preemption is achieved by killing 25 | the running BE kernels and clearing the queued BE kernels, which is based on the *idempotence* of DNN inference kernels. 26 | 27 | * *Dynamic Kernel Padding (DKP):* BE tasks can be co-executed with the RT task using only the CUs left over by the RT kernels. This approach improves throughput and avoids starvation of BE tasks with minimal latency overhead on RT tasks. 28 | 29 | ## REEF Example 30 | 31 | After [building REEF](INSTALL.md), the example below shows how REEF behaves when there are concurrent tasks (one RT and multiple BEs). 32 | 33 | First, start a REEF server. 34 | ```bash 35 | # in ./build 36 | $ ./reef_server 37 | ``` 38 | 39 | Then, start multiple BE clients. 40 | ```bash 41 | # in ./build 42 | $ for i in {1..4}; do ./reef_client_cont ../resource/resnet152 resnet152 0 0 & done 43 | ``` 44 | You can see that 4 BE clients are submitting BE tasks concurrently; each client echoes the inference latency of its tasks, e.g.: 45 | ``` 46 | client 3 inference latency: 16.567 ms 47 | client 2 inference latency: 29.347 ms 48 | client 1 inference latency: 32.506 ms 49 | client 0 inference latency: 24.848 ms 50 | ``` 51 | 52 | Then, start an RT client, which submits requests without pause. 53 | ```bash 54 | # in ./build 55 | $ ./reef_client_cont ../resource/resnet152 resnet152 1 0 56 | ``` 57 | 58 | You can see that the RT client has the lowest inference latency. 59 | ``` 60 | ... 61 | client 4 inference latency: 12.743 ms 62 | client 4 inference latency: 12.608 ms 63 | client 4 inference latency: 12.944 ms 64 | client 4 inference latency: 12.637 ms 65 | ``` 66 | 67 | Meanwhile, the BE tasks can still execute concurrently with the RT task without affecting the performance of the RT tasks. 68 | ``` 69 | ... 70 | client 2 inference latency: 48.183 ms 71 | client 1 inference latency: 68.599 ms 72 | client 0 inference latency: 34.857 ms 73 | client 3 inference latency: 43.565 ms 74 | ``` 75 | 76 | 77 | 78 | 79 | ## Project Structure 80 | ``` 81 | > tree .
82 | ├── cmake 83 | ├── resource # DNN model resources for the evaluations 84 | │ ├── resnet # DNN model for ResNet 85 | │ │ ├── resnet.json # The schedule graph (metadata) of the DNN model 86 | │ │ ├── resnet.cu # The raw GPU device code (GPU kernels) for the DNN model 87 | │ │ ├── resnet.trans.cu # The transformed GPU device code which supports dynamic kernel padding 88 | │ │ ├── resnet.be.cu # The transformed GPU device code which supports reset-based preemption 89 | │ │ └── resnet.profile.json # The profile of the kernel execution time 90 | │ ├── densenet 91 | │ └── inception 92 | ├── script # Utility scripts 93 | ├── src # Source code 94 | │ ├── example # REEF examples 95 | │ └── reef # REEF source code 96 | └── env.sh # Environment variables 97 | 98 | ``` 99 | 100 | ## Hardware Requirement 101 | 102 | Currently, REEF only supports the **AMD Radeon Instinct MI50 GPU**. 103 | 104 | 105 | ## Installation 106 | 107 | See [INSTALL](INSTALL.md). 108 | 109 | 110 | ## Artifact Evaluation 111 | 112 | For the OSDI'22 artifact evaluation, see [reef-artifacts](https://github.com/SJTU-IPADS/reef-artifacts). 113 | 114 | ## Paper 115 | If you use REEF in your research, please cite our paper: 116 | ```bibtex 117 | @inproceedings {osdi2022reef, 118 | author = {Mingcong Han and Hanze Zhang and Rong Chen and Haibo Chen}, 119 | title = {Microsecond-scale Preemption for Concurrent {GPU-accelerated} {DNN} Inferences}, 120 | booktitle = {16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)}, 121 | year = {2022}, 122 | isbn = {978-1-939133-28-1}, 123 | address = {Carlsbad, CA}, 124 | pages = {539--558}, 125 | url = {https://www.usenix.org/conference/osdi22/presentation/han}, 126 | publisher = {USENIX Association}, 127 | month = jul, 128 | } 129 | ``` 130 | 131 | ## The Team 132 | 133 | REEF is developed and maintained by members from [IPADS@SJTU](https://github.com/SJTU-IPADS) and Shanghai AI Laboratory. See [Contributors](CONTRIBUTORS.md). 134 | 135 | 136 | ## License 137 | 138 | REEF uses the [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0.html). 139 | -------------------------------------------------------------------------------- /cmake/common.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2018 gRPC authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # cmake build file for C++ route_guide example. 16 | # Assumes protobuf and gRPC have been installed using cmake. 17 | # See cmake_externalproject/CMakeLists.txt for all-in-one cmake build 18 | # that automatically builds all the dependencies before building route_guide.
19 | 20 | cmake_minimum_required(VERSION 3.5.1) 21 | 22 | set (CMAKE_CXX_STANDARD 11) 23 | 24 | if(MSVC) 25 | add_definitions(-D_WIN32_WINNT=0x600) 26 | endif() 27 | 28 | find_package(Threads REQUIRED) 29 | 30 | if(GRPC_AS_SUBMODULE) 31 | # One way to build a project that uses gRPC is to just include the 32 | # entire gRPC project tree via "add_subdirectory". 33 | # This approach is very simple to use, but there are some potential 34 | # disadvantages: 35 | # * it includes gRPC's CMakeLists.txt directly into your build script, 36 | # and that can make gRPC's internal settings interfere with your 37 | # own build. 38 | # * depending on what's installed on your system, the contents of submodules 39 | # in gRPC's third_party/* might need to be available (and there might be 40 | # additional prerequisites required to build them). Consider using 41 | # the gRPC_*_PROVIDER options to fine-tune the expected behavior. 42 | # 43 | # A more robust approach to add a dependency on gRPC is using 44 | # cmake's ExternalProject_Add (see cmake_externalproject/CMakeLists.txt). 45 | 46 | # Include the gRPC's cmake build (normally grpc source code would live 47 | # in a git submodule called "third_party/grpc", but this example lives in 48 | # the same repository as gRPC sources, so we just look a few directories up) 49 | add_subdirectory(../../.. ${CMAKE_CURRENT_BINARY_DIR}/grpc EXCLUDE_FROM_ALL) 50 | message(STATUS "Using gRPC via add_subdirectory.") 51 | 52 | # After using add_subdirectory, we can now use the grpc targets directly from 53 | # this build. 54 | set(_PROTOBUF_LIBPROTOBUF libprotobuf) 55 | set(_REFLECTION grpc++_reflection) 56 | if(CMAKE_CROSSCOMPILING) 57 | find_program(_PROTOBUF_PROTOC protoc) 58 | else() 59 | set(_PROTOBUF_PROTOC $<TARGET_FILE:protoc>) 60 | endif() 61 | set(_GRPC_GRPCPP grpc++) 62 | if(CMAKE_CROSSCOMPILING) 63 | find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) 64 | else() 65 | set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>) 66 | endif() 67 | elseif(GRPC_FETCHCONTENT) 68 | # Another way is to use CMake's FetchContent module to clone gRPC at 69 | # configure time. This makes gRPC's source code available to your project, 70 | # similar to a git submodule. 71 | message(STATUS "Using gRPC via add_subdirectory (FetchContent).") 72 | include(FetchContent) 73 | FetchContent_Declare( 74 | grpc 75 | GIT_REPOSITORY https://github.com/grpc/grpc.git 76 | # when using gRPC, you will actually set this to an existing tag, such as 77 | # v1.25.0, v1.26.0 etc.. 78 | # For the purpose of testing, we override the tag used to the commit 79 | # that's currently under test. 80 | GIT_TAG vGRPC_TAG_VERSION_OF_YOUR_CHOICE) 81 | FetchContent_MakeAvailable(grpc) 82 | 83 | # Since FetchContent uses add_subdirectory under the hood, we can use 84 | # the grpc targets directly from this build. 85 | set(_PROTOBUF_LIBPROTOBUF libprotobuf) 86 | set(_REFLECTION grpc++_reflection) 87 | set(_PROTOBUF_PROTOC $<TARGET_FILE:protoc>) 88 | set(_GRPC_GRPCPP grpc++) 89 | if(CMAKE_CROSSCOMPILING) 90 | find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) 91 | else() 92 | set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>) 93 | endif() 94 | else() 95 | # This branch assumes that gRPC and all its dependencies are already installed 96 | # on this system, so they can be located by find_package(). 97 | 98 | # Find Protobuf installation 99 | # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation.
100 | set(protobuf_MODULE_COMPATIBLE TRUE) 101 | find_package(Protobuf CONFIG REQUIRED) 102 | message(STATUS "Using protobuf ${Protobuf_VERSION}") 103 | 104 | set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) 105 | set(_REFLECTION gRPC::grpc++_reflection) 106 | if(CMAKE_CROSSCOMPILING) 107 | find_program(_PROTOBUF_PROTOC protoc) 108 | else() 109 | set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>) 110 | endif() 111 | 112 | # Find gRPC installation 113 | # Looks for gRPCConfig.cmake file installed by gRPC's cmake installation. 114 | find_package(gRPC CONFIG REQUIRED) 115 | message(STATUS "Using gRPC ${gRPC_VERSION}") 116 | 117 | set(_GRPC_GRPCPP gRPC::grpc++) 118 | if(CMAKE_CROSSCOMPILING) 119 | find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) 120 | else() 121 | set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>) 122 | endif() 123 | endif() 124 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib 2 | export PATH=$PATH:`pwd`/build 3 | export REEF_RESOURCE_DIR=`pwd`/resource 4 | -------------------------------------------------------------------------------- /resource/Makefile: -------------------------------------------------------------------------------- 1 | # NVCC= nvcc 2 | 3 | # INCLUDES=-I/usr/local/cuda/include -I/opt/rocm/hip/include 4 | 5 | # LIBS=-lcuda 6 | 7 | # ARCH=70 8 | 9 | CC=hipcc 10 | 11 | INCLUDES= -I/opt/rocm/hip/include 12 | 13 | SUBDIRS = ${shell ls -d */ | sed 's/\///' -} 14 | 15 | 16 | CURRENT_PATH = ${shell pwd} 17 | 18 | CURRENT_DIR = ${shell basename $(CURRENT_PATH)} 19 | 20 | ARCH=gfx906 21 | 22 | all: 23 | @for dir in $(SUBDIRS); do \ 24 | make -C $$dir -f ../Makefile build_subdir;\ 25 | done 26 | 27 | transform: 28 | @for dir in $(SUBDIRS); do \ 29 | make -C $$dir -f ../Makefile transform_subdir;\ 30 | done 31 | 32 | profile: 33 | for dir in $(SUBDIRS); do \ 34 | echo "profiling $$dir"; \ 35 | profiler $$dir;\ 36 | done 37 | 38 | build_subdir: ${CURRENT_DIR}.raw.co ${CURRENT_DIR}.trans.co ${CURRENT_DIR}.be.co 39 | 40 | transform_subdir: ${CURRENT_DIR}-hip-amdgcn-amd-amdhsa-gfx906.s 41 | python3 ../../script/transform_kernel.py ${CURRENT_DIR}.cu ${CURRENT_DIR}.json ${CURRENT_DIR}-hip-amdgcn-amd-amdhsa-gfx906.s 42 | python3 ../../script/best_effort_kernel.py ${CURRENT_DIR}.cu 43 | 44 | 45 | 46 | %.be.cu: %.cu 47 | python3 ../../script/best_effort_kernel.py ${CURRENT_DIR}.cu 48 | 49 | %.preempt.cu: %.cu 50 | python3 ../../script/preemptable_kernel.py ${CURRENT_DIR}.cu ${CURRENT_DIR}.json 51 | 52 | %.trans.cu: %.cu ${CURRENT_DIR}-hip-amdgcn-amd-amdhsa-gfx906.s 53 | python3 ../../script/transform_kernel.py ${CURRENT_DIR}.cu ${CURRENT_DIR}.json ${CURRENT_DIR}-hip-amdgcn-amd-amdhsa-gfx906.s 54 | 55 | %-hip-amdgcn-amd-amdhsa-gfx906.s: %.cu 56 | $(CC) $(INCLUDES) --save-temps --genco --offload-arch=${ARCH} $< -o $*.raw.co 57 | rm *.bc *.cui *.o *.out *.txt 58 | 59 | %.raw.co: %.cu 60 | $(CC) $(INCLUDES) --save-temps --genco --offload-arch=${ARCH} $< -o $@ 61 | rm *.bc *.cui *.o *.out *.txt 62 | 63 | %.be.co: %.be.cu 64 | $(CC) $(INCLUDES) --genco --offload-arch=${ARCH} $< -o $@ 65 | 66 | %.trans.co: %.trans.cu 67 | @"/opt/rocm/llvm/bin/clang" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -E -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=none
-fno-rounding-math -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -v -resource-dir /opt/rocm-4.3.0/llvm/lib/clang/13.0.0 -internal-isystem /opt/rocm-4.3.0/llvm/lib/clang/13.0.0/include/cuda_wrappers -internal-isystem /opt/rocm-4.3.0/include -include __clang_hip_runtime_wrapper.h -isystem /opt/rocm-4.3.0/llvm/lib/clang/13.0.0/include/.. -isystem /opt/rocm-4.3.0/hsa/include -isystem /opt/rocm-4.3.0/hip/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm-4.3.0/llvm/lib/clang/13.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /opt/rocm-4.3.0/llvm/lib/clang/13.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++11 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -cuid=8a385692c0a935c2 -fcuda-allow-variadic-functions -munsafe-fp-atomics -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.cui -x hip $< 68 | @"/opt/rocm/llvm/bin/clang" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -emit-llvm-bc -emit-llvm-uselists -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ocml.bc 
-mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -v -resource-dir /opt/rocm-4.3.0/llvm/lib/clang/13.0.0 -O3 -std=c++11 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -disable-llvm-passes -cuid=8a385692c0a935c2 -fcuda-allow-variadic-functions -munsafe-fp-atomics -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.bc -x hip-cpp-output $*.trans-hip-amdgcn-amd-amdhsa-gfx906.cui 69 | @"/opt/rocm/llvm/bin/clang" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -S -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -v -resource-dir /opt/rocm-4.3.0/llvm/lib/clang/13.0.0 -O3 -std=c++11 -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -cuid=8a385692c0a935c2 -fcuda-allow-variadic-functions -munsafe-fp-atomics -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s -x ir $*.trans-hip-amdgcn-amd-amdhsa-gfx906.bc 70 | @python3 ../../script/replace_register_usage.py $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 71 | python3 ../../script/generate_asm_loop.py $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s $*.json 72 | @"/opt/rocm/llvm/bin/clang" -cc1as -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -filetype obj -main-file-name $< -target-cpu gfx906 -dwarf-version=4 -mrelocation-model pic --mrelax-relocations -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 73 | @"/opt/rocm/llvm/bin/lld" -flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 
-plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false -save-temps -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.out $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o 74 | @"/opt/rocm/llvm/bin/clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906 -inputs=/dev/null,$*.trans-hip-amdgcn-amd-amdhsa-gfx906.out -outputs=$@ 75 | @echo build $@ 76 | rm *.bc *.cui *.o *.out *.txt 77 | 78 | # @"/opt/rocm/llvm/bin/clang-13" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -E -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /opt/rocm/llvm/lib/clang/12.0.0 -internal-isystem /opt/rocm/llvm/lib/clang/12.0.0/include/cuda_wrappers -internal-isystem /opt/rocm/include -include __clang_hip_runtime_wrapper.h -isystem /opt/rocm/llvm/lib/clang/12.0.0/include/.. 
-isystem /opt/rocm/hsa/include -isystem /opt/rocm/hip/include -D __HIP_ROCclr__ -D __HIP_ROCclr__ -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/llvm/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /opt/rocm/llvm/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++11 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -fcuda-allow-variadic-functions -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.cui -x hip $< 79 | # @"/opt/rocm/llvm/bin/clang-13" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -emit-llvm-bc -emit-llvm-uselists -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /opt/rocm/llvm/lib/clang/12.0.0 -O3 -std=c++11 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -disable-llvm-passes -fcuda-allow-variadic-functions -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.bc -x hip-cpp-output $*.trans-hip-amdgcn-amd-amdhsa-gfx906.cui 80 | # @"/opt/rocm/llvm/bin/clang-13" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -S -save-temps=cwd -disable-free -disable-llvm-verifier 
-discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /opt/rocm/llvm/lib/clang/12.0.0 -O3 -std=c++11 -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -fcuda-allow-variadic-functions -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s -x ir $*.trans-hip-amdgcn-amd-amdhsa-gfx906.bc 81 | # @python3 ../../script/replace_register_usage.py $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 82 | # python3 ../../script/generate_asm_loop.py $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s $*.json 83 | # @"/opt/rocm/llvm/bin/clang-13" -cc1as -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -filetype obj -main-file-name $< -target-cpu gfx906 -dwarf-version=4 -mrelocation-model pic --mrelax-relocations -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 84 | # @"/opt/rocm/llvm/bin/lld" -flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 -plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false -save-temps -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.out $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o 85 | # @"/opt/rocm/llvm/bin/clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906 -inputs=/dev/null,$*.trans-hip-amdgcn-amd-amdhsa-gfx906.out -outputs=$@ 86 | # @echo build $@ 87 | # rm *.bc *.cui *.o *.out *.txt 88 | 89 | %.asm: %-hip-amdgcn-amd-amdhsa-gfx906.s 90 | "/opt/rocm/llvm/bin/clang" -cc1as -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -filetype obj -main-file-name $< -target-cpu gfx906 -dwarf-version=4 -mrelocation-model pic --mrelax-relocations -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 91 | "/opt/rocm/llvm/bin/lld" -flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 -plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false -save-temps -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.out $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o 92 | "/opt/rocm/llvm/bin/clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906 -inputs=/dev/null,$*.trans-hip-amdgcn-amd-amdhsa-gfx906.out -outputs=$*.trans.co 93 | 94 | clean_temp: 95 | rm -f */*.s 
*/*.out */*.o */*.bc */*.txt */*.cui 96 | 97 | clean: 98 | rm -f */*.co */*.bc */*.cui */*.o */*.out */*.txt */*.s */*.pointer.cu */*.prtopt.cu */*.ptrraw.cu */*.preempt.cu -------------------------------------------------------------------------------- /resource/README.md: -------------------------------------------------------------------------------- 1 | # REEF Resource 2 | 3 | This directory contains the device code of some DNN models. 4 | 5 | ## Build 6 | 7 | ``` 8 | $ make transform 9 | $ make build 10 | ``` 11 | 12 | ## DNN Model 13 | 14 | A DNN model that is loadable in REEF should contain three file: 15 | 16 | 1. GPU device code (model.cu) 17 | 2. Kernel schedule (model.json) 18 | 3. Model parameter (model.param, Optional) 19 | 20 | All of the files can be generated by TVM (a customized version). (TODO) 21 | 22 | Other files will be generated automatically, including: 23 | 24 | 1. GPU device code of RT tasks (model.trans.cu) 25 | 2. GPU device code of BE tasks (model.be.cu) 26 | 3. Kernel latency profile (model.profile.json) 27 | 28 | ## Example 29 | 30 | See the `mocked_kernel` directory. 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /resource/alexnet/alexnet.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": [ 3 | { 4 | "name": "data", 5 | "size": 150528, 6 | "stype": "float32" 7 | }, 8 | { 9 | "name": "conv_1_weight", 10 | "size": 34848, 11 | "stype": "float32" 12 | }, 13 | { 14 | "name": "conv_2_weight", 15 | "size": 614400, 16 | "stype": "float32" 17 | }, 18 | { 19 | "name": "conv_3_weight", 20 | "size": 884736, 21 | "stype": "float32" 22 | }, 23 | { 24 | "name": "conv_4_weight", 25 | "size": 1327104, 26 | "stype": "float32" 27 | }, 28 | { 29 | "name": "conv_5_weight", 30 | "size": 884736, 31 | "stype": "float32" 32 | }, 33 | { 34 | "name": "dense_1_weight", 35 | "size": 26214400, 36 | "stype": "float32" 37 | }, 38 | { 39 | "name": "bias_1_weight", 40 | "size": 4096, 41 | "stype": "float32" 42 | }, 43 | { 44 | "name": "dense_2_weight", 45 | "size": 16777216, 46 | "stype": "float32" 47 | }, 48 | { 49 | "name": "bias_2_weight", 50 | "size": 4096, 51 | "stype": "float32" 52 | }, 53 | { 54 | "name": "dense_3_weight", 55 | "size": 4096000, 56 | "stype": "float32" 57 | }, 58 | { 59 | "name": "bias_3_weight", 60 | "size": 1000, 61 | "stype": "float32" 62 | }, 63 | { 64 | "name": "null", 65 | "size": 279936, 66 | "stype": "float32" 67 | }, 68 | { 69 | "name": "null", 70 | "size": 64896, 71 | "stype": "float32" 72 | }, 73 | { 74 | "name": "output", 75 | "size": 6400, 76 | "stype": "float32" 77 | }, 78 | { 79 | "name": "null", 80 | "size": 1000, 81 | "stype": "float32" 82 | } 83 | ], 84 | "kernels": [ 85 | { 86 | "name": "fused_nn_conv2d_nn_relu_4_kernel0", 87 | "launch_params": [ 88 | 27, 89 | 9, 90 | 12, 91 | 2, 92 | 6, 93 | 8 94 | ], 95 | "args": [ 96 | 0, 97 | 1, 98 | 12 99 | ] 100 | }, 101 | { 102 | "name": "fused_nn_max_pool2d_2_kernel0", 103 | "launch_params": [ 104 | 507, 105 | 1, 106 | 1, 107 | 128, 108 | 1, 109 | 1 110 | ], 111 | "args": [ 112 | 12, 113 | 13 114 | ] 115 | }, 116 | { 117 | "name": "fused_nn_conv2d_nn_relu_3_kernel0", 118 | "launch_params": [ 119 | 13, 120 | 13, 121 | 32, 122 | 2, 123 | 2, 124 | 4 125 | ], 126 | "args": [ 127 | 13, 128 | 2, 129 | 12 130 | ] 131 | }, 132 | { 133 | "name": "fused_nn_max_pool2d_1_kernel0", 134 | "launch_params": [ 135 | 288, 136 | 1, 137 | 1, 138 | 128, 139 | 1, 140 | 1 141 | ], 142 | "args": [ 143 | 12, 144 | 
13 145 | ] 146 | }, 147 | { 148 | "name": "fused_nn_conv2d_nn_relu_2_kernel0", 149 | "launch_params": [ 150 | 3, 151 | 6, 152 | 12, 153 | 4, 154 | 2, 155 | 32 156 | ], 157 | "args": [ 158 | 13, 159 | 3, 160 | 12 161 | ] 162 | }, 163 | { 164 | "name": "fused_nn_conv2d_nn_relu_1_kernel0", 165 | "launch_params": [ 166 | 3, 167 | 6, 168 | 12, 169 | 4, 170 | 2, 171 | 32 172 | ], 173 | "args": [ 174 | 12, 175 | 4, 176 | 13 177 | ] 178 | }, 179 | { 180 | "name": "fused_nn_conv2d_nn_relu_kernel0", 181 | "launch_params": [ 182 | 3, 183 | 6, 184 | 8, 185 | 4, 186 | 2, 187 | 32 188 | ], 189 | "args": [ 190 | 13, 191 | 5, 192 | 12 193 | ] 194 | }, 195 | { 196 | "name": "fused_nn_max_pool2d_kernel0", 197 | "launch_params": [ 198 | 50, 199 | 1, 200 | 1, 201 | 128, 202 | 1, 203 | 1 204 | ], 205 | "args": [ 206 | 12, 207 | 13 208 | ] 209 | }, 210 | { 211 | "name": "fused_nn_batch_flatten_kernel0", 212 | "launch_params": [ 213 | 50, 214 | 1, 215 | 1, 216 | 128, 217 | 1, 218 | 1 219 | ], 220 | "args": [ 221 | 14, 222 | 13 223 | ] 224 | }, 225 | { 226 | "name": "fused_nn_dense_add_nn_relu_1_kernel0", 227 | "launch_params": [ 228 | 4096, 229 | 1, 230 | 1, 231 | 64, 232 | 1, 233 | 1 234 | ], 235 | "args": [ 236 | 14, 237 | 6, 238 | 13, 239 | 7 240 | ] 241 | }, 242 | { 243 | "name": "fused_nn_dense_add_nn_relu_kernel0", 244 | "launch_params": [ 245 | 4096, 246 | 1, 247 | 1, 248 | 64, 249 | 1, 250 | 1 251 | ], 252 | "args": [ 253 | 13, 254 | 8, 255 | 14, 256 | 9 257 | ] 258 | }, 259 | { 260 | "name": "fused_nn_dense_add_kernel0", 261 | "launch_params": [ 262 | 1000, 263 | 1, 264 | 1, 265 | 64, 266 | 1, 267 | 1 268 | ], 269 | "args": [ 270 | 14, 271 | 10, 272 | 15, 273 | 11 274 | ] 275 | }, 276 | { 277 | "name": "fused_nn_softmax_kernel0", 278 | "launch_params": [ 279 | 1, 280 | 1, 281 | 1, 282 | 64, 283 | 1, 284 | 1 285 | ], 286 | "args": [ 287 | 15, 288 | 14 289 | ] 290 | } 291 | ], 292 | "args": [ 293 | 0, 294 | 1, 295 | 2, 296 | 3, 297 | 4, 298 | 5, 299 | 6, 300 | 7, 301 | 8, 302 | 9, 303 | 10, 304 | 11 305 | ], 306 | "shared_memory": { 307 | "fused_nn_max_pool2d_kernel0": 4, 308 | "fused_nn_softmax_kernel0": 4, 309 | "fused_nn_conv2d_nn_relu_3_kernel0": 944, 310 | "fused_nn_conv2d_nn_relu_1_kernel0": 9984, 311 | "fused_nn_dense_add_kernel0": 4, 312 | "fused_nn_max_pool2d_2_kernel0": 4, 313 | "fused_nn_dense_add_nn_relu_1_kernel0": 4, 314 | "fused_nn_conv2d_nn_relu_4_kernel0": 1356, 315 | "fused_nn_max_pool2d_1_kernel0": 4, 316 | "fused_nn_conv2d_nn_relu_2_kernel0": 9984, 317 | "fused_nn_batch_flatten_kernel0": 4, 318 | "fused_nn_conv2d_nn_relu_kernel0": 9984, 319 | "fused_nn_dense_add_nn_relu_kernel0": 4 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /resource/alexnet/alexnet.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":8851,"kernel_latency":{"fused_nn_dense_add_kernel0":{"total_latency":76, "latency":[42,40,40,42,42,44,44,48,50,50,54,58,58,64,64,68,74,72,78,80,84,88,92,92,96,78,74,74,84,74,98,76]},"fused_nn_dense_add_nn_relu_kernel0":{"total_latency":200, "latency":[42,40,40,40,42,42,44,46,48,50,54,58,58,62,64,70,72,74,78,82,84,90,90,94,98,100,102,108,108,112,118,120]},"fused_nn_conv2d_nn_relu_kernel0":{"total_latency":664, "latency":[276,458,664,876,1082,1292,1294]},"fused_nn_softmax_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_dense_add_nn_relu_1_kernel0":{"total_latency":168, 
"latency":[54,54,54,54,54,54,54,54,56,52,52,52,52,52,52,52,52,54,54,54,54,54,56,56,56,58,58,60,60,60,60,62]},"fused_nn_max_pool2d_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"fused_nn_conv2d_nn_relu_4_kernel0":{"total_latency":3884, "latency":[148,146,160,190,248,320,382,446,502,558,610,660,722,776,838,894]},"fused_nn_conv2d_nn_relu_3_kernel0":{"total_latency":2196, "latency":[336,340,348,356,366,378,390,408,380,398,402,416,422,442,452,466,478,494,510,524]},"fused_nn_conv2d_nn_relu_1_kernel0":{"total_latency":876, "latency":[276,456,664,876,1084,1294,1294]},"fused_nn_max_pool2d_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"fused_nn_max_pool2d_2_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"fused_nn_conv2d_nn_relu_2_kernel0":{"total_latency":588, "latency":[192,304,446,588,728,866,866]}}} -------------------------------------------------------------------------------- /resource/bert/bert.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":5426,"kernel_latency":{"fused_reshape_add_multiply_erf_multiply_add_multiply_reshape_kernel0":{"total_latency":16, "latency":[14,14,14,16,16,16,16,18]},"fused_reshape_add_add_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_reshape_transpose_copy_reshape_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_nn_batch_matmul_4_kernel0":{"total_latency":84, "latency":[62,66,72,82,92,112,120,130]},"fused_reshape_5_kernel0":{"total_latency":44, "latency":[30,32,34,36,38,42,44,42]},"fused_nn_softmax_1_kernel3":{"total_latency":48, "latency":[36,38,40,44,46,48,52,54]},"fused_variance_1_kernel1":{"total_latency":12, "latency":[12,12]},"fused_mean_1_kernel0":{"total_latency":16, "latency":[16,16]},"fused_reshape_4_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_batch_matmul_3_kernel0":{"total_latency":80, "latency":[68,72,80,94]},"fused_reshape_add_reshape_transpose_reshape_transpose_kernel0":{"total_latency":24, "latency":[16,16,18,20,22,24,24,24]},"fused_full_equal_reshape_kernel0":{"total_latency":12, "latency":[16,12]},"fused_mean_1_kernel1":{"total_latency":12, "latency":[12,12]},"fused_nn_softmax_1_kernel0":{"total_latency":52, "latency":[50,94,136,178,220,258,320,364]},"fused_cast_take_broadcast_to_like_cast_take_add_1_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,16,16]},"fused_variance_1_kernel0":{"total_latency":16, "latency":[16,18]},"fused_reshape_add_reshape_transpose_divide_reshape_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,16,18]},"fused_reshape_add_reshape_transpose_transpose_reshape_transpose_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,16,18]},"fused_subtract_add_sqrt_divide_multiply_add_1_kernel0":{"total_latency":20, "latency":[16,16,16,18,18,18,20,20]},"fused_nn_softmax_1_kernel1":{"total_latency":52, "latency":[36,38,42,48,48,52,58,62]},"fused_nn_batch_matmul_5_kernel0":{"total_latency":52, "latency":[20,20,22,24]},"fused_reshape_cast_broadcast_to_like_where_kernel0":{"total_latency":60, "latency":[48,48,50,52,54,58,60,60]},"fused_nn_softmax_1_kernel2":{"total_latency":52, "latency":[50,94,136,178,218,260,314,362]}}} -------------------------------------------------------------------------------- 
/resource/densenet/densenet.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":3743,"kernel_latency":{"fused_nn_dense_add_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,16,16,16,16,16,16,16,16,16,16,16,16]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[12]},"fused_nn_avg_pool2d_kernel0":{"total_latency":20, "latency":[18]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":40, "latency":[40]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":48, "latency":[46,48,50,52,52,54,56,60,60,64,66,70,76,78,84,86,94,96,100,102,110,112,116,120,126,128,132,136,144,146,148,152]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":16, "latency":[14,16]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":36, "latency":[36,36,38,38,38,40,38,40,40,42,42,42,46,46,48,48,50,50,50,52,52,54,54,56,58,60,62,62,64,66,66,68]},"fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":36, "latency":[34,36,40,42,44,50,50,58,58,62,64,66,68,70,76,78,86,88,94,96,100,104,108,110,114,116,122,124]},"fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":20, "latency":[20,20,22,22,24,26,28]},"fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":660, "latency":[34,40,52,62,76,88,100,112]},"fused_nn_avg_pool2d_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,16,16,16,16,16]},"fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":24, "latency":[14,14,16,16,16,18,18,18]},"fused_nn_avg_pool2d_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,16]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":32, "latency":[32,32]},"fused_nn_conv2d_1_kernel0":{"total_latency":16, "latency":[14,14]},"fused_nn_conv2d_2_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,16,18]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":36, "latency":[34,36,38,42,44,50,50,56,56,58,62,64,66,70]},"fused_nn_avg_pool2d_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":40, "latency":[40,44,46,54]},"fused_nn_conv2d_kernel0":{"total_latency":16, "latency":[14,14,14,16]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":16, "latency":[14,14,14,14]}}} -------------------------------------------------------------------------------- /resource/inception/inception.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":8273,"kernel_latency":{"fused_nn_softmax_1_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_dense_add_kernel0":{"total_latency":56, "latency":[26,26,26,30,30,32,34,36,38,40,42,44,44,50,50,48,56,50,52,54,56,56,60,62,62,52,54,52,54,54,60,54]},"fused_nn_avg_pool2d_6_kernel0":{"total_latency":20, "latency":[20,30,38,46,54,62,72,80]},"fused_nn_max_pool2d_9_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_34_kernel0":{"total_latency":108, "latency":[90,90,96,108,114,122,128,132,154,158,178,184,212,216,232,240,264,270,270,270,270,270,270,270,270,270,270,270,270,270,270,270]},"fused_concatenate_6_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_nn_batch_flatten_1_kernel0":{"total_latency":16, 
"latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_36_kernel0":{"total_latency":72, "latency":[66,70,70,70,70,68,68,68,68,68,68,68,68,68,68,104]},"fused_nn_conv2d_add_nn_relu_33_kernel0":{"total_latency":48, "latency":[44,44,44,44,44,44,48,74]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":100, "latency":[94,120,162,162]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_5_kernel1":{"total_latency":44, "latency":[30,32,34,42,40,44,48,48,48,48,48,50]},"fused_nn_conv2d_add_nn_relu_32_kernel0":{"total_latency":60, "latency":[58,72,74,74]},"fused_nn_conv2d_add_nn_relu_31_kernel0":{"total_latency":60, "latency":[60,76,106,128]},"fused_nn_conv2d_add_nn_relu_29_kernel0":{"total_latency":48, "latency":[46,50,54,60,66,74,74,74,74,74,74,74,74,74,74,74]},"fused_nn_conv2d_add_nn_relu_30_kernel0":{"total_latency":60, "latency":[60,72,74,74]},"fused_concatenate_7_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_28_kernel0":{"total_latency":88, "latency":[82,100,108,124,146,148,146,146,146,146,146,146,146,146,146,146]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":104, "latency":[76,88,102,130,138,138,138,138,138,138,138,138,138,138,138,138]},"fused_nn_conv2d_add_nn_relu_26_kernel0":{"total_latency":116, "latency":[90,112,132,132]},"fused_nn_conv2d_add_nn_relu_25_kernel0":{"total_latency":92, "latency":[80,100,118]},"fused_nn_conv2d_add_nn_relu_22_kernel0":{"total_latency":188, "latency":[126,146,178,216,238,232,228,230]},"fused_concatenate_8_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16]},"fused_nn_avg_pool2d_10_kernel0":{"total_latency":20, "latency":[14,14,14,14,14,14,16,16]},"fused_nn_conv2d_add_nn_relu_19_kernel0":{"total_latency":144, "latency":[112,118,130,142,146,168,172,186,198,216,222,236,254,264,286,298,328,340,356,368,392,406,422,432]},"fused_nn_avg_pool2d_11_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_20_kernel0":{"total_latency":80, "latency":[78,78]},"fused_nn_conv2d_add_nn_relu_17_kernel0":{"total_latency":132, "latency":[104,118,136,130,130,150,170,192,222,242,266,284,308,326]},"fused_nn_conv2d_add_nn_relu_18_kernel0":{"total_latency":80, "latency":[78,80,82,86,86,96]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":76, "latency":[64,82,90]},"fused_nn_max_pool2d_8_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":32, "latency":[30,36]},"fused_nn_max_pool2d_5_kernel0":{"total_latency":28, "latency":[14,16,16,16,18,18,18,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel2":{"total_latency":16, "latency":[14,14,16,16,16,18,16,16,16,16,16,18,18,18,18,18]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":60, "latency":[56,78,104]},"fused_nn_conv2d_add_nn_relu_27_kernel0":{"total_latency":188, "latency":[142,152,162,190,190,240,234,286,266,300,304,326,350,366,400,416,460,472,506,522]},"fused_nn_avg_pool2d_9_kernel0":{"total_latency":28, "latency":[16,16,16,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel2":{"total_latency":44, "latency":[14,14,14,16,16,18,18,18,20,20,20,20,22,22,24,24]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":32, "latency":[26,28,30,30,32,36]},"fused_nn_conv2d_add_nn_relu_16_kernel0":{"total_latency":80, "latency":[78]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel0":{"total_latency":24, 
"latency":[14,14,14,16,16,16,16,16,16,18,18,18,18,20,20,20]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":48, "latency":[46,48]},"fused_nn_max_pool2d_7_kernel0":{"total_latency":20, "latency":[16,16,16,16,18,18,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_5_kernel2":{"total_latency":16, "latency":[16,16,16,18,18,18,18,20]},"fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":176, "latency":[100,106,114,136,140,156,176,190,220,234,262,274,310,322,346,358]},"fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":32, "latency":[30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_4_kernel0":{"total_latency":16, "latency":[16,16,16,16,18,18,18,18,20,20,20,22,22,22,24,24]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":32, "latency":[30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel2":{"total_latency":56, "latency":[14,14,14,16,16,16,18,18,18,18,18,18,20,20,20,22]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel0":{"total_latency":32, "latency":[14,14,14,16,16,16,16,18,18,18,18,18,18,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel2":{"total_latency":32, "latency":[14,14,14,16,16,16,16,18,18,18,18,20,20,20,20,22]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel0":{"total_latency":36, "latency":[14,16,16,16,16,18,18,18,20,20,22,22,22,24,24,26]},"fused_nn_conv2d_add_nn_relu_23_kernel0":{"total_latency":80, "latency":[76,80,82,80,86,96]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel1":{"total_latency":88, "latency":[20,22,24,26,28]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":40, "latency":[38,42]},"fused_nn_conv2d_add_nn_relu_24_kernel0":{"total_latency":168, "latency":[120,130,140,156,166,206,204,252,244,276,274,304,314,328,326,326]},"fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":44, "latency":[42,40,42,44,46]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel1":{"total_latency":100, "latency":[30,32,36,38,44,48]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_4_kernel1":{"total_latency":32, "latency":[22,24,24,26,26,30,30,32,34,34]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_4_kernel2":{"total_latency":16, "latency":[14,16,16,16,16,18,18,18,18,18,18,18,18,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,18,18,18,18,20,20,20,22,22,22,24]},"fused_concatenate_10_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel1":{"total_latency":40, "latency":[30,32,38,38,42]},"fused_nn_avg_pool2d_8_kernel0":{"total_latency":20, "latency":[14,14,14,14,14,14,16,16]},"fused_concatenate_11_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,16,18]},"fused_nn_conv2d_add_nn_relu_35_kernel0":{"total_latency":132, "latency":[94,110,122,132,132,156,160,174,182,192,180,182,198,172,174,176,198,172,180,184,172,172,180,180,168,168,168,168,174,174,176,176]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_5_kernel0":{"total_latency":20, "latency":[18,18,22,22,26,28,30,32]},"fused_nn_conv2d_add_nn_relu_13_kernel0":{"total_latency":40, "latency":[38,40,44,46,52,58]},"fused_nn_conv2d_add_nn_relu_14_kernel0":{"total_latency":36, 
"latency":[36,38,42]},"fused_nn_conv2d_add_nn_relu_21_kernel0":{"total_latency":96, "latency":[80,100,118]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel1":{"total_latency":48, "latency":[20,22,24,26,28]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":464, "latency":[328,376,430,428,476,552,652,730]},"fused_nn_max_pool2d_6_kernel0":{"total_latency":32, "latency":[14,14,16,16,16,18,18,18]},"fused_nn_avg_pool2d_7_kernel0":{"total_latency":24, "latency":[16,16,18,16,16,16,18,18]},"fused_nn_conv2d_add_nn_relu_15_kernel0":{"total_latency":84, "latency":[80,98,130]},"fused_concatenate_9_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]}}} -------------------------------------------------------------------------------- /resource/mobilenet/mobilenet.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":1188,"kernel_latency":{"fused_nn_dense_add_kernel0":{"total_latency":24, "latency":[16,16,16,16,16,16,18,18,18,18,18,18,20,20,20,20,20,20,20,22,22,22,22,22,24,20,20,22,22,20,20,20]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":128, "latency":[120,136]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":20, "latency":[16,16,16,18,18,20,20,20]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":72, "latency":[66,70]},"fused_nn_conv2d_add_nn_relu_13_kernel0":{"total_latency":20, "latency":[16,18]},"fused_nn_conv2d_add_nn_relu_14_kernel0":{"total_latency":28, "latency":[20,22,24,26,28,32]},"fused_nn_conv2d_add_nn_relu_16_kernel0":{"total_latency":24, "latency":[16,18,20,22]},"fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":24, "latency":[16,20]},"fused_nn_conv2d_add_nn_relu_18_kernel0":{"total_latency":104, "latency":[56,96]},"fused_nn_softmax_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_global_avg_pool2d_kernel0":{"total_latency":24, "latency":[22,24,22,22,26,26,24,24,24,24,28,24,28,28,24,24,28,28,26,26,26,30,30,30,30,28,28,28,30,30,30,30]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":80, "latency":[70,74,78,94]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[14,14,14,14]},"fused_nn_conv2d_add_nn_relu_15_kernel0":{"total_latency":28, "latency":[16,20]},"fused_nn_conv2d_add_nn_relu_17_kernel0":{"total_latency":20, "latency":[16,16,16,18]},"fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":36, "latency":[24,26,30,34]},"fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":32, "latency":[32,32]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":20, "latency":[16,16,18,18,18,18,20,20]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":20, "latency":[18,18,20,22]},"fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":20, "latency":[16,16,18,18]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":52, "latency":[50,50]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":44, "latency":[40,46,46,52]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,18,18,18]}}} -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.be.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define NUM_BLOCKS 64 4 | #define NUM_TREHAD_PER_BLOCK 128 5 | #define BLOCKDIM_X 4 6 | #define BLOCKDIM_Y 8 7 | #define BLOCKDIM_Z 4 8 | 9 | __device__ void multiply_device(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp){ 10 | __shared__ 
float buffer[1024 * 32 / 4]; 11 | int blockOffset = blockIdx.x; 12 | int blockSize = NUM_TREHAD_PER_BLOCK; 13 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 14 | int arrayOffset = blockOffset * blockSize + threadOffset; 15 | 16 | temp[arrayOffset] = a[threadOffset] * b[arrayOffset]; 17 | } 18 | 19 | __device__ void add_device(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp){ 20 | __shared__ float buffer[1024 * 32 / 4]; 21 | int blockOffset = blockIdx.x; 22 | int blockSize = NUM_TREHAD_PER_BLOCK; 23 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 24 | int arrayOffset = blockOffset * blockSize + threadOffset; 25 | 26 | temp[arrayOffset] = a[threadOffset] + b[arrayOffset]; 27 | } 28 | 29 | extern "C" __global__ void multiply(int* preempted, int* task_slot, float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 30 | if (*preempted) return; 31 | multiply_device(a, b, temp); 32 | if (threadIdx.x + threadIdx.y + threadIdx.z == 0) 33 | atomicAdd(task_slot, 1); 34 | } 35 | 36 | extern "C" __global__ void add(int* preempted, int* task_slot, float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 37 | if (*preempted) return; 38 | add_device(a, b, temp); 39 | if (threadIdx.x + threadIdx.y + threadIdx.z == 0) 40 | atomicAdd(task_slot, 1); 41 | } 42 | -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define NUM_BLOCKS 64 4 | #define NUM_TREHAD_PER_BLOCK 128 5 | #define BLOCKDIM_X 4 6 | #define BLOCKDIM_Y 8 7 | #define BLOCKDIM_Z 4 8 | 9 | extern "C" __global__ void multiply(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 10 | __shared__ float buffer[1024 * 32 / 4]; 11 | int blockOffset = blockIdx.x; 12 | int blockSize = NUM_TREHAD_PER_BLOCK; 13 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 14 | int arrayOffset = blockOffset * blockSize + threadOffset; 15 | 16 | temp[arrayOffset] = a[threadOffset] * b[arrayOffset]; 17 | } 18 | 19 | extern "C" __global__ void add(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 20 | __shared__ float buffer[1024 * 32 / 4]; 21 | int blockOffset = blockIdx.x; 22 | int blockSize = NUM_TREHAD_PER_BLOCK; 23 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 24 | int arrayOffset = blockOffset * blockSize + threadOffset; 25 | 26 | temp[arrayOffset] = a[threadOffset] + b[arrayOffset]; 27 | } 28 | -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": [ 3 | { 4 | "name": "a", 5 | "stype": "float32", 6 | "size": 8192 7 | }, 8 | { 9 | "name": "b", 10 | "stype": "float32", 11 | "size": 8192 12 | }, 13 | { 14 | "name": "temp", 15 | "stype": "float32", 16 | "size": 8192 17 | }, 18 | { 19 | "name": "c", 20 | "stype": "float32", 21 | "size": 8192 22 | }, 23 | { 24 | "name": "output", 25 | "stype": "float32", 26 | "size": 8192 27 | } 28 | ], 29 | "kernels": [ 30 | { 31 | "name": "add", 32 | "launch_params": [ 33 | 64, 34 | 1, 35 | 1, 36 | 4, 37 | 8, 38 | 4 39 | ], 40 | "args": [ 41 | 1, 42 | 0, 43 | 2 44 | ] 45 | }, 46 | { 47 | 
"name": "multiply", 48 | "launch_params": [ 49 | 64, 50 | 1, 51 | 1, 52 | 4, 53 | 8, 54 | 4 55 | ], 56 | "args": [ 57 | 2, 58 | 3, 59 | 4 60 | ] 61 | } 62 | ], 63 | "args": [ 64 | 0, 65 | 1, 66 | 3 67 | ], 68 | "shared_memory": { 69 | "add": 1024, 70 | "multiply": 1024 71 | } 72 | } -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":30,"kernel_latency":{"multiply":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"add":{"total_latency":16, "latency":[16,14,14,14,14,14,14,14,14,14,14,14,16,14,14,14]}}} -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.trans.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define NUM_BLOCKS 64 4 | #define NUM_TREHAD_PER_BLOCK 128 5 | #define BLOCKDIM_X 4 6 | #define BLOCKDIM_Y 8 7 | #define BLOCKDIM_Z 4 8 | 9 | #define CU_NUM 60 10 | 11 | __device__ __forceinline__ bool is_first_thread() { 12 | return threadIdx.x == 0; 13 | } 14 | 15 | __device__ __forceinline__ unsigned int get_cu_id() { 16 | return blockIdx.x % CU_NUM; 17 | } 18 | 19 | __device__ __forceinline__ dim3 get_3d_idx(int idx, dim3 dim) { 20 | dim3 result; 21 | result.x = idx % dim.x; 22 | result.y = idx / dim.x % dim.y; 23 | result.z = idx / (dim.x * dim.y); 24 | return result; 25 | } 26 | 27 | __device__ void multiply_device(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp){ 28 | __shared__ float buffer[1024 * 32 / 4]; 29 | int blockOffset = blockIdx.x; 30 | int blockSize = NUM_TREHAD_PER_BLOCK; 31 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 32 | int arrayOffset = blockOffset * blockSize + threadOffset; 33 | 34 | temp[arrayOffset] = a[threadOffset] * b[arrayOffset]; 35 | } 36 | 37 | __device__ void add_device(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp){ 38 | __shared__ float buffer[1024 * 32 / 4]; 39 | int blockOffset = blockIdx.x; 40 | int blockSize = NUM_TREHAD_PER_BLOCK; 41 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 42 | int arrayOffset = blockOffset * blockSize + threadOffset; 43 | 44 | temp[arrayOffset] = a[threadOffset] + b[arrayOffset]; 45 | } 46 | 47 | extern "C" __global__ __attribute__((amdgpu_num_vgpr(25))) __attribute__((amdgpu_num_sgpr(30))) void multiply_device_wrapper(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 48 | // Force the compiler to use all the index 49 | if (threadIdx.x + threadIdx.y * 4 + threadIdx.z * 8 * 4 >= 4 * 8 * 4) return; 50 | // if (blockIdx.x + blockIdx.y * 64 + blockIdx.z * 1 * 64 >= 64 * 1 * 1) return; 51 | multiply_device((float* __restrict__)a,(float* __restrict__)b,(float* __restrict__)temp); 52 | asm volatile(";; end_flag"); // jump back to the caller 53 | } 54 | 55 | extern "C" __global__ __attribute__((amdgpu_num_vgpr(25))) __attribute__((amdgpu_num_sgpr(30))) void add_device_wrapper(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 56 | // Force the compiler to use all the index 57 | if (threadIdx.x + threadIdx.y * 4 + threadIdx.z * 8 * 4 >= 4 * 8 * 4) return; 58 | // if (blockIdx.x + blockIdx.y * 64 + blockIdx.z * 1 * 64 >= 64 * 1 * 1) return; 59 | add_device((float* __restrict__)a,(float* 
__restrict__)b,(float* __restrict__)temp); 60 | asm volatile(";; end_flag"); // jump back to the caller 61 | } 62 | 63 | extern "C" __global__ void multiply( 64 | void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l, 65 | void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r, 66 | int cu_partition) { 67 | asm volatile(";; caller_flag"); 68 | return; 69 | } 70 | 71 | extern "C" __global__ void add( 72 | void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l, 73 | void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r, 74 | int cu_partition) { 75 | asm volatile(";; caller_flag"); 76 | return; 77 | } 78 | 79 | extern "C" __device__ __noinline__ dim3 get_3d_idx_64_1_1(int idx) { 80 | dim3 dim(64, 1, 1); 81 | dim3 result; 82 | result.x = idx % dim.x; 83 | result.y = idx / dim.x % dim.y; 84 | result.z = idx / (dim.x * dim.y); 85 | return result; 86 | } 87 | 88 | extern "C" __device__ __noinline__ dim3 get_3d_idx_4_8_4(int idx) { 89 | dim3 dim(4, 8, 4); 90 | dim3 result; 91 | result.x = idx % dim.x; 92 | result.y = idx / dim.x % dim.y; 93 | result.z = idx / (dim.x * dim.y); 94 | return result; 95 | } 96 | 97 | __global__ void get_3d_idx_caller(int* buf) { 98 | dim3 task_idx; 99 | 100 | task_idx = get_3d_idx_64_1_1(threadIdx.x); 101 | buf[task_idx.x] = task_idx.x; 102 | buf[task_idx.y] = task_idx.y; 103 | buf[task_idx.z] = task_idx.z; 104 | 105 | task_idx = get_3d_idx_4_8_4(threadIdx.x); 106 | buf[task_idx.x] = task_idx.x; 107 | buf[task_idx.y] = task_idx.y; 108 | buf[task_idx.z] = task_idx.z; 109 | 110 | } 111 | 112 | #define CALL_FRAMEWORK(idx) \ 113 | extern "C" __global__ void call_framework_##idx(\ 114 | void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l,\ 115 | void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r,\ 116 | int cu_partition) \ 117 | {\ 118 | asm volatile(\ 119 | " s_load_dwordx2 s[14:15], s[4:5], 0x0\n"\ 120 | " s_waitcnt lgkmcnt(0)\n"\ 121 | " s_setpc_b64 s[14:15]\n"\ 122 | " s_endpgm\n"\ 123 | );\ 124 | } 125 | 126 | CALL_FRAMEWORK(1) 127 | CALL_FRAMEWORK(2) 128 | CALL_FRAMEWORK(3) 129 | CALL_FRAMEWORK(4) 130 | CALL_FRAMEWORK(5) 131 | CALL_FRAMEWORK(6) 132 | CALL_FRAMEWORK(7) 133 | CALL_FRAMEWORK(8) 134 | CALL_FRAMEWORK(9) 135 | CALL_FRAMEWORK(10) 136 | 137 | #define MERGE_FRAMEWORK(idx) \ 138 | extern "C" __global__ void merge_framework_##idx(\ 139 | void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l,\ 140 | void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r,\ 141 | int cu_partition) \ 142 | {\ 143 | asm volatile(\ 144 | " s_load_dword s10, s[4:5], 0x40\n"\ 145 | " s_load_dwordx2 s[12:13], s[4:5], 0x0\n"\ 146 | " s_load_dwordx2 s[14:15], s[4:5], 0x20\n"\ 147 | " s_mul_hi_u32 s11, s6, 0x88888889\n"\ 148 | " s_lshr_b32 s11, s11, 5\n"\ 149 | " s_mul_i32 s11, s11, 60\n"\ 150 | " s_sub_i32 s11, s6, s11\n"\ 151 | " s_waitcnt lgkmcnt(0)\n"\ 152 | " s_cmp_ge_u32 s11, s10\n"\ 153 | " s_mov_b64 s[10:11], -1\n"\ 154 | " s_cbranch_scc1 MyBB"#idx"_3\n"\ 155 | "; %bb.1: ; %Flow\n"\ 156 | " s_andn2_b64 vcc, exec, s[10:11]\n"\ 157 | " s_cbranch_vccz MyBB"#idx"_4\n"\ 158 | " s_endpgm\n"\ 159 | "MyBB"#idx"_3:\n"\ 160 | " s_setpc_b64 s[14:15]\n"\ 161 | " s_endpgm\n"\ 162 | "MyBB"#idx"_4:\n"\ 163 | " s_setpc_b64 s[12:13]\n"\ 164 | " s_endpgm\n"\ 165 | );\ 166 | } 167 | MERGE_FRAMEWORK(1) 168 | MERGE_FRAMEWORK(2) 169 | MERGE_FRAMEWORK(3) 170 | MERGE_FRAMEWORK(4) 171 | MERGE_FRAMEWORK(5) 172 
| MERGE_FRAMEWORK(6) 173 | MERGE_FRAMEWORK(7) 174 | MERGE_FRAMEWORK(8) 175 | MERGE_FRAMEWORK(9) 176 | MERGE_FRAMEWORK(10) 177 | MERGE_FRAMEWORK(nostack_1) 178 | MERGE_FRAMEWORK(nostack_2) 179 | MERGE_FRAMEWORK(nostack_3) 180 | MERGE_FRAMEWORK(nostack_4) 181 | MERGE_FRAMEWORK(nostack_5) 182 | MERGE_FRAMEWORK(nostack_6) 183 | MERGE_FRAMEWORK(nostack_7) 184 | MERGE_FRAMEWORK(nostack_8) 185 | MERGE_FRAMEWORK(nostack_9) 186 | MERGE_FRAMEWORK(nostack_10) 187 | -------------------------------------------------------------------------------- /resource/resnet/resnet.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":10958,"kernel_latency":{"fused_nn_softmax_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_dense_add_kernel0":{"total_latency":56, "latency":[26,26,28,32,32,32,34,38,40,40,42,44,46,50,52,52,58,56,54,56,58,60,60,64,68,56,60,60,58,64,66,64]},"fused_nn_global_avg_pool2d_kernel0":{"total_latency":24, "latency":[22,24,22,22,24,24,24,24,24,26,28,28,28,24,28,24,28,24,28,26,26,30,30,28,28,30,30,30,30,30,30,30]},"fused_nn_conv2d_add_multiply_add_nn_relu_kernel0":{"total_latency":84, "latency":[76,76,84,96]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":200, "latency":[150,204]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":52, "latency":[50,56]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,18,18,18,18,18,20,20,20,20,22]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel1":{"total_latency":56, "latency":[34,36,42,46,54,64,70,70]},"fused_nn_conv2d_kernel0":{"total_latency":32, "latency":[18,20,22,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel2":{"total_latency":16, "latency":[16,16,16,16,18,18,18,18]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":44, "latency":[30,32,34,36,38,40,44,48,54,56,62,64,68,72,76,80]},"fused_nn_conv2d_2_kernel0":{"total_latency":124, "latency":[98,114]},"fused_add_nn_relu_3_kernel0":{"total_latency":20, "latency":[18,18,18,20,20,22,22,22]},"fused_nn_conv2d_add_1_kernel0":{"total_latency":60, "latency":[50,58]},"fused_nn_conv2d_add_add_nn_relu_kernel0":{"total_latency":88, "latency":[50,86]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel0":{"total_latency":20, "latency":[18,20,22,24,26,28,30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel1":{"total_latency":28, "latency":[18,20,22,24]},"fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":24, "latency":[14,14,14,16,16,18,18,18]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":24, "latency":[18,18,18,20,20,20,22,22,24,26,26,26,28,28,30,30]},"fused_nn_conv2d_add_2_kernel0":{"total_latency":40, "latency":[36,40,48,46]},"fused_nn_conv2d_add_3_kernel0":{"total_latency":36, "latency":[20,24,26,28,32,32,34]},"fused_add_14_kernel0":{"total_latency":16, "latency":[18,14,14,14,14,14,16,14]},"fused_nn_conv2d_add_add_nn_relu_1_kernel0":{"total_latency":40, "latency":[36,40,48]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":104, "latency":[102,102,110,116]},"fused_nn_conv2d_1_kernel0":{"total_latency":100, "latency":[78,98]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":136, "latency":[100,130,180,236,358,416,454,478]},"fused_add_nn_relu_2_kernel0":{"total_latency":16, 
"latency":[16,16,16,16,16,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel1":{"total_latency":48, "latency":[36,52]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":64, "latency":[62,68,72,90]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":80, "latency":[60,78,78,90]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel2":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16,16,16,16,16,16,16]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,14,16,18,18,20,18,18,20,22,22,20,22,22,24,24]},"fused_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel2":{"total_latency":16, "latency":[14,16,16,16,16,16,16,16,16,16,16,16,18,18]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":500, "latency":[192,356]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel2":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16,16,16,16,16,16,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,16,18,18,18,18,18,20,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel1":{"total_latency":68, "latency":[62,66,70,74,80,88,102,116]},"fused_nn_conv2d_3_kernel0":{"total_latency":408, "latency":[240,254]},"fused_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_add_nn_relu_2_kernel0":{"total_latency":40, "latency":[20,22,24,28,30,32,36]},"fused_nn_conv2d_add_kernel0":{"total_latency":84, "latency":[76,76,82,96]}}} -------------------------------------------------------------------------------- /resource/resnet152/resnet152.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":13233,"kernel_latency":{"fused_nn_dense_add_kernel0":{"total_latency":56, "latency":[26,26,26,28,28,30,32,34,34,36,38,40,40,44,46,48,52,54,56,60,60,62,60,62,66,58,56,56,58,64,74,64]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[12,12,14,14,14,14,14,14]},"fused_nn_conv2d_add_kernel0":{"total_latency":60, "latency":[58,70,78,84,94,94,104,110,100,104,100,98,102,100,100,102]},"fused_nn_conv2d_3_kernel0":{"total_latency":136, "latency":[108,120,132,142,150,156,162,168,168,164,168,168,170,178,176,182,180,184,184,188,194,198,204,212,220,226,236,246]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":240, "latency":[240,240,256,262,282,310,310,306,308,310,310,312,310,310,314,310]},"fused_nn_softmax_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_global_avg_pool2d_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,16,16,16,16,16,16,16,16]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":84, "latency":[82,84,90,96,106,114,130,148]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":76, "latency":[74,78,86,94,106,114,126,134]},"fused_nn_conv2d_kernel0":{"total_latency":24, "latency":[22,24,26,30]},"fused_nn_conv2d_add_add_nn_relu_2_kernel0":{"total_latency":24, "latency":[22,24,28,30,32]},"fused_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":204, 
"latency":[200,282,362,470,584,698,700,700]},"fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":84, "latency":[56,88,122,156,192,228,228]},"fused_nn_conv2d_add_3_kernel0":{"total_latency":20, "latency":[20,22,26,28]},"fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":24, "latency":[22,24,26,28]},"fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,14,16,16,16,16]},"fused_nn_conv2d_add_multiply_add_nn_relu_kernel0":{"total_latency":60, "latency":[58,70,78,86,92,104,106,108,102,102,102,102,102,100,98,100]},"fused_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_add_nn_relu_kernel0":{"total_latency":36, "latency":[34,34,34,36,36,40,44,46]},"fused_nn_conv2d_add_1_kernel0":{"total_latency":36, "latency":[34,34,34,36,36,38,40,44]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":36, "latency":[36,36,36,36]},"fused_nn_conv2d_2_kernel0":{"total_latency":88, "latency":[88,80,86,92,106,116,128,138]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":100, "latency":[100,160,230,300,370,440,440]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":160, "latency":[154,166,162,170,166,164,158,168,156,162,156,162,170,176,186,188,206,210,228,234,262,266,282,298,314,316,324,340,356,358,366,376]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":116, "latency":[116,116,118,120,124,128,130,134,138,142,146,148,154,156,162,166,174,178,184,190,196,202,208,212]},"fused_nn_conv2d_add_add_nn_relu_1_kernel0":{"total_latency":24, "latency":[18,20,22,22,22,22,22,22]},"fused_nn_conv2d_add_2_kernel0":{"total_latency":24, "latency":[18,20,20,22,22,22,20,20]},"fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":36, "latency":[36,48,58,72,86]},"fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":52, "latency":[50,54,64,74]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":32, "latency":[32,36,42,42,42,40,40,40]},"fused_add_14_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_add_nn_relu_3_kernel0":{"total_latency":16, "latency":[12,14,14,14,14,14,14,16]},"fused_nn_conv2d_1_kernel0":{"total_latency":12, "latency":[10,10,12,12]},"fused_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]}}} -------------------------------------------------------------------------------- /resource/resnet18/resnet18.param: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SJTU-IPADS/reef/58dabe0a63fe6979349b358a78aa324cca050e4a/resource/resnet18/resnet18.param -------------------------------------------------------------------------------- /resource/resnet18/resnet18.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":2189,"kernel_latency":{"fused_nn_softmax_kernel0":{"total_latency":28, "latency":[28]},"fused_nn_dense_add_kernel0":{"total_latency":16, "latency":[14,14,16,14,14,14,14,14,16,18,16,16,16,16,16,16,18,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16]},"fused_nn_global_avg_pool2d_kernel0":{"total_latency":24, "latency":[22,22,26,22,24,22,28,24,28,22,28,28,24,24,26,26,28,28,30,30,30,26,26,28,28,28,28,28,30,30,30,30]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_multiply_add_nn_re_11882905421691233276__kernel2":{"total_latency":16, "latency":[14,16,16,16,16,16,16,16,16,16,18,18,18,18,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel2":{"total_latency":16, 
"latency":[14,14,14,14,16,14,16,16,16,16,16,16,16,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel1":{"total_latency":68, "latency":[62,64,70,74,80,88,102,118]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_multiply_add_nn_re_11882905421691233276__kernel1":{"total_latency":68, "latency":[62,64,70,76,80,88,102,116]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel0":{"total_latency":16, "latency":[16,14,16,16,16,16,18,18,18,18,20,18,20,20,20,20]},"fused_nn_conv2d_3_kernel0":{"total_latency":48, "latency":[44,46,56,64,62,72,68,76,76,80,80,80,78,78,78,78,78,78,78,78]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":332, "latency":[250,350,402,530,650,768,884,998]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_kernel2":{"total_latency":16, "latency":[16,16,16,16,16,16,16,16,16,18,18,18,18,18]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":176, "latency":[174,310]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_kernel1":{"total_latency":68, "latency":[62,66,70,74,80,88,102,116]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2_kernel1":{"total_latency":28, "latency":[18,20,22,24]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":500, "latency":[188,356]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_3_kernel1":{"total_latency":28, "latency":[18,20,24,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_kernel1":{"total_latency":72, "latency":[34,38,42,46,54,64,70,70]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":180, "latency":[176,284,408]},"fused_nn_conv2d_1_kernel0":{"total_latency":76, "latency":[52,74]},"fused_add_nn_relu_3_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,16]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_3_kernel0":{"total_latency":24, "latency":[18,22,22,24,26,32,32,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_2_kernel2":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16,16,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[16,14,16,16,16,18,18,18,18,18,18,20,20,20,20,22]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel1":{"total_latency":56, "latency":[34,36,42,46,54,66,70,70]},"fused_nn_conv2d_kernel0":{"total_latency":24, "latency":[16,18,18,18,20,20,20,22,24,24,26,26,28,28,28,30]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel2":{"total_latency":16, "latency":[16,16,16,18,18,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_1_kernel2":{"total_latency":16, "latency":[14,16,16,16,16,16,16,18,16,16,18,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16,18,18,18,18,20,20,20,20]},"fused_add_10_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":24, "latency":[14,14,14,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel0":{"total_latency":20, "latency":[18,20,22,24,26,28,30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_3_kernel2":{"total_latency":20, 
"latency":[16,18,22,18,18,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel1":{"total_latency":28, "latency":[18,20,22,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_2_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,18,18,18,20,20,20,20,22,22,22]},"fused_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,14,14,16,16,14,14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_multiply_add_nn_re_11882905421691233276__kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16,18,18,18,18,20,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1_kernel1":{"total_latency":48, "latency":[38,52]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_2_kernel1":{"total_latency":48, "latency":[36,48]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2_kernel0":{"total_latency":20, "latency":[18,20,22,24,26,28,30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1_kernel2":{"total_latency":16, "latency":[16,14,16,16,18,16,16,18,16,16,16,16,18,18]},"fused_nn_conv2d_2_kernel0":{"total_latency":28, "latency":[26,28,30,30]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel1":{"total_latency":48, "latency":[36,50]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,16,16,16,18,18,20,18,18,20,20,20,22,24,22,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_kernel2":{"total_latency":16, "latency":[14,14,14,16,16,16,16,18,16,16,18,18,18,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2_kernel2":{"total_latency":20, "latency":[16,18,18,20,18,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,18,18,18,18,18,20,20,22,20,22]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_1_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,18,18,18,18,18,20,22,20,20,22]},"fused_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_1_kernel1":{"total_latency":56, "latency":[34,36,42,48,54,66,70,70]},"fused_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,18,18,18,20,20,20,20,22,22,22,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel2":{"total_latency":16, "latency":[14,14,16,16,16,16,16,18,16,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel2":{"total_latency":16, "latency":[14,14,16,14,16,16,16,16,16,16,16,16,18,18]}}} -------------------------------------------------------------------------------- /resource/vgg/vgg.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":4824,"kernel_latency":{"tvmgen_default_fused_nn_batch_flatten_kernel0":{"total_latency":14, "latency":[14,14,14,14,14,14,14,14]},"tvmgen_default_fused_nn_global_avg_pool2d_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,16,16,16,16,16,16,16,16,16,16]},"tvmgen_default_fused_nn_conv2d_add_multiply_add_nn_relu_kernel0":{"total_latency":62, 
"latency":[62,74,80,88,92,96,110,112,108,110,106,104,104,104,104,104]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":164, "latency":[164,170,186,202,186,184,172,188,172,172,164,174,182,186,192,196,204,208,228,236,270,274,292,318,322,344,348,368,370,380,382,406]},"tvmgen_default_fused_add_nn_relu_kernel0":{"total_latency":14, "latency":[14,14,14,14,14,14,14,14]},"tvmgen_default_fused_nn_conv2d_3_kernel0":{"total_latency":140, "latency":[114,132,142,154,160,162,176,188,188,184,184,184,188,194,194,202,194,198,194,198,206,206,212,220,228,232,244,254]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":252, "latency":[254,248,262,270,292,324,324,324,322,322,322,324,326,326,324,330]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":120, "latency":[120,120,120,122,126,130,136,138,142,146,150,152,158,160,164,168,176,182,188,194,200,206,210,216]},"tvmgen_default_fused_nn_conv2d_add_add_nn_relu_kernel0":{"total_latency":40, "latency":[38,38,36,38,40,42,44,46]},"tvmgen_default_fused_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,16,16]},"tvmgen_default_fused_nn_dense_add_kernel0":{"total_latency":60, "latency":[26,26,28,32,32,32,34,36,38,40,42,44,46,50,52,54,60,62,56,58,58,62,62,64,68,56,58,56,58,64,72,64]},"tvmgen_default_fused_nn_conv2d_add_1_kernel0":{"total_latency":40, "latency":[38,38,36,38,40,42,44,46]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":80, "latency":[78,82,88,100,110,120,130,142]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":204, "latency":[204,288,368,472,586,700,702,700]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":88, "latency":[60,90,124,160,194,230,230]},"tvmgen_default_fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":16, "latency":[16,16,22,18,18,18,18,20]},"tvmgen_default_fused_nn_conv2d_2_kernel0":{"total_latency":96, "latency":[92,82,90,98,108,118,130,140]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":28, "latency":[26,26,28,32]},"tvmgen_default_fused_add_kernel0":{"total_latency":14, "latency":[14,14,14,14,14,14,14,14]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":40, "latency":[36,48,60,76,90]},"tvmgen_default_fused_nn_conv2d_add_add_nn_relu_1_kernel0":{"total_latency":24, "latency":[20,22,24,24,24,24,24,24]},"tvmgen_default_fused_nn_conv2d_kernel0":{"total_latency":24, "latency":[24,28,30,34]},"tvmgen_default_fused_nn_conv2d_add_3_kernel0":{"total_latency":24, "latency":[24,24,28,30]},"tvmgen_default_fused_add_nn_relu_3_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,16,16]},"tvmgen_default_fused_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,16,16]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":60, "latency":[60,60,60,58]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":36, "latency":[36,42,46,46,46,46,46,46]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":56, "latency":[54,58,66,76]},"tvmgen_default_fused_nn_conv2d_add_add_nn_relu_2_kernel0":{"total_latency":28, "latency":[26,26,30,32,36]},"tvmgen_default_fused_nn_softmax_kernel0":{"total_latency":28, "latency":[26]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":104, "latency":[104,164,232,302,372,442,442]},"tvmgen_default_fused_nn_conv2d_add_kernel0":{"total_latency":62, 
"latency":[62,74,80,88,96,106,122,112,106,108,104,106,106,106,104,106]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":84, "latency":[84,88,92,100,106,118,136,150]},"tvmgen_default_fused_nn_conv2d_1_kernel0":{"total_latency":144, "latency":[60,104,110,142]},"tvmgen_default_fused_nn_conv2d_add_2_kernel0":{"total_latency":24, "latency":[20,22,24,24,24,24,24,24]}}} -------------------------------------------------------------------------------- /script/best_effort_kernel.py: -------------------------------------------------------------------------------- 1 | import sys, json 2 | from transform_kernel import replace_global_with_device,replace_blockIdx_with_task_idx, find_all_func_params 3 | from transform_kernel import add_device_function_param,add_global_definition, generate_function_declaration 4 | 5 | 6 | def generate_global_wrappers(func_params): 7 | result = [] 8 | for func_name, func_param in func_params.items(): 9 | params = [ 10 | {"type": "int*", "name": "preempted"}, 11 | {"type": "int*", "name": "task_slot"}, 12 | ] 13 | params.extend(func_param) 14 | params_name = [] 15 | params_type_name = [] 16 | for param in func_param: 17 | params_name.append(param["name"]) 18 | for param in params: 19 | params_type_name.append(param["type"] + " " + param["name"]) 20 | params_def = ", ".join(params_type_name) 21 | params_call = ", ".join(params_name) 22 | func_template = """ 23 | extern "C" __global__ void {func_name}({params_def}) {{ 24 | if (*preempted) return; 25 | {func_name}_device({params_call}); 26 | if (threadIdx.x + threadIdx.y + threadIdx.z == 0) 27 | atomicAdd(task_slot, 1); 28 | }} 29 | """.format( 30 | func_name = func_name, 31 | params_def = params_def, 32 | params_call = params_call 33 | ) 34 | result.extend(func_template.splitlines(True)) 35 | return result 36 | 37 | if __name__ == "__main__": 38 | 39 | if len(sys.argv) != 2: 40 | print("Usage: python best_effort_kernel.py input_file.cu") 41 | exit(0) 42 | 43 | f = open(sys.argv[1], "r") 44 | lines = f.readlines() 45 | f.close() 46 | 47 | 48 | func_params = find_all_func_params(lines) 49 | lines = replace_global_with_device(lines) 50 | lines.extend(generate_global_wrappers(func_params)) 51 | 52 | output_f_name = sys.argv[1][:sys.argv[1].rfind(".")] + ".be.cu" 53 | 54 | f = open(output_f_name, "w") 55 | f.writelines(lines) 56 | f.close() 57 | -------------------------------------------------------------------------------- /script/estimate_max_throughput.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | import math 4 | 5 | # Steps: 6 | # 1. profile the kernel: rocprof --hsa-trace ./raw_executor code_object schedule.json 1 7 | # 2. delete the first warm up profile result. 8 | # 3. use this file to estimate the potential throughput improvement. 
9 | 10 | 11 | if len(sys.argv) != 2: 12 | print("Usage: python3 estimate_max_throughput.py data.csv") 13 | exit(1) 14 | 15 | num_cu = 80 16 | 17 | df = pd.read_csv(sys.argv[1]) 18 | df = df[["grd", "wgr", "DurationNs"]] 19 | df['blocks'] = df['grd'] // df['wgr'] 20 | 21 | df['cus'] = df['blocks'].map(lambda x: num_cu - math.ceil(x / math.ceil((x / num_cu)))) 22 | df['remain'] = df['cus'] * df['DurationNs'] 23 | total = df['DurationNs'].sum() * num_cu 24 | remain = df['remain'].sum() 25 | 26 | print("%f%%" % (remain / total * 100)) -------------------------------------------------------------------------------- /script/estimate_resource_usage.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | import math 4 | import subprocess 5 | import json 6 | 7 | 8 | if len(sys.argv) < 2: 9 | print("Usage: python3 estimate_resource_usage.py model_dir") 10 | exit(1) 11 | 12 | for i in range(1, len(sys.argv)): 13 | model_dir = sys.argv[i] 14 | if (model_dir.find("Makefile") != -1): 15 | continue 16 | _, model_name = subprocess.getstatusoutput("basename " + model_dir) 17 | 18 | model_profile = json.loads(open(model_dir + "/" + model_name + ".profile.json", "r").read()) 19 | model_schedule = json.loads(open(model_dir + "/" + model_name + ".json", "r").read()) 20 | 21 | num_cu = 60 22 | 23 | total = 0 24 | used = 0 25 | 26 | for kernel_info in model_schedule["kernels"]: 27 | blocks = kernel_info["launch_params"][0] * kernel_info["launch_params"][1] * kernel_info["launch_params"][2] 28 | cus = math.ceil(blocks / math.ceil((blocks / num_cu))) 29 | latency = model_profile[kernel_info["name"]]["total_latency"] 30 | total += latency * num_cu 31 | used += latency * cus 32 | 33 | print("%s: %f%%" % (model_name, used / total * 100)) -------------------------------------------------------------------------------- /script/generate_final_schedule.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | ############################################################################ 5 | # 6 | # This script is used to combine the two json files generated from TVM 7 | # to a final schedule json file which can be used by the dnn_rt_scheduler. 8 | # 9 | # Usage: python generate_schedule.py raw_schedule_file.json graph_json.json 10 | # 11 | # The first json file(raw_schedule_file.json) is generated by TVM runtime, 12 | # which contains the basic kernel schedule and kernel parameters. 13 | # 14 | # The second json file(graph_json.json) is generated by TVM backend(modified version), 15 | # which contains host function information and device stroage information. 16 | # 17 | ############################################################################ 18 | def generate_final_schedule(source_code_lines, schedule_raw, graph): 19 | def split_function_declaration(line): 20 | parts = line.split("(") 21 | parameters_str = parts[1].split(")")[0] 22 | left_parts = parts[0].split(" ") 23 | name = left_parts[-1] 24 | return_type = left_parts[-2] 25 | header = " ".join(left_parts[:-2]) 26 | parameter_str_list = parameters_str.split(", ") 27 | parameters = [] 28 | for param_str in parameter_str_list: 29 | parts = param_str.split(" ") 30 | param_name = parts[-1] 31 | param_type = " ".join(parts[:-1]) 32 | parameters.append({"name": param_name, "type": param_type}) 33 | return header, return_type, name, parameters 34 | # 1. 
storage info from graph_json 35 | 36 | storage_id = graph["attrs"]["storage_id"][1] 37 | ## FIXME: a hack here 38 | ## to avoid buffer reuse, we replace storage_id to itself. 39 | for i in range(len(storage_id)): 40 | storage_id[i] = i 41 | 42 | storage = [] 43 | for i in range(max(storage_id) + 1): 44 | storage.append({"name": "null", "size": 0, "stype": "null"}) 45 | 46 | arg_idx = [] 47 | 48 | for i in range(len(storage_id)): 49 | shape = graph["attrs"]["shape"][1][i] 50 | t = graph["attrs"]["dltype"][1][i] 51 | size = 1 52 | for j in shape: 53 | size = size * j 54 | sid = storage_id[i] 55 | if storage[sid]["size"] < size: 56 | storage[sid]["size"] = size 57 | storage[sid]["stype"] = t 58 | 59 | for a in graph["arg_nodes"]: 60 | sid = storage_id[a] 61 | name = graph["nodes"][a]["name"] 62 | storage[sid]["name"] = name 63 | arg_idx.append(sid) 64 | 65 | # 2. append dynamic allocated storage 66 | temp_storage_begin = len(storage) 67 | for temp_arg in schedule_raw["temp_args"]: 68 | storage.append({"name": "temp_arg", "size": temp_arg, "stype": "byte"}) 69 | 70 | # 3. remap the kernel args 71 | i = 0 72 | kernels = [] 73 | node_row_ptr = graph["node_row_ptr"] 74 | for j in range(len(graph["nodes"])): 75 | node = graph["nodes"][j] 76 | if node["op"] == "null": 77 | continue 78 | if node["attrs"]["func_name"] == "__nop": 79 | continue 80 | 81 | schedule_func = schedule_raw["funcs"][i] 82 | while len(schedule_func["kernels"]) == 0: 83 | i = i + 1 84 | schedule_func = schedule_raw["funcs"][i] 85 | 86 | if schedule_func["name"] != node["attrs"]["func_name"]: 87 | raise Exception("schedule name != node name, %s != %s" % (schedule_func["name"],node["name"])) 88 | # if node["attrs"]["num_outputs"] != "1": 89 | # print(node["attrs"]["num_outputs"]) 90 | # raise Exception("node output != 1") 91 | host_inputs = [] 92 | for inp in node["inputs"]: 93 | host_inputs.append(node_row_ptr[inp[0]]+inp[1]) 94 | for idx in range(int(node["attrs"]["num_outputs"])): 95 | host_inputs.append(node_row_ptr[j]+idx) 96 | for kernel in schedule_func["kernels"]: 97 | new_args = [] 98 | for arg in kernel["args"]: 99 | if arg < 0: 100 | new_args.append(temp_storage_begin-arg-1) 101 | else: 102 | new_args.append(storage_id[host_inputs[arg]]) 103 | kernels.append({"name": kernel["name"], "launch_params": kernel["launch_params"], "args": new_args}) 104 | i = i+1 105 | 106 | output_idx = graph["heads"][0][0] 107 | storage[storage_id[output_idx]]["name"] = "output" 108 | 109 | schedule = { 110 | "storage": storage, 111 | "kernels": kernels, 112 | "args": arg_idx 113 | } 114 | 115 | # 4. 
generate shared memory usage 116 | 117 | func_name = "" 118 | shared_memory = 0 119 | 120 | result = {} 121 | 122 | for line in source_code_lines: 123 | if line.find("void") != -1: 124 | # save old values 125 | if func_name != "": 126 | if shared_memory < 4: 127 | shared_memory = 4 128 | result[func_name] = shared_memory 129 | 130 | _, _, curr_func_name, _ = split_function_declaration(line) 131 | func_name = curr_func_name 132 | shared_memory = 4 133 | if line.find("__shared__") != -1: 134 | # __shared__ float x[123]; 135 | size = line.split("[")[1].split("]")[0] 136 | shared_memory = shared_memory + int(size) * 4 137 | 138 | if func_name != "": 139 | if shared_memory < 4: 140 | shared_memory = 4 141 | result[func_name] = shared_memory 142 | 143 | schedule["shared_memory"] = result 144 | return schedule 145 | 146 | 147 | if __name__ == "__main__": 148 | if len(sys.argv) != 4: 149 | print("Useage: source_code raw_scheduler_file graph_json_file ") 150 | exit(0) 151 | 152 | f = open(sys.argv[1], "r") 153 | source_code_lines = f.readlines() 154 | f.close() 155 | 156 | f = open(sys.argv[2], "r") 157 | schedule_raw = json.loads(f.read()) 158 | f.close() 159 | 160 | f = open(sys.argv[3], "r") 161 | graph = json.loads(f.read()) 162 | f.close() 163 | 164 | schedule = generate_final_schedule(source_code_lines, schedule_raw, graph) 165 | 166 | print(json.dumps(schedule, indent = 4)) 167 | 168 | -------------------------------------------------------------------------------- /script/generate_register_hint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | def split_function_declaration(line): 5 | parts = line.strip().split("void") 6 | header = parts[0] 7 | return_type = "void" 8 | right_parts = parts[1].split("(") 9 | name = right_parts[0] 10 | parameters_str = right_parts[1].split(")")[0] 11 | parameter_str_list = parameters_str.split(", ") 12 | parameters = [] 13 | for param_str in parameter_str_list: 14 | parts = param_str.split(" ") 15 | param_name = parts[-1] 16 | param_type = " ".join(parts[:-1]) 17 | parameters.append({"name": param_name, "type": param_type}) 18 | print(name) 19 | return header, return_type, name, parameters 20 | 21 | def generate_function_declaration(return_type, name, params): 22 | params_str_list = [] 23 | for param in params: 24 | param_str = param["type"] + " " + param["name"] 25 | params_str_list.append(param_str) 26 | return return_type + " " + name + "(" + ", ".join(params_str_list) + ")" 27 | 28 | 29 | src = open(sys.argv[1], "r") 30 | lines = src.readlines() 31 | 32 | schedule = json.loads(open(sys.argv[2], "r").read()) 33 | 34 | new_lines = [] 35 | 36 | kernel_info = {} 37 | 38 | for kernel in schedule["kernels"]: 39 | kernel_info[kernel["name"]] = kernel["launch_params"][3] * kernel["launch_params"][4] * kernel["launch_params"][5] 40 | 41 | for line in lines: 42 | if line.find("__global__") != -1: 43 | _, _, func_name, params = split_function_declaration(line.strip()) 44 | func_name = func_name.strip() 45 | if func_name in kernel_info: 46 | new_func = line.replace("void", "__attribute__((amdgpu_flat_work_group_size(%d, %d))) void" % (kernel_info[func_name], kernel_info[func_name])) 47 | new_lines.append(new_func) 48 | continue 49 | new_lines.append(line) 50 | 51 | f = open(sys.argv[1], "w") 52 | f.writelines(new_lines) 53 | f.close() 54 | -------------------------------------------------------------------------------- /script/generate_shared_memory_usage.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | ############################################################ 5 | # 6 | # This script is used to insert shared memory usage into the 7 | # model schedule json file. 8 | # 9 | # The shared memory usage is extracted from the device source 10 | # code. 11 | ############################################################ 12 | 13 | if len(sys.argv) != 3: 14 | print("Usage: source_code.cpp schedule_file.json") 15 | exit(0) 16 | 17 | 18 | def split_function_declaration(line): 19 | parts = line.split("(") 20 | parameters_str = parts[1].split(")")[0] 21 | left_parts = parts[0].split(" ") 22 | name = left_parts[-1] 23 | return_type = left_parts[-2] 24 | header = " ".join(left_parts[:-2]) 25 | parameter_str_list = parameters_str.split(", ") 26 | parameters = [] 27 | for param_str in parameter_str_list: 28 | parts = param_str.split(" ") 29 | param_name = parts[-1] 30 | param_type = " ".join(parts[:-1]) 31 | parameters.append({"name": param_name, "type": param_type}) 32 | return header, return_type, name, parameters 33 | 34 | 35 | 36 | source_code_lines = open(sys.argv[1], "r").readlines() 37 | schedule = json.loads(open(sys.argv[2], "r").read()) 38 | 39 | func_name = "" 40 | shared_memory = 0 41 | 42 | result = {} 43 | 44 | for line in source_code_lines: 45 | if line.find("void") != -1: 46 | # save old values 47 | if func_name != "": 48 | if shared_memory < 4: 49 | shared_memory = 4 50 | result[func_name] = shared_memory 51 | 52 | _, _, curr_func_name, _ = split_function_declaration(line) 53 | func_name = curr_func_name 54 | shared_memory = 0 55 | if line.find("__shared__") != -1: 56 | # __shared__ float x[123]; 57 | size = line.split("[")[1].split("]")[0] 58 | shared_memory = shared_memory + int(size) * 4 59 | 60 | if func_name != "": 61 | if shared_memory < 4: 62 | shared_memory = 4 63 | result[func_name] = shared_memory 64 | 65 | schedule["shared_memory"] = result 66 | 67 | old_file_name = sys.argv[2].split(".json")[-2] 68 | new_file_name = old_file_name + "_sm.json" 69 | f = open(new_file_name, "w") 70 | f.write(json.dumps(schedule)) 71 | f.close() -------------------------------------------------------------------------------- /script/get_kernel_descriptor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | f = open(sys.argv[1], "r") 5 | lines = f.readlines() 6 | 7 | descriptors = {} 8 | 9 | is_descriptor = False 10 | 11 | for line in lines: 12 | if line.find(".amdhsa_kernel ") != -1: 13 | is_descriptor = True 14 | continue 15 | if line.find(".end_amdhsa_kernel") != -1: 16 | is_descriptor = False 17 | continue 18 | if is_descriptor == False: 19 | continue 20 | parts = line.strip().split(" ") 21 | key = parts[0] 22 | value = parts[1] 23 | 24 | if key in descriptors: 25 | values = descriptors[key] 26 | values.append(value) 27 | descriptors[key] = list(set(values)) 28 | else: 29 | values = [] 30 | values.append(value) 31 | descriptors[key] = values 32 | 33 | 34 | print(json.dumps(descriptors, sort_keys=True, indent=4)) 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /script/get_kernel_occupancy.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if len(sys.argv) != 3: 4 | print("Usage: python3 get_kernel_occupancy.py raw.s trans.s") 5 | exit(1) 6 | 7 | raw_asm = open(sys.argv[1], "r") 8 | trans_asm = 
open(sys.argv[2], "r") 9 | 10 | raw_lines = raw_asm.readlines() 11 | trans_lines = trans_asm.readlines() 12 | 13 | def get_occupancy(lines): 14 | current_kernel = "" 15 | occupancy = {} 16 | stack_size = {} 17 | for line in lines: 18 | if line.find(".amdhsa_kernel") != -1: 19 | kernel_name = line.strip().split(" ")[-1] 20 | current_kernel = kernel_name 21 | if line.find("Occupancy") != -1: 22 | occupancy[current_kernel] = int(line.strip().split(" ")[-1]) 23 | if line.find("ScratchSize") != -1: 24 | stack_size[current_kernel] = int(line.strip().split(" ")[-1]) 25 | 26 | return occupancy, stack_size 27 | 28 | 29 | raw_occupancy, raw_stack = get_occupancy(raw_lines) 30 | trans_occupancy, trans_stack = get_occupancy(trans_lines) 31 | 32 | print("Occupancy:") 33 | for kernel, occupancy in raw_occupancy.items(): 34 | if occupancy > trans_occupancy[kernel]: 35 | print("%s: %d, %d" %(kernel, occupancy, trans_occupancy[kernel])) 36 | 37 | print("Stack:") 38 | for kernel, stack in raw_stack.items(): 39 | if stack < trans_stack[kernel]: 40 | print("%s: %d, %d" %(kernel, stack, trans_stack[kernel])) -------------------------------------------------------------------------------- /script/replace_raw_occupancy.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | if len(sys.argv) != 3: 5 | print("Usage: python3 replace_raw_occupancy.py raw_source.cu trans_asm.s") 6 | exit(1) 7 | def get_occupancy(lines): 8 | current_kernel = "" 9 | occupancy = {} 10 | stack_size = {} 11 | for line in lines: 12 | if line.find(".amdhsa_kernel") != -1: 13 | kernel_name = line.strip().split(" ")[-1] 14 | current_kernel = kernel_name 15 | if line.find("Occupancy") != -1: 16 | occupancy[current_kernel] = int(line.strip().split(" ")[-1]) 17 | if line.find("ScratchSize") != -1: 18 | stack_size[current_kernel] = int(line.strip().split(" ")[-1]) 19 | 20 | return occupancy, stack_size 21 | 22 | asm = open(sys.argv[2], "r") 23 | asm_lines = asm.readlines() 24 | asm.close() 25 | 26 | asm_occupancy, _ = get_occupancy(asm_lines) 27 | 28 | raw_source = open(sys.argv[1], "r") 29 | raw_lines = raw_source.readlines() 30 | raw_source.close() 31 | 32 | new_lines = [] 33 | 34 | for line in raw_lines: 35 | if line.find('__global__') != -1: 36 | parts = line.split("void") 37 | right_part = parts[1] 38 | func_name = right_part.split("(")[0].strip() 39 | left_part = 'extern "C" __global__ __attribute__((amdgpu_waves_per_eu(%d,%d))) void ' % (asm_occupancy[func_name], asm_occupancy[func_name]) 40 | new_line = left_part + right_part 41 | new_lines.append(new_line) 42 | continue 43 | new_lines.append(line) 44 | 45 | raw_source = open(sys.argv[1], "w") 46 | raw_source.writelines(new_lines) 47 | raw_source.close() 48 | 49 | -------------------------------------------------------------------------------- /script/replace_register_usage.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if len(sys.argv) != 2: 4 | print("Usage: python3 replace_register_usage.py source.asm") 5 | exit(0) 6 | 7 | f = open(sys.argv[1], "r") 8 | lines = f.readlines() 9 | 10 | # collect max private segment size 11 | max_private_segment = 0 12 | for line in lines: 13 | if line.find(".amdhsa_private_segment_fixed_size") != -1: 14 | parts = line.strip().split(" ") 15 | key = parts[0] 16 | value = int(parts[1]) 17 | if value > max_private_segment: 18 | max_private_segment = value 19 | 20 | need_private_segment = 0 21 | if max_private_segment > 0: 22 | need_private_segment = 1
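# ---------------------------------------------------------------------------
# The sgprs_layers / vgprs_layers tables below give, for a target occupancy
# n (the numeric suffix of the *_framework_* / proxy_kernel_* names), the
# register count written back into the kernel descriptor so that, roughly,
# n waves can co-reside on one SIMD.  The helper below is only a sketch /
# sanity check of that budget; it assumes GCN-like per-SIMD limits (16K
# VGPRs across 64 lanes, i.e. 256 per lane; 800 SGPRs; at most 10 waves per
# SIMD), mirroring the constants defined below and
# GPUConfig::calculate_occupancy in src/reef/executor/hip/hip_impl.cpp.
def estimated_waves_per_simd(vgprs, sgprs):
    waves_by_vgpr = 256 // max(vgprs, 1)   # VGPR-limited waves per SIMD
    waves_by_sgpr = 800 // max(sgprs, 1)   # SGPR-limited waves per SIMD
    return min(waves_by_vgpr, waves_by_sgpr, 10)
# e.g. estimated_waves_per_simd(64, 102) == 4, consistent with
# vgprs_layers[4] == 64 below; the hand-tuned table entries may be slightly
# more permissive than this naive bound (28 VGPRs is kept for both 9 and 10).
# ---------------------------------------------------------------------------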
23 | 24 | 25 | max_sgprs_per_simd = 800 26 | max_vgprs_per_smid = 16 * 1024 27 | 28 | max_sgprs_per_wave = 102 29 | max_vgprs_per_wave = 256 30 | 31 | sgpr_block = 8 32 | 33 | sgprs_layers = [ 34 | 0, 35 | 102, 36 | 102, 37 | 102, 38 | 102, 39 | 102, 40 | 102, 41 | 102, 42 | 102, 43 | 88, 44 | 80 45 | ] 46 | 47 | vgprs_layers = [ 48 | 0, 49 | 256, 50 | 128, 51 | 84, 52 | 64, 53 | 48, 54 | 40, 55 | 36, 56 | 32, 57 | 28, 58 | 28 59 | ] 60 | 61 | 62 | 63 | def replace_text_segment_param(lines, key, value, values, key_word="_framework_"): 64 | new_lines = [] 65 | 66 | current_kernel = "" 67 | 68 | for line in lines: 69 | # replace amdhas_next_free_vgpr 70 | if line.find(".amdhsa_kernel") != -1: 71 | kernel_name = line.strip().split(" ")[-1] 72 | current_kernel = kernel_name 73 | new_lines.append(line) 74 | elif line.find(key) != -1 and current_kernel.find(key_word) != -1 and current_kernel.split(key_word)[-1].isnumeric(): 75 | num_layers = int(current_kernel.split(key_word)[-1]) 76 | new_value= "" 77 | if value == None and values == None: 78 | # print("remove %s %s" % (current_kernel, key)) 79 | continue 80 | if value != None: 81 | new_value = value 82 | else: 83 | new_value = values[num_layers] 84 | new_line = " %s %d\n" % (key, new_value) 85 | # print("replace %s %s to %d" % (current_kernel, key, new_value)) 86 | new_lines.append(new_line) 87 | else: 88 | new_lines.append(line) 89 | return new_lines 90 | 91 | def replace_symbol_segment_param(lines, key, value, values, key_word="_framework_"): 92 | new_lines = [] 93 | 94 | current_kernel = "" 95 | 96 | for line in lines: 97 | # replace amdhas_next_free_vgpr 98 | if line.find(".name:") != -1: 99 | kernel_name = line.strip().split(" ")[-1] 100 | current_kernel = kernel_name 101 | new_lines.append(line) 102 | elif line.find(key) != -1 and current_kernel.find(key_word) != -1 and current_kernel.split(key_word)[-1].isnumeric(): 103 | num_layers = int(current_kernel.split(key_word)[-1]) 104 | new_value= "" 105 | if value != None: 106 | new_value = value 107 | else: 108 | new_value = values[num_layers] 109 | new_line = " %s %d\n" % (key, new_value) 110 | # print("replace %s %s to %d" % (current_kernel, key, new_value)) 111 | new_lines.append(new_line) 112 | else: 113 | new_lines.append(line) 114 | return new_lines 115 | 116 | def batch_replace(lines, key_word, private_segment): 117 | lines = replace_text_segment_param(lines, ".amdhsa_next_free_vgpr", None, vgprs_layers, key_word) 118 | lines = replace_text_segment_param(lines, ".amdhsa_next_free_sgpr", None, sgprs_layers, key_word) 119 | if private_segment: 120 | lines = replace_text_segment_param(lines, ".amdhsa_private_segment_fixed_size", max_private_segment, None, key_word) 121 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_private_segment_wavefront_offset", need_private_segment, None, key_word) 122 | # lines = replace_text_segment_param(lines, ".amdhsa_user_sgpr_flat_scratch_init", 0, None) 123 | lines = replace_text_segment_param(lines, ".amdhsa_user_sgpr_dispatch_ptr", 0, None, key_word) 124 | lines = replace_text_segment_param(lines, ".amdhsa_system_vgpr_workitem_id", 0, None, key_word) 125 | lines = replace_text_segment_param(lines, ".amdhsa_reserve_flat_scratch", None, None, key_word) 126 | lines = replace_text_segment_param(lines, ".amdhsa_reserve_vcc", None, None, key_word) 127 | 128 | lines = replace_symbol_segment_param(lines, ".vgpr_count:", None, vgprs_layers, key_word) 129 | lines = replace_symbol_segment_param(lines, ".sgpr_count:", None, sgprs_layers, key_word) 
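    # private_segment is False for the *_nostack_* variants (see the
    # batch_replace calls below), so their descriptors keep whatever
    # private-segment size they already had (presumably zero) instead of
    # being forced up to max_private_segment.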
130 | if private_segment: 131 | lines = replace_symbol_segment_param(lines, ".private_segment_fixed_size:", max_private_segment, None, key_word) 132 | 133 | return lines 134 | 135 | lines = batch_replace(lines, "merge_framework_", True) 136 | lines = batch_replace(lines, "call_framework_", True) 137 | lines = batch_replace(lines, "merge_framework_nostack_", False) 138 | lines = batch_replace(lines, "proxy_kernel_", True) 139 | lines = batch_replace(lines, "proxy_kernel_nostack_", False) 140 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_workgroup_id_y", 1, None, "proxy_kernel_") 141 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_workgroup_id_z", 1, None, "proxy_kernel_") 142 | lines = replace_text_segment_param(lines, ".amdhsa_system_vgpr_workitem_id", 2, None, "proxy_kernel_") 143 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_workgroup_id_y", 1, None, "proxy_kernel_nostack_") 144 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_workgroup_id_z", 1, None, "proxy_kernel_nostack_") 145 | lines = replace_text_segment_param(lines, ".amdhsa_system_vgpr_workitem_id", 2, None, "proxy_kernel_nostack_") 146 | f.close() 147 | f = open(sys.argv[1], "w") 148 | f.writelines(lines) 149 | f.close() 150 | # replace vgpr_count -------------------------------------------------------------------------------- /script/tvm_generate_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from tvm import relay 4 | from tvm.relay import testing 5 | import tvm 6 | from tvm import te 7 | from tvm.contrib import graph_runtime 8 | import sys 9 | import json 10 | 11 | ##################################################### 12 | # 13 | # This is an example of how to generate source code 14 | # and schedule json from tvm. 
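# The four files written below are: the HIP device source of the compiled
# kernels, the raw per-kernel schedule json exported by the (modified) TVM
# runtime via get_schedule_json, the graph json describing storage and
# argument layout, and a packed float32 parameter blob in the simple
# "TVM_MODEL_PARAMS\0" format that REEF's ModelParamParser reads.
# A typical invocation (file names are only illustrative):
#   python3 tvm_generate_model.py resnet18.cu resnet18.raw.json resnet18.graph.json resnet18.param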
15 | # 16 | ##################################################### 17 | 18 | 19 | if len(sys.argv) != 5: 20 | print("Usage: device_source_file_name raw_schedule_file graph_json_file param_file") 21 | exit(0) 22 | 23 | source_file = open(sys.argv[1], "w") 24 | raw_schedule_file = open(sys.argv[2], "w") 25 | graph_json_file = open(sys.argv[3], "w") 26 | param_file = open(sys.argv[4], "w+b") 27 | 28 | batch_size = 1 29 | num_class = 1000 30 | image_shape = (3, 224, 224) 31 | data_shape = (batch_size,) + image_shape 32 | out_shape = (batch_size, num_class) 33 | 34 | mod, params = relay.testing.resnet.get_workload( 35 | num_layers=18, batch_size=batch_size, image_shape=image_shape 36 | ) 37 | # mod, params = relay.testing.mobilenet.get_workload( 38 | # batch_size=batch_size, image_shape=image_shape 39 | # ) 40 | 41 | opt_level = 3 42 | target = tvm.target.rocm() 43 | 44 | with tvm.transform.PassContext(opt_level=opt_level): 45 | lib = relay.build(mod, target, params=params) 46 | 47 | ctx = tvm.rocm() 48 | module = graph_runtime.GraphModule(lib["default"](ctx)) 49 | 50 | data = np.ones(data_shape).astype("float32") 51 | data = data * 10 52 | module.set_input("data", data) 53 | 54 | module.run() 55 | 56 | source_file.write(lib.get_lib().imported_modules[0].get_source("hip")) 57 | source_file.close() 58 | 59 | graph_json_file.write(lib.get_json()) 60 | graph_json_file.close() 61 | 62 | raw_schedule_file.write(module.module["get_schedule_json"]()) 63 | raw_schedule_file.close() 64 | 65 | 66 | def dump_params(params, f): 67 | import array 68 | magic = bytes("TVM_MODEL_PARAMS\0", "ascii") 69 | f.write(magic) 70 | f.write(array.array('Q',[len(params)]).tobytes()) 71 | for k in params.keys(): 72 | param = array.array('f', params[k].asnumpy().flatten().tolist()) 73 | f.write(bytes(k, "ascii")) 74 | f.write(bytes("\0", "ascii")) 75 | f.write(array.array('Q',[len(param)]).tobytes()) 76 | f.write(param.tobytes()) 77 | 78 | dump_params(params, param_file) 79 | param_file.close() 80 | 81 | out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy() 82 | print(out.flatten()[0:10]) -------------------------------------------------------------------------------- /src/example/rpc_client.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/client/client.h" 2 | 3 | int main(int argc, char** argv) { 4 | if (argc != 4) { 5 | std::cerr << "Usage: " << std::string(argv[0]) << " model_dir model_name [real_time]\n"; 6 | std::cerr << "Example: " << std::string(argv[0]) << " reef/resource/resnet18 resnet18 1\n"; 7 | return -1; 8 | } 9 | 10 | std::string model_dir(argv[1]); 11 | std::string model_name(argv[2]); 12 | int real_time = std::atoi(argv[3]); 13 | 14 | 15 | reef::client::REEFClient client(DEFAULT_REEF_ADDR); 16 | ASSERT(client.init(real_time)); // whether this client send real-time requests? 17 | 18 | std::cout << "loading '" << model_name << "' from " << "'"<< model_dir << "'\n"; 19 | auto model = client.load_model(model_dir, model_name); 20 | ASSERT(model.get() != nullptr); 21 | 22 | // Get or set the input/output data. 
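    // A typical flow (sketch; this example keeps the calls below commented
    // out): map the input blob with get_input_blob(), copy the input tensor
    // into that shared memory, call load_input() so the server picks it up,
    // run infer(), then fetch the result via get_output_blob()/get_output().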
23 | // auto input_blob = model->get_input_blob(); 24 | // model->load_input(); 25 | // auto output_blob = model->get_output_blob(); 26 | // auto output = model->get_output(); 27 | 28 | std::cout << "submit inference request\n"; 29 | 30 | auto task = model->infer(); // submit an inference request 31 | std::cout << "inference latency: " << std::chrono::duration_cast<std::chrono::microseconds>(task.finish - task.submit).count() / 1000.0 << " ms\n"; 32 | 33 | return 0; 34 | } -------------------------------------------------------------------------------- /src/example/rpc_client_cont.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/client/client.h" 2 | #include <thread> 3 | 4 | int main(int argc, char** argv) { 5 | if (argc != 5) { 6 | std::cerr << "Usage: " << std::string(argv[0]) << " model_dir model_name real_time sleep_time(ms)\n"; 7 | std::cerr << "Example: " << std::string(argv[0]) << " reef/resource/resnet18 resnet18 1 10\n"; 8 | return -1; 9 | } 10 | 11 | std::string model_dir(argv[1]); 12 | std::string model_name(argv[2]); 13 | int real_time = std::atoi(argv[3]); 14 | int sleep_time = std::atoi(argv[4]); 15 | 16 | 17 | reef::client::REEFClient client(DEFAULT_REEF_ADDR); 18 | ASSERT(client.init(real_time)); // whether this client sends real-time requests 19 | 20 | std::cout << "loading '" << model_name << "' from " << "'"<< model_dir << "'\n"; 21 | auto model = client.load_model(model_dir, model_name); 22 | ASSERT(model.get() != nullptr); 23 | 24 | // Get or set the input/output data. 25 | // auto input_blob = model->get_input_blob(); 26 | // model->load_input(); 27 | // auto output_blob = model->get_output_blob(); 28 | // auto output = model->get_output(); 29 | 30 | std::cout << "submit inference requests\n"; 31 | while (true) { 32 | auto task = model->infer(); // submit an inference request 33 | std::cout << "client " << model->get_mid() << " inference latency: " << std::chrono::duration_cast<std::chrono::microseconds>(task.finish - task.submit).count() / 1000.0 << " ms\n"; 34 | std::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); 35 | } 36 | 37 | return 0; 38 | } -------------------------------------------------------------------------------- /src/example/rpc_server.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/server/server.h" 2 | 3 | int main(int argc, char** argv) { 4 | reef::server::REEFServer server(DEFAULT_REEF_ADDR); 5 | server.run(); 6 | server.wait(); 7 | return 0; 8 | } -------------------------------------------------------------------------------- /src/reef/client/client.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/client/client.h" 2 | 3 | #include 4 | 5 | namespace reef { 6 | namespace client { 7 | 8 | REEFClient::REEFClient(const std::string &server_addr) : rpc_client(nullptr) { 9 | LOG(INFO) << "Create REEFClient to " << server_addr; 10 | 11 | rpc_client = REEFService::NewStub( 12 | grpc::CreateChannel(server_addr, grpc::InsecureChannelCredentials()) 13 | ); 14 | 15 | ASSERT_MSG(rpc_client.get() != nullptr, "cannot create rpc client"); 16 | LOG(INFO) << "Create REEFClient succeeds"; 17 | } 18 | 19 | bool REEFClient::init(bool real_time) { 20 | // set client (task queue) priority 21 | grpc::ClientContext ctx; 22 | reef::rpc::SetPriorityRequest request; 23 | reef::rpc::SetPriorityReply reply; 24 | request.set_rt(real_time); 25 | auto status = rpc_client->SetPriority(&ctx, request, &reply); 26 | ASSERT_MSG(status.ok(), status.error_message());
27 | ASSERT(reply.succ()); 28 | qid = reply.qid(); 29 | return true; 30 | } 31 | 32 | std::shared_ptr REEFClient::load_model( 33 | const std::string& model_dir, 34 | const std::string& name 35 | ) { 36 | grpc::ClientContext ctx; 37 | reef::rpc::LoadModelRequest request; 38 | reef::rpc::LoadModelReply reply; 39 | LOG(INFO) << "Loading model " << name; 40 | request.set_dir(model_dir); 41 | request.set_name(name); 42 | request.set_qid(qid); 43 | auto status = rpc_client->LoadModel(&ctx, request, &reply); 44 | ASSERT_MSG(status.ok(), status.error_message()); 45 | ASSERT(reply.succ()); 46 | std::shared_ptr model = 47 | std::make_shared( 48 | rpc_client, reply.mid(), model_dir, name 49 | ); 50 | { 51 | std::unique_lock lock(models_mtx); 52 | models.push_back(model); 53 | } 54 | return model; 55 | } 56 | 57 | ModelHandle::ModelHandle( 58 | const std::shared_ptr& _rpc_client, 59 | int32_t _mid, 60 | const std::string& _dir, 61 | const std::string& _name 62 | ) : rpc_client(_rpc_client), mid(_mid), dir(_dir), name(_name) { 63 | 64 | } 65 | 66 | // submit an inference task. wait for completion. 67 | TaskHandle ModelHandle::infer() { 68 | grpc::ClientContext ctx; 69 | reef::rpc::InferRequest request; 70 | reef::rpc::InferReply reply; 71 | request.set_mid(mid); 72 | TaskHandle t; 73 | t.submit = std::chrono::system_clock::now(); 74 | auto status = rpc_client->Infer(&ctx, request, &reply); 75 | ASSERT_MSG(status.ok(), status.error_message()); 76 | ASSERT(reply.succ()); 77 | t.finish = std::chrono::system_clock::now(); 78 | t.tid = reply.tid(); 79 | return t; 80 | } 81 | 82 | // submit an asynchronous inference task. 83 | TaskHandle ModelHandle::infer_async() { 84 | return TaskHandle(); 85 | } 86 | 87 | // get the poniter of input shared memory. 88 | std::shared_ptr ModelHandle::get_input_blob(const std::string& name) { 89 | if (input_blob.get() == nullptr) 90 | input_blob = register_blob(name, input_blob_key); 91 | return input_blob; 92 | } 93 | 94 | // get the poniter of output shared memory. 95 | std::shared_ptr ModelHandle::get_output_blob(const std::string& name) { 96 | if (output_blob.get() == nullptr) { 97 | output_blob = register_blob(name, output_blob_key); 98 | } 99 | return output_blob; 100 | } 101 | 102 | std::shared_ptr ModelHandle::register_blob(const std::string& name, std::string& key) { 103 | grpc::ClientContext ctx; 104 | reef::rpc::RegisterBlobRequest request; 105 | reef::rpc::RegisterBlobReply reply; 106 | 107 | request.set_mid(mid); 108 | request.set_name(name); 109 | auto status = rpc_client->RegisterBlob(&ctx, request, &reply); 110 | ASSERT_MSG(status.ok(), status.error_message()); 111 | ASSERT(reply.succ()); 112 | 113 | std::shared_ptr shm = 114 | std::make_shared(reply.key(), reply.size()); 115 | key = reply.key(); 116 | return shm; 117 | } 118 | 119 | // load model input in REEF server. wait for completion. 120 | void ModelHandle::load_input() { 121 | ASSERT(input_blob.get() != nullptr); 122 | grpc::ClientContext ctx; 123 | reef::rpc::SetBlobRequest request; 124 | reef::rpc::SetBlobReply reply; 125 | request.set_key(input_blob_key); 126 | auto status = rpc_client->SetBlob(&ctx, request, &reply); 127 | ASSERT_MSG(status.ok(), status.error_message()); 128 | ASSERT(reply.succ()); 129 | } 130 | 131 | // load model output in REEF server. wait for completion. 
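// (get_output() only issues a GetBlob RPC for the registered output key; the
// caller is then expected to read the bytes directly from the shared-memory
// blob returned by get_output_blob().)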
132 | void ModelHandle::get_output() { 133 | ASSERT(output_blob.get() != nullptr); 134 | grpc::ClientContext ctx; 135 | reef::rpc::GetBlobRequest request; 136 | reef::rpc::GetBlobReply reply; 137 | request.set_key(output_blob_key); 138 | auto status = rpc_client->GetBlob(&ctx, request, &reply); 139 | ASSERT_MSG(status.ok(), status.error_message()); 140 | ASSERT(reply.succ()); 141 | } 142 | 143 | int32_t ModelHandle::get_mid() const { 144 | return this->mid; 145 | } 146 | 147 | } // namespace client 148 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/client/client.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "reef/rpc/reef.grpc.pb.h" 4 | #include "reef/util/common.h" 5 | #include "reef/util/shared_memory.h" 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace reef { 14 | namespace client { 15 | 16 | using reef::rpc::REEFService; 17 | 18 | class TaskHandle { 19 | public: 20 | int32_t tid; 21 | std::chrono::system_clock::time_point submit, finish; 22 | }; 23 | 24 | // ModelHandle can be used to submit inference tasks. 25 | class ModelHandle { 26 | public: 27 | ModelHandle( 28 | const std::shared_ptr& rpc_client, 29 | int32_t _mid, 30 | const std::string& dir, 31 | const std::string& name 32 | ); 33 | // submit an inference task. wait for completion. 34 | TaskHandle infer(); 35 | 36 | // submit an asynchronous inference task. 37 | TaskHandle infer_async(); 38 | 39 | // get the pointer of input shared memory. 40 | std::shared_ptr get_input_blob(const std::string& name = "data"); 41 | 42 | // get the pointer of output shared memory. 43 | std::shared_ptr get_output_blob(const std::string& name = "output"); 44 | 45 | // load model input in REEF server. wait for completion. 46 | void load_input(); 47 | 48 | // load model output in REEF server. wait for completion. 49 | void get_output(); 50 | 51 | // TODO: FIXME 52 | void get_blob(); 53 | void set_blob(); 54 | 55 | int32_t get_mid() const; 56 | private: 57 | std::shared_ptr rpc_client; 58 | int32_t mid; 59 | std::string dir; 60 | std::string name; 61 | std::string input_blob_key, output_blob_key; 62 | std::shared_ptr input_blob; 63 | std::shared_ptr output_blob; 64 | 65 | private: 66 | std::shared_ptr register_blob(const std::string& name, std::string& key); 67 | 68 | }; 69 | 70 | 71 | // REEFClient is used to establish a connection to the REEF server 72 | // and load models into the server. 73 | class REEFClient { 74 | public: 75 | REEFClient(const std::string &server_addr); 76 | // initialize the client 77 | // Each client should be configured with a priority. 78 | // The real-time clients will share a RT task queue. 79 | // Each best-effort client will have its own BE task queue. 80 | bool init(bool real_time = false); 81 | 82 | // load a DNN model (in REEF server).
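    // The returned handle is also cached in `models` (under models_mtx) so it
    // stays alive for the lifetime of the client; the handle itself only wraps
    // the RPC stub, the server-side model id (mid) and the lazily registered
    // input/output blobs.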
83 | std::shared_ptr load_model( 84 | const std::string& model_dir, 85 | const std::string& name 86 | ); 87 | 88 | private: 89 | std::shared_ptr rpc_client; 90 | std::mutex models_mtx; 91 | std::vector> models; 92 | int32_t qid; 93 | }; 94 | 95 | 96 | } // namespace client 97 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/executor_base.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/executor/executor_base.h" 2 | #include 3 | 4 | namespace reef { 5 | namespace executor{ 6 | 7 | ExecutorBase::ExecutorBase() { 8 | 9 | } 10 | 11 | ExecutorBase::~ExecutorBase() { 12 | // TODO: free GPU memory 13 | } 14 | 15 | Status ExecutorBase::load_model_from_file( 16 | const char* json_file_path, 17 | const char* co_file_path) 18 | { 19 | GPU_RETURN_STATUS(GPUInit(0)); 20 | // CUcontext ctx; 21 | GPUDevice_t device; 22 | GPU_RETURN_STATUS(GPUDeviceGet(&device, 0)); 23 | // GPU_RETURN_STATUS(cuCtxCreate(&ctx, 0, device)); 24 | GPUModule_t mod; 25 | GPU_RETURN_STATUS(GPUModuleLoad(&mod, co_file_path)); 26 | return this->load_model_from_GPU_module(json_file_path, mod); 27 | } 28 | 29 | Status ExecutorBase::load_model_from_GPU_module( 30 | const char* json_file_path, GPUModule_t mod) { 31 | return init_executor_base(json_file_path, mod); 32 | } 33 | 34 | 35 | Status ExecutorBase::init_executor_base( 36 | const char* json_file_path, 37 | GPUModule_t mod) 38 | { 39 | base_mod = mod; 40 | 41 | // 1. load json model file 42 | model.reset(Model::from_json(json_file_path)); 43 | if (model.get() == nullptr) RETURN_STATUS(Status::NotFound); 44 | 45 | // 2. load hip kernels 46 | for (KernelInfo &kernel_info : model->kernels) { 47 | GPUFunction_t kernel; 48 | GPU_RETURN_STATUS( 49 | GPUModuleGetFunction(&kernel, mod, kernel_info.name.c_str()) 50 | ); 51 | kernels.emplace(kernel_info.name, kernel); 52 | } 53 | 54 | // 3. allocate device storage 55 | for (StorageInfo &storage_info : model->storage) { 56 | size_t stype_size = Model::get_stype_size(storage_info.stype); 57 | size_t storage_size = stype_size * storage_info.size; 58 | GPUDevicePtr_t device_ptr; 59 | std::vector temp; 60 | temp.resize(storage_size, 0); 61 | GPU_RETURN_STATUS(GPUMalloc((GPUDevicePtr_t*)&device_ptr, storage_size)); 62 | GPU_RETURN_STATUS(GPUMemcpyHtoD(device_ptr, temp.data(), storage_size)); 63 | storage.push_back(device_ptr); 64 | } 65 | 66 | // 4. 
map args to storage 67 | raw_args.reserve(model->kernels.size()); 68 | for (KernelInfo &kernel_info : model->kernels) { 69 | std::vector kernel_arg; 70 | for (size_t arg_idx : kernel_info.args) { 71 | // assert(arg_idx < storage.size()); 72 | kernel_arg.push_back(&storage[arg_idx]); 73 | } 74 | raw_args.push_back(kernel_arg); 75 | } 76 | 77 | LOG(INFO) << "create base model stream"; 78 | GPU_RETURN_STATUS(hipStreamCreateWithWindowSize(&s, 16)); 79 | return Status::Succ; 80 | } 81 | 82 | Status ExecutorBase::load_param_from_file( 83 | const char* param_file_path) { 84 | std::unique_ptr params(ModelParamParser::parse_from_file(param_file_path)); 85 | for (size_t i = 0; i < storage.size(); i++) { 86 | StorageInfo& storage_info = this->model->storage[i]; 87 | if (params->find(storage_info.name) == params->end()) 88 | continue; 89 | auto &array = params->at(storage_info.name); 90 | GPU_RETURN_STATUS(GPUMemcpyHtoD( 91 | (GPUDevicePtr_t)storage[i], array.data(), 92 | array.size() * sizeof(float))); 93 | } 94 | return Status::Succ; 95 | } 96 | 97 | Status ExecutorBase::get_data_size(const std::string& key, size_t &size) { 98 | size_t input_storage_idx; 99 | if (find_storage_idx(key, input_storage_idx) != Status::Succ) RETURN_STATUS(Status::NotFound); 100 | StorageInfo& storage_info = this->model->storage[input_storage_idx]; 101 | size = Model::get_stype_size(storage_info.stype) * storage_info.size; 102 | return Status::Succ; 103 | } 104 | 105 | Status ExecutorBase::set_input( 106 | const std::string& key, const std::vector& value) { 107 | return set_input(key, (void*)value.data(), value.size() * sizeof(float)); 108 | } 109 | 110 | Status ExecutorBase::set_input(const std::string& key, const void* value, size_t len) { 111 | size_t input_storage_idx; 112 | if (find_storage_idx(key, input_storage_idx) != Status::Succ) RETURN_STATUS(Status::NotFound); 113 | StorageInfo& storage_info = this->model->storage[input_storage_idx]; 114 | size_t storage_size = Model::get_stype_size(storage_info.stype) * storage_info.size; 115 | if (len < storage_size) RETURN_STATUS(Status::OutOfRange); 116 | GPU_RETURN_STATUS(GPUMemcpyHtoD( 117 | (GPUDevicePtr_t)this->storage[input_storage_idx], (void*)value, 118 | storage_size) 119 | ); 120 | return Status::Succ; 121 | } 122 | 123 | Status ExecutorBase::set_input(int idx, const void* value, size_t len) { 124 | if (idx >= storage.size()) RETURN_STATUS(Status::OutOfRange); 125 | StorageInfo& storage_info = this->model->storage[idx]; 126 | size_t storage_size = Model::get_stype_size(storage_info.stype) * storage_info.size; 127 | if (len < storage_size) RETURN_STATUS(Status::OutOfRange); 128 | GPU_RETURN_STATUS(GPUMemcpyHtoD( 129 | (GPUDevicePtr_t)this->storage[idx], (void*)value, 130 | storage_size) 131 | ); 132 | return Status::Succ; 133 | } 134 | 135 | Status ExecutorBase::get_output(std::vector& out) { 136 | size_t input_storage_idx; 137 | if (find_storage_idx("output", input_storage_idx) != Status::Succ) RETURN_STATUS(Status::NotFound); 138 | StorageInfo& storage_info = this->model->storage[input_storage_idx]; 139 | if (Model::get_stype_size(storage_info.stype) != sizeof(float)) RETURN_STATUS(Status::Fail); 140 | out.resize(storage_info.size); 141 | return get_data(input_storage_idx, (void*)out.data(), storage_info.size * sizeof(float)); 142 | } 143 | 144 | Status ExecutorBase::get_output(void* out, size_t len) { 145 | size_t input_storage_idx; 146 | if (find_storage_idx("output", input_storage_idx) != Status::Succ) RETURN_STATUS(Status::NotFound); 147 | StorageInfo& 
storage_info = this->model->storage[input_storage_idx]; 148 | size_t storage_size = Model::get_stype_size(storage_info.stype) * storage_info.size; 149 | if (len < storage_size) RETURN_STATUS(Status::Fail); 150 | return get_data(input_storage_idx, out, len); 151 | } 152 | 153 | Status ExecutorBase::get_data(int idx, void* out, size_t len) { 154 | if (idx >= this->storage.size()) RETURN_STATUS(Status::OutOfRange); 155 | StorageInfo& storage_info = this->model->storage[idx]; 156 | size_t storage_size = Model::get_stype_size(storage_info.stype) * storage_info.size; 157 | if (len < storage_size) RETURN_STATUS(Status::Fail); 158 | GPU_RETURN_STATUS(GPUMemcpyDtoH( 159 | out, (GPUDevicePtr_t)this->storage[idx], storage_size 160 | )); 161 | return Status::Succ; 162 | } 163 | 164 | Status ExecutorBase::find_storage_idx(const std::string& name, size_t& idx) { 165 | // TODO: O(n) -> O(1) 166 | for (size_t i = 0; i < this->storage.size(); i++) { 167 | StorageInfo& storage_info = this->model->storage[i]; 168 | if (storage_info.name == name) { 169 | idx = i; 170 | return Status::Succ; 171 | } 172 | } 173 | RETURN_STATUS(Status::NotFound); 174 | return Status::NotFound; // otherwise, the compiler thinks no return value. 175 | } 176 | 177 | size_t ExecutorBase::num_kernels() const { 178 | return model->kernels.size(); 179 | } 180 | 181 | 182 | void ExecutorBase::set_stream(GPUStream_t stream) { 183 | s = stream; 184 | } 185 | 186 | 187 | GPUStream_t ExecutorBase::stream() const { 188 | return s; 189 | } 190 | 191 | Status ExecutorBase::execute(GPUStream_t stream) { 192 | execute_to(num_kernels()); 193 | return Status::Succ; 194 | } 195 | 196 | Status ExecutorBase::execute_to(int idx, GPUStream_t stream) { 197 | for (int i = 0; i < idx; i++) { 198 | RETURN_STATUS(launch_kernel(i, stream)); 199 | } 200 | GPU_RETURN_STATUS(GPUStreamSynchronize(stream)); 201 | return Status::Succ; 202 | } 203 | 204 | Status ExecutorBase::execute_kernel(int idx, GPUStream_t stream) { 205 | if (idx >= num_kernels()) RETURN_STATUS(Status::OutOfRange); 206 | RETURN_STATUS(launch_kernel(idx, stream)); 207 | GPU_RETURN_STATUS(GPUStreamSynchronize(stream)); 208 | return Status::Succ; 209 | } 210 | 211 | Status ExecutorBase::launch_kernel(int kernel_offset, GPUStream_t stream) { 212 | int i = kernel_offset; 213 | std::string& func_name = this->model->kernels[i].name; 214 | GPUFunction_t func = this->kernels[func_name]; 215 | uint32_t *launch_params = this->model->kernels[i].launch_params; 216 | // std::cout << func_name << std::endl; 217 | GPU_RETURN_STATUS(GPUModuleLaunchKernel(func, 218 | launch_params[0], launch_params[1], launch_params[2], 219 | launch_params[3], launch_params[4], launch_params[5], 220 | 0, stream, (void **)this->raw_args[i].data(), 0 221 | )); 222 | return Status::Succ; 223 | } 224 | 225 | } // namespace executor 226 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/executor_base.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "reef/executor/model.h" 3 | #include "reef/util/common.h" 4 | 5 | 6 | #ifdef __REEF_HIP_GPU__ 7 | #include "reef/executor/hip/hip_impl.h" 8 | #endif 9 | #ifdef __REEF_CUDA_GPU__ 10 | #include "reef/executor/cuda/cuda_impl.h" 11 | #endif 12 | 13 | namespace reef { 14 | namespace executor { 15 | 16 | class ExecutorBase { 17 | public: 18 | ExecutorBase(); 19 | virtual ~ExecutorBase(); 20 | 21 | Status load_model_from_file(const char* json_file_path, const 
char* co_file_path); 22 | 23 | virtual Status load_model_from_GPU_module(const char* json_file_path, GPUModule_t module); 24 | 25 | Status load_param_from_file(const char* param_file_path); 26 | 27 | Status set_input(const std::string& key, const std::vector& value); 28 | 29 | Status set_input(int idx, const void* value, size_t len); 30 | 31 | Status set_input(const std::string& key, const void* value, size_t len); 32 | 33 | Status get_data_size(const std::string& key, size_t &size); 34 | 35 | Status get_output(std::vector& out); 36 | 37 | Status get_output(void* out, size_t len); 38 | 39 | Status get_data(int idx, void* out, size_t len); 40 | 41 | Status execute(GPUStream_t stream = GPUStreamDefault); 42 | 43 | Status execute_to(int idx, GPUStream_t stream = GPUStreamDefault); 44 | 45 | Status execute_kernel(int idx, GPUStream_t stream = GPUStreamDefault); 46 | 47 | size_t num_kernels() const; 48 | 49 | void set_stream(GPUStream_t stream); 50 | 51 | GPUStream_t stream() const; 52 | 53 | std::unique_ptr model; 54 | protected: 55 | Status init_executor_base(const char* json_file_path, GPUModule_t module); 56 | 57 | virtual Status launch_kernel(int kernel_offset, GPUStream_t stream); 58 | 59 | Status find_storage_idx(const std::string& name, size_t &idx); 60 | 61 | protected: 62 | 63 | std::vector storage; 64 | std::unordered_map kernels; 65 | std::vector> raw_args; 66 | 67 | GPUModule_t base_mod; 68 | GPUStream_t s; 69 | }; 70 | 71 | 72 | } // namespace executor 73 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/hip/hip_impl.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/executor/hip/hip_impl.h" 2 | #include "reef/util/common.h" 3 | 4 | #include 5 | 6 | namespace reef { 7 | namespace executor { 8 | 9 | uint32_t GPUConfig::get_num_cus() { 10 | // TODO: dynamic load CU nums 11 | return 60; 12 | } 13 | 14 | 15 | 16 | Status GPUConfig::get_kernel_address(const char* name, GPUModule_t mod, GPUFunctionPtr_t& ret) { 17 | hipFunction_t temp; 18 | GPU_RETURN_STATUS(hipModuleGetFunction(&temp, mod, name)); 19 | hipFunctionWGInfo_t wgInfo; 20 | GPU_RETURN_STATUS(hipFuncGetWGInfo(temp, &wgInfo)); 21 | hipDeviceptr_t temp_buf; 22 | GPU_RETURN_STATUS(hipMalloc(&temp_buf, 64)); 23 | int buf[24]; 24 | int size = 24; 25 | 26 | GPU_RETURN_STATUS(hipMemcpyDtoD(temp_buf, (hipDeviceptr_t)wgInfo.baseAddress, size)); 27 | GPU_RETURN_STATUS(hipMemcpy(buf, temp_buf, size, hipMemcpyDeviceToHost)); 28 | GPU_RETURN_STATUS(hipFree(temp_buf)); 29 | 30 | ret = wgInfo.baseAddress + *(long long int*)(&buf[4]); 31 | return Status::Succ; 32 | } 33 | 34 | Status GPUConfig::get_kernel_resource(GPUFunction_t func, KernelResource& ret) { 35 | hipFunctionWGInfo_t wg_info; 36 | GPU_RETURN_STATUS(hipFuncGetWGInfo(func, &wg_info)); 37 | ret.shared_memory = wg_info.usedLDSSize_; 38 | ret.vgprs = wg_info.usedVGPRs_; 39 | ret.sgprs = wg_info.usedSGPRs_; 40 | ret.stack_size = wg_info.privateMemSize_; 41 | return Status::Succ; 42 | } 43 | 44 | 45 | GPUConfig::KernelResource GPUConfig::max_resource( 46 | const KernelResource& kr1, const KernelResource& kr2) { 47 | KernelResource ret; 48 | ret.sgprs = std::max(kr1.sgprs, kr2.sgprs); 49 | ret.vgprs = std::max(kr1.vgprs, kr2.vgprs); 50 | ret.shared_memory = std::max(kr1.shared_memory, kr2.shared_memory); 51 | ret.stack_size = std::max(kr1.stack_size, kr2.stack_size); 52 | return ret; 53 | } 54 | 55 | int GPUConfig::calculate_occupancy(const KernelResource& 
resource, dim3 block_dim) { 56 | int vgprs = align_up(resource.vgprs, 4); 57 | int sgprs = align_up(resource.sgprs, 8); 58 | int shared_mem = align_up(resource.shared_memory, 256); 59 | int block_size = (int)align_up(block_dim.x * block_dim.y * block_dim.z, 64); 60 | 61 | int max_gpr_waves = (16 * 1024 / (vgprs * 64)) * 4; 62 | max_gpr_waves = std::min(max_gpr_waves, (800 / sgprs) * 4); 63 | max_gpr_waves = std::min(max_gpr_waves, 40); 64 | 65 | int max_gpr_blocks = max_gpr_waves * 64 / block_size; 66 | int max_shared_mem_blocks = 64 * 1024 / block_size; 67 | 68 | int max_thread_blocks = 2048 / block_size; 69 | 70 | int occupancy = std::min(max_gpr_blocks, max_shared_mem_blocks); 71 | occupancy = std::min(occupancy, max_thread_blocks); 72 | 73 | return occupancy; 74 | } 75 | 76 | 77 | bool GPUStreamEmpty(GPUStream_t s) { 78 | hipError_t res = hipStreamQuery(s); 79 | return hipSuccess == res; 80 | } 81 | 82 | 83 | } // namespace executor 84 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/hip/hip_impl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "reef/util/common.h" 4 | 5 | #define GPUInit hipInit 6 | #define GPUDeviceGet hipDeviceGet 7 | #define GPUModuleLoad hipModuleLoad 8 | #define GPUModuleGetFunction hipModuleGetFunction 9 | #define GPUMalloc hipMalloc 10 | #define GPUMemcpyHtoD hipMemcpyHtoD 11 | #define GPUMemcpyDtoH hipMemcpyDtoH 12 | #define GPUModuleLaunchKernel hipModuleLaunchKernel 13 | #define GPUStreamDefault hipStreamDefault 14 | #define GPUStreamSynchronize hipStreamSynchronize 15 | #define GPUDeviceSynchronize hipDeviceSynchronize 16 | #define GPUStreamCreate hipStreamCreate 17 | #define GPUStreamQuery hipStreamQuery 18 | #define GPUStatusOK hipSuccess 19 | #define GPUFree hipFree 20 | #define GPUWriteValue32Async hipStreamWriteValue32 21 | #define GPUClearHostQueue hipStreamClearQueue 22 | #define GPUResetCU hipResetWavefronts 23 | #define GPUMemset hipMemset 24 | 25 | #define GPU_RETURN_STATUS(cmd) \ 26 | {\ 27 | hipError_t error = cmd;\ 28 | if (error != hipSuccess) {\ 29 | LOG(ERROR) << "hip error: " << hipGetErrorString(error) << "at " << __FILE__ << ":" << __LINE__; \ 30 | return Status::Fail;\ 31 | }\ 32 | } 33 | 34 | #define ASSERT_GPU_ERROR(cmd) \ 35 | {\ 36 | hipError_t error = cmd;\ 37 | if (error != hipSuccess) {\ 38 | LOG(ERROR) << "hip error: " << hipGetErrorString(error) << "at " << __FILE__ << ":" << __LINE__; \ 39 | exit(EXIT_FAILURE);\ 40 | }\ 41 | } 42 | 43 | 44 | namespace reef { 45 | namespace executor { 46 | 47 | typedef hipDeviceptr_t GPUDevicePtr_t; 48 | typedef hipFunction_t GPUFunction_t; 49 | typedef hipDevice_t GPUDevice_t; 50 | typedef hipModule_t GPUModule_t; 51 | typedef hipError_t GPUError_t; 52 | typedef hipStream_t GPUStream_t; 53 | 54 | typedef unsigned long long int GPUFunctionPtr_t; 55 | 56 | bool GPUStreamEmpty(GPUStream_t s); 57 | 58 | class GPUConfig { 59 | public: 60 | static uint32_t get_num_cus(); 61 | static Status get_kernel_address(const char* name, GPUModule_t mod, GPUFunctionPtr_t& ret); 62 | 63 | struct KernelResource { 64 | int shared_memory; 65 | int vgprs; 66 | int sgprs; 67 | int stack_size; 68 | }; 69 | 70 | static KernelResource max_resource(const KernelResource& kr1, const KernelResource& kr2); 71 | 72 | static Status get_kernel_resource(GPUFunction_t func, KernelResource& ret); 73 | 74 | static int calculate_occupancy(const KernelResource& resource, dim3 
thread_idx); 75 | }; 76 | 77 | } // namespace executor 78 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/hybrid_executor.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/executor/hybrid_executor.h" 2 | 3 | namespace reef { 4 | namespace executor { 5 | 6 | HybridExecutor::HybridExecutor() { 7 | 8 | } 9 | 10 | HybridExecutor::~HybridExecutor() { 11 | 12 | } 13 | 14 | Status HybridExecutor::load_hybrid_model_from_file( 15 | const char* json_file_path, 16 | const char* profile_file_path, 17 | const char* trans_co_file_path, 18 | const char* preempt_co_file_path) 19 | { 20 | GPUModule_t trans_module, preempt_module; 21 | LOG(INFO) << std::string(trans_co_file_path); 22 | GPU_RETURN_STATUS(GPUModuleLoad(&trans_module, trans_co_file_path)); 23 | GPU_RETURN_STATUS(GPUModuleLoad(&preempt_module, preempt_co_file_path)); 24 | 25 | return load_hybrid_model_from_GPU_module( 26 | json_file_path, 27 | profile_file_path, 28 | trans_module, 29 | preempt_module 30 | ); 31 | } 32 | 33 | Status HybridExecutor::load_hybrid_model_from_GPU_module( 34 | const char* json_file_path, 35 | const char* profile_file_path, 36 | GPUModule_t trans_module, 37 | GPUModule_t preempt_module) 38 | { 39 | // 1. Init transformed module 40 | Status ret = load_model_from_GPU_module(json_file_path, trans_module); 41 | if (ret != Status::Succ) return ret; 42 | 43 | // 2. Init preemptable module 44 | return init_hybrid_executor( 45 | json_file_path, 46 | profile_file_path, 47 | trans_module, 48 | preempt_module 49 | ); 50 | } 51 | 52 | Status HybridExecutor::init_hybrid_executor( 53 | const char* json_file_path, 54 | const char* profile_file_path, 55 | GPUModule_t trans_module, 56 | GPUModule_t preempt_module) 57 | { 58 | // TODO: load profile 59 | preempt_mod = preempt_module; 60 | trans_mod = trans_module; 61 | 62 | // 1. load preemptable kernels 63 | size_t num_kernels = model->kernels.size(); 64 | preempt_kernels.resize(num_kernels); 65 | for (size_t i = 0; i < num_kernels; i++) { 66 | GPU_RETURN_STATUS(GPUModuleGetFunction( 67 | &preempt_kernels[i], preempt_mod, model->kernels[i].name.c_str() 68 | )); 69 | } 70 | 71 | // 2. allocate preempt flag 72 | GPU_RETURN_STATUS(GPUMalloc((GPUDevicePtr_t*)&preempt_flag, 4)); 73 | int value = 0; 74 | GPU_RETURN_STATUS(GPUMemcpyHtoD(preempt_flag, &value, 4)); 75 | // TODO: remove this 76 | GPU_RETURN_STATUS(GPUMalloc((GPUDevicePtr_t*)&task_slot_base, num_kernels*4)); 77 | GPU_RETURN_STATUS(GPUMemset(task_slot_base, 0, num_kernels*4)); 78 | 79 | // 3. prepare preemptable kernel args 80 | preempt_args.resize(num_kernels); 81 | for (int i = 0; i < num_kernels; i++) { 82 | auto &kernel_args = preempt_args[i]; 83 | kernel_args.push_back(&preempt_flag); 84 | kernel_args.push_back(&task_slot_base); 85 | auto &origin_args = raw_args[i]; 86 | for (int j = 0; j < origin_args.size(); j++) { 87 | kernel_args.push_back(origin_args[j]); 88 | } 89 | } 90 | 91 | // 4. load profiles 92 | model_profile.reset(ModelProfile::from_json(profile_file_path)); 93 | for (int i = 0; i < num_kernels; i++) { 94 | auto& kernel_arg = trans_args[i]; 95 | kernel_arg.profile = model_profile->kernel_latency[model->kernels[i].name]; 96 | } 97 | 98 | // 5. 
prepare task slots for wait-based preemption 99 | GPU_RETURN_STATUS(hipHostMalloc((void**)&task_slots_host, num_kernels * sizeof(int), hipHostMallocDefault)); 100 | GPU_RETURN_STATUS(hipHostMalloc((void**)&task_slots_host_empty, num_kernels * sizeof(int), hipHostMallocDefault)); 101 | memset(task_slots_host, 0, num_kernels * sizeof(int)); 102 | memset(task_slots_host_empty, 0, num_kernels * sizeof(int)); 103 | return Status::Succ; 104 | } 105 | 106 | 107 | Status HybridExecutor::set_preempt_flag(GPUDevicePtr_t flag) { 108 | GPU_RETURN_STATUS(GPUFree(preempt_flag)); // TODO: avoid double free 109 | preempt_flag = flag; 110 | return Status::Succ; 111 | } 112 | 113 | Status HybridExecutor::execute_preemptale(GPUStream_t stream) { 114 | for (int i = 0; i < this->model->kernels.size(); i++) { 115 | Status ret = launch_preempt_kernel(i, stream); 116 | if (ret != Status::Succ) return ret; 117 | } 118 | GPU_RETURN_STATUS(GPUStreamSynchronize(stream)); 119 | return Status::Succ; 120 | } 121 | 122 | Status HybridExecutor::reset_task_slot_async(int kernel_offset, GPUStream_t stream) { 123 | GPU_RETURN_STATUS(GPUWriteValue32Async( 124 | stream, (GPUDevicePtr_t)((char*)task_slot_base + kernel_offset * 4), 0, 0 125 | )); 126 | return Status::Succ; 127 | } 128 | 129 | Status HybridExecutor::get_reset_kernel_idx(int start_inx, int& ret) { 130 | return Status::Succ; // TODO: 131 | } 132 | 133 | void HybridExecutor::reset_task_slots(hipStream_t stream) { 134 | ASSERT_GPU_ERROR(hipMemcpyHtoDAsync(task_slot_base, task_slots_host_empty, 4 * this->num_kernels(), stream)); 135 | } 136 | 137 | void HybridExecutor::copy_be_kernel_offset( hipStream_t stream) { 138 | ASSERT_GPU_ERROR(hipMemcpyDtoHAsync(task_slots_host, task_slot_base, 4 * this->num_kernels(), stream)); 139 | } 140 | 141 | int HybridExecutor::get_be_kernel_offset(int begin) { 142 | // TODO: binary search 143 | for (int i = begin; i < this->num_kernels(); i++) { 144 | int finished_num = task_slots_host[i]; 145 | int required_num = trans_args[i].block_num; 146 | 147 | if (finished_num < required_num) return i; 148 | } 149 | // for (int i = begin; i > 0) 150 | return this->num_kernels(); 151 | } 152 | 153 | Status HybridExecutor::launch_preempt_kernel(int kernel_offset, GPUStream_t stream) { 154 | KernelArg &kernel_arg = trans_args[kernel_offset]; 155 | GPUFunction_t func = preempt_kernels[kernel_offset]; 156 | // LOG(INFO) << "launch " << kernel_offset; 157 | GPUDevicePtr_t task_slot = (GPUDevicePtr_t)((char*)task_slot_base + kernel_offset * 4); 158 | this->preempt_args[kernel_offset][1] = &task_slot; // TODO: 159 | GPU_RETURN_STATUS(GPUModuleLaunchKernel(func, 160 | kernel_arg.task_dim.x, kernel_arg.task_dim.y, kernel_arg.task_dim.z, 161 | kernel_arg.thread_dim.x, kernel_arg.thread_dim.y, kernel_arg.thread_dim.z, 162 | 0, stream, (void**)(this->preempt_args[kernel_offset].data()), 0 163 | )); 164 | return Status::Succ; 165 | } 166 | 167 | } // namespace executor 168 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/hybrid_executor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "reef/executor/trans_executor.h" 3 | 4 | 5 | namespace reef { 6 | namespace server { 7 | class REEFScheduler; 8 | } // namespace server 9 | namespace executor { 10 | 11 | // HybridExecutor contains two version of the model 12 | // (1) transformed version, which is used to perform dynamic kernel padding 13 | // (2) preemptable version, which 
is used to perform reset-based preemption (for best-effort tasks). 14 | // 15 | // The transformed version is inherited from TransExecutor. 16 | // 17 | // The preemptable version adds preemption flag based on the raw model. 18 | class HybridExecutor : public TransExecutor { 19 | 20 | friend class server::REEFScheduler; 21 | 22 | public: 23 | HybridExecutor(); 24 | virtual ~HybridExecutor(); 25 | Status load_hybrid_model_from_file( 26 | const char* json_file_path, 27 | const char* profile_file_path, 28 | const char* trans_co_file_path, 29 | const char* preempt_co_file_path); 30 | 31 | Status load_hybrid_model_from_GPU_module( 32 | const char* json_file_path, 33 | const char* profile_file_path, 34 | GPUModule_t trans_module, 35 | GPUModule_t preempt_module 36 | ); 37 | 38 | Status execute_preemptale(GPUStream_t stream = GPUStreamDefault); 39 | 40 | Status set_preempt_flag(GPUDevicePtr_t flag); 41 | 42 | Status reset_task_slot_async(int kernel_offset, GPUStream_t stream); 43 | 44 | Status get_reset_kernel_idx(int start_inx, int& ret); 45 | 46 | void reset_task_slots(hipStream_t stream); 47 | 48 | void copy_be_kernel_offset(hipStream_t stream); 49 | 50 | int get_be_kernel_offset(int begin); 51 | protected: 52 | Status init_hybrid_executor( 53 | const char* json_file_path, 54 | const char* profile_file_path, 55 | GPUModule_t trans_module, 56 | GPUModule_t preempt_module 57 | ); 58 | 59 | Status launch_preempt_kernel(int kernel_offset, GPUStream_t stream); 60 | 61 | protected: 62 | GPUModule_t preempt_mod; 63 | GPUModule_t trans_mod; 64 | 65 | std::vector preempt_kernels; 66 | std::vector> preempt_args; 67 | GPUDevicePtr_t preempt_flag; 68 | GPUDevicePtr_t task_slot_base; // TODO: remove this 69 | 70 | int* task_slots_host_empty; 71 | int* task_slots_host; 72 | 73 | std::shared_ptr model_profile; 74 | }; 75 | 76 | } // namespace executor 77 | } // namespace reef 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /src/reef/executor/model.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "reef/executor/model.h" 5 | #include "reef/util/json.h" 6 | 7 | namespace reef { 8 | namespace executor { 9 | 10 | using reef::util::JsonObject; 11 | using reef::util::JsonParser; 12 | 13 | Model* Model::from_json(const char* json_file) { 14 | std::ifstream fs(json_file); 15 | std::string tmp, str = ""; 16 | 17 | while (getline(fs, tmp)) str += tmp; 18 | fs.close(); 19 | 20 | JsonObject* jobj = JsonParser::parse(str); 21 | 22 | Model* m = new Model; 23 | for (auto sinfo : jobj->mval["storage"]->lval) { 24 | m->storage.push_back(StorageInfo{ 25 | sinfo->mval["name"]->sval, 26 | sinfo->mval["size"]->ival, 27 | sinfo->mval["stype"]->sval 28 | }); 29 | } 30 | 31 | for (auto kinfo : jobj->mval["kernels"]->lval) { 32 | KernelInfo k; 33 | 34 | k.name = kinfo->mval["name"]->sval; 35 | for (auto arg : kinfo->mval["args"]->lval) 36 | k.args.push_back(arg->ival); 37 | 38 | assert(kinfo->mval["launch_params"]->lval.size() == 6); 39 | for (int i = 0; i < 6; i++) 40 | k.launch_params[i] = kinfo->mval["launch_params"]->lval[i]->ival; 41 | 42 | m->kernels.push_back(k); 43 | } 44 | 45 | for (auto arg : jobj->mval["args"]->lval) { 46 | m->args.push_back(arg->ival); 47 | } 48 | 49 | for (auto shared_memory : jobj->mval["shared_memory"]->mval) { 50 | m->shared_memory[shared_memory.first] = shared_memory.second->ival; 51 | } 52 | delete jobj; 53 | 54 | return m; 55 | } 56 | 57 | ModelProfile* 
ModelProfile::from_json(const char* json_file) { 58 | std::ifstream fs(json_file); 59 | std::string tmp, str = ""; 60 | 61 | while (getline(fs, tmp)) str += tmp; 62 | fs.close(); 63 | 64 | JsonObject* jobj = JsonParser::parse(str); 65 | ModelProfile* model_profile = new ModelProfile; 66 | model_profile->model_latency = jobj->mval["model_latency"]->ival; 67 | 68 | for (auto &pair : jobj->mval["kernel_latency"]->mval) { 69 | const std::string& kernel_name = pair.first; 70 | KernelProfile profile; 71 | auto kernel_profile = pair.second->mval; 72 | profile.total_latency = kernel_profile["total_latency"]->ival; 73 | for (auto value : kernel_profile["latency"]->lval) { 74 | profile.latency.push_back(value->ival); 75 | } 76 | model_profile->kernel_latency.insert({kernel_name, profile}); 77 | } 78 | delete jobj; 79 | return model_profile; 80 | } 81 | 82 | #define PARAM_MAGIC "TVM_MODEL_PARAMS" 83 | 84 | ModelParam* ModelParamParser::parse_from_file(const char* param_file) { 85 | FILE* fp; 86 | fp = fopen(param_file, "rb"); 87 | char magic[sizeof(PARAM_MAGIC)]; 88 | size_t res = fread(magic, sizeof(char), sizeof(PARAM_MAGIC), fp); 89 | assert(res == sizeof(PARAM_MAGIC)); 90 | assert(std::string(magic) == PARAM_MAGIC); 91 | 92 | uint64_t params_size; 93 | res = fread(¶ms_size, sizeof(uint64_t), 1, fp); 94 | assert(res == 1); 95 | assert(params_size != 0); 96 | 97 | ModelParam* params = new ModelParam(params_size); 98 | for (uint64_t i = 0; i < params_size; i++) { 99 | char key_buf[256]; 100 | uint64_t key_len = 0; 101 | while(true) { 102 | char c; 103 | res = fread(&c, sizeof(char), 1, fp); 104 | assert(res == 1); 105 | key_buf[key_len] = c; 106 | key_len++; 107 | if (c == '\0') break; 108 | } 109 | std::string key(key_buf); 110 | uint64_t array_size; 111 | res = fread(&array_size, sizeof(uint64_t), 1, fp); 112 | assert(res == 1); 113 | assert(array_size != 0); 114 | std::vector array(array_size); 115 | array.resize(array_size); 116 | res = fread(array.data(), sizeof(float), array_size, fp); 117 | assert(res == array_size); 118 | params->insert({key, array}); 119 | } 120 | return params; 121 | } 122 | 123 | std::string ModelProfile::to_json() { 124 | std::ostringstream ss; 125 | 126 | ss << "{\"model_latency\":" << model_latency << ",\"kernel_latency\":{"; 127 | size_t i = 0; 128 | for (auto pair : this->kernel_latency) { 129 | ss << "\"" << pair.first << "\":{\"total_latency\":" << pair.second.total_latency << ", \"latency\":[" ; 130 | 131 | size_t j = 0; 132 | for (auto latency : pair.second.latency) { 133 | ss << latency; 134 | j++; 135 | if (j != pair.second.latency.size()) { 136 | ss << ","; 137 | } 138 | } 139 | ss << "]}"; 140 | i++; 141 | if (i != this->kernel_latency.size()) ss << ","; 142 | } 143 | ss << "}}"; 144 | return ss.str(); 145 | } 146 | 147 | size_t Model::get_stype_size(std::string &stype) { 148 | if (stype == "float32") return 4; 149 | if (stype == "int64") return 8; 150 | if (stype == "byte") return 1; 151 | if (stype == "uint1") return 1; 152 | if (stype == "int32") return 4; 153 | std::cout << stype << " is undefined" << std::endl; 154 | assert(false); 155 | return 0; 156 | } 157 | 158 | } // namespace executor 159 | } // namepsace reef -------------------------------------------------------------------------------- /src/reef/executor/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace reef { 12 | namespace 
executor { 13 | 14 | class StorageInfo { 15 | public: 16 | std::string name; 17 | size_t size; 18 | std::string stype; 19 | }; 20 | 21 | class KernelInfo { 22 | public: 23 | std::string name; 24 | uint32_t launch_params[6]; 25 | std::vector args; 26 | }; 27 | 28 | class Model { 29 | public: 30 | std::vector<StorageInfo> storage; 31 | std::vector<KernelInfo> kernels; 32 | std::vector args; 33 | std::unordered_map shared_memory; 34 | 35 | public: 36 | static Model* from_json(const char* json_file); 37 | static size_t get_stype_size(std::string &stype); 38 | }; 39 | 40 | class KernelProfile { 41 | public: 42 | std::vector latency; // microsecond 43 | int total_latency; 44 | int estimated_latency(int occupancy, int task_num_per_block); 45 | }; 46 | 47 | class ModelProfile { 48 | public: 49 | int model_latency; 50 | std::unordered_map<std::string, KernelProfile> kernel_latency; 51 | std::string to_json(); 52 | static ModelProfile* from_json(const char* json_file); 53 | }; 54 | 55 | typedef std::unordered_map<std::string, std::vector<float>> ModelParam; 56 | 57 | class ModelParamParser { 58 | public: 59 | static ModelParam* parse_from_file(const char* param_file); 60 | }; 61 | 62 | 63 | 64 | } // namespace executor 65 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/trans_executor.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/executor/trans_executor.h" 2 | 3 | namespace reef { 4 | namespace executor { 5 | 6 | TransExecutor::TransExecutor() {} 7 | TransExecutor::~TransExecutor() {} 8 | 9 | Status TransExecutor::load_model_from_GPU_module(const char* json_file_path, GPUModule_t module) { 10 | Status ret = init_executor_base(json_file_path, module); 11 | if (ret != Status::Succ) return ret; 12 | return init_rt_executor(json_file_path, module); 13 | } 14 | 15 | 16 | Status TransExecutor::init_rt_executor(const char* json_file_path, GPUModule_t module) { 17 | size_t num_kernel_calls = model->kernels.size(); 18 | int num_cus = GPUConfig::get_num_cus(); 19 | trans_args.resize(num_kernel_calls); 20 | 21 | bool need_load_kernels = true; // TODO: move to class config 22 | 23 | // 1. fill in the trans_args, which will be used to launch the transformed kernels 24 | for (size_t i = 0; i < num_kernel_calls; i++) { 25 | KernelArg &kernel_arg = trans_args[i]; 26 | std::string& kernel_name = model->kernels[i].name; 27 | 28 | uint32_t *launch_params = model->kernels[i].launch_params; 29 | kernel_arg.task_dim = dim3(launch_params[0],launch_params[1], launch_params[2]); 30 | kernel_arg.thread_dim = dim3(launch_params[3],launch_params[4], launch_params[5]); 31 | kernel_arg.block_num = launch_params[0] * launch_params[1] * launch_params[2]; 32 | kernel_arg.block_offset = 0; 33 | kernel_arg.cu_lower = 0; 34 | kernel_arg.cu_upper = GPUConfig::get_num_cus(); 35 | 36 | if (need_load_kernels) { 37 | RETURN_STATUS( 38 | GPUConfig::get_kernel_address( 39 | kernel_name.c_str(), module, kernel_arg.funcion_pointer 40 | ) 41 | ); 42 | kernel_arg.kernel = kernels[kernel_name]; 43 | RETURN_STATUS( 44 | GPUConfig::get_kernel_resource( 45 | kernel_arg.kernel, 46 | kernel_arg.resource 47 | ) 48 | ); 49 | } 50 | } 51 | 52 | // 2. 
prepare REAL kernel params (model params) 53 | size_t num_total_kernel_args = 0; 54 | size_t func_args_ptr_buffer_size = 0; 55 | for (size_t i = 0; i < num_kernel_calls; i++) { 56 | num_total_kernel_args += raw_args[i].size(); 57 | } 58 | 59 | func_args_ptr_buffer_size = align_up(num_total_kernel_args * sizeof(float *), (size_t)4096); 60 | 61 | GPU_RETURN_STATUS( 62 | GPUMalloc((GPUDevicePtr_t*)&func_args_base_ptr, func_args_ptr_buffer_size) 63 | ); 64 | size_t func_args_offset = 0; 65 | 66 | std::vector host_args(num_total_kernel_args); 67 | 68 | for (size_t i = 0; i < num_kernel_calls; i++) { 69 | KernelArg &kernel_arg = trans_args[i]; 70 | kernel_arg.args = (GPUDevicePtr_t)( 71 | (size_t)func_args_base_ptr + func_args_offset * sizeof(float*) 72 | ); 73 | for (size_t arg_idx : model->kernels[i].args) { 74 | host_args[func_args_offset] = storage[arg_idx]; 75 | func_args_offset ++; 76 | } 77 | } 78 | 79 | GPU_RETURN_STATUS( 80 | GPUMemcpyHtoD((GPUDevicePtr_t)func_args_base_ptr, (void*)host_args.data(), num_total_kernel_args * sizeof(float*)) 81 | ); 82 | 83 | // 3. calculate num_layers 84 | if (need_load_kernels) { 85 | for (size_t i = 0; i < num_kernel_calls; i++) { 86 | KernelArg &kernel_arg = trans_args[i]; 87 | KernelInfo &info = model->kernels[i]; 88 | std::string &kernel_name = info.name; 89 | GPUFunction_t func = kernels[kernel_name]; 90 | 91 | int max_layers = GPUConfig::calculate_occupancy( 92 | kernel_arg.resource, 93 | kernel_arg.thread_dim 94 | ); 95 | int num_layers = align_up(kernel_arg.block_num, num_cus) / num_cus; 96 | if (num_layers > max_layers) num_layers = max_layers; 97 | kernel_arg.min_occupancy = num_layers; 98 | } 99 | } 100 | 101 | // 4. prepare proxy kernels 102 | proxy_kernels.resize(10); 103 | proxy_kernels_nostack.resize(10); 104 | for (int i = 1; i <= 10; i++) { 105 | { 106 | std::stringstream kernel_name; 107 | kernel_name << REEF_PROXY_KERNEL_PREFIX() << i; 108 | 109 | GPUFunction_t proxy_kernel; 110 | GPU_RETURN_STATUS(GPUModuleGetFunction( 111 | &proxy_kernel, module, kernel_name.str().c_str()) 112 | ); 113 | proxy_kernels[i-1] = proxy_kernel; 114 | } 115 | 116 | { 117 | std::stringstream kernel_name; 118 | kernel_name << REEF_PROXY_KERNEL_NOSTACK_PREFIX() << i; 119 | 120 | GPUFunction_t proxy_kernel; 121 | GPU_RETURN_STATUS(GPUModuleGetFunction( 122 | &proxy_kernel, module, kernel_name.str().c_str())); 123 | proxy_kernels_nostack[i-1] = proxy_kernel; 124 | } 125 | } 126 | GPUConfig::KernelResource kr; 127 | RETURN_STATUS(GPUConfig::get_kernel_resource(proxy_kernels[0], kr)); 128 | max_stack_size = kr.stack_size; // TODO: move to GPU interface 129 | return Status::Succ; 130 | } 131 | 132 | Status TransExecutor::launch_kernel(int kernel_offset, GPUStream_t stream) { 133 | std::string& func_name = this->model->kernels[kernel_offset].name; 134 | GPUFunction_t func = this->kernels[func_name]; 135 | int num_cus = GPUConfig::get_num_cus(); 136 | uint32_t *launch_params = this->model->kernels[kernel_offset].launch_params; 137 | 138 | KernelArg &kernel_arg = this->trans_args[kernel_offset]; 139 | int logical_layers = align_up(kernel_arg.block_num, num_cus) / num_cus; 140 | int cu_partition = align_up(kernel_arg.block_num, logical_layers) / logical_layers; 141 | void* placeholder = nullptr; 142 | 143 | void *arg[] = { 144 | &placeholder, 145 | &(kernel_arg.min_occupancy), 146 | &(kernel_arg.block_num), 147 | &(kernel_arg.block_offset), 148 | &(kernel_arg.args), 149 | 150 | // These args are not actually used. 
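// (These fill the slots reserved for a second, padded kernel -- func_r, layers_r,
// task_num_r, task_offset_r, param_r in the transformed signature described in
// trans_executor.h. When a transformed kernel is launched on its own, as here,
// only the first set of args plus cu_upper is meaningful.)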
151 | &placeholder, 152 | &(kernel_arg.min_occupancy), 153 | &(kernel_arg.block_num), 154 | &(kernel_arg.block_offset), 155 | &(kernel_arg.args), 156 | 157 | 158 | &(kernel_arg.cu_upper), 159 | }; 160 | // assert(this->model->shared_memory.find(func_name) != this->model->shared_memory.end()); 161 | // std::cout << "shared: " << this->base_executor->model->shared_memory[func_name] << std::endl; 162 | // unsigned int logical_work_groups = launch_params[0] * launch_params[1] * launch_params[2]; 163 | // unsigned int num_layers = align_up(logical_work_groups, (unsigned int) GPU_NUM_CU) / GPU_NUM_CU; 164 | // unsigned int physical_work_groups = num_layers * GPU_NUM_CU; // align_up(logical_work_groups, num_layers) / num_layers; 165 | 166 | // std::cout << func_name << std::endl; 167 | GPU_RETURN_STATUS(GPUModuleLaunchKernel(func, 168 | num_cus * kernel_arg.min_occupancy, 1, 1, 169 | launch_params[3] * launch_params[4] * launch_params[5], 1, 1, 170 | 128, stream, arg, 0 171 | )); 172 | return Status::Succ; 173 | } 174 | 175 | GPUFunction_t TransExecutor::get_proxy_kernel(const GPUConfig::KernelResource& kr) { 176 | // FIXME: move to GPU interface 177 | static int sgpr_bound[] = { 178 | 102, 102, 102, 102, 102, 179 | 102, 102, 102, 88, 80 180 | }; 181 | 182 | static int vgpr_bound[] = { 183 | 256, 128, 84, 64, 48, 184 | 40, 36, 32, 28, 28 185 | }; 186 | int sgpr_idx = 0, vgpr_idx = 0; 187 | int occupancy = 10; 188 | for (int i = 1; i < 10; i++) { 189 | if (kr.vgprs > vgpr_bound[i] || kr.sgprs > sgpr_bound[i]) { 190 | assert(i > 0); 191 | occupancy = i; 192 | break; 193 | } 194 | } 195 | if (kr.stack_size > 0) 196 | return proxy_kernels[occupancy - 1]; 197 | else 198 | return proxy_kernels_nostack[occupancy - 1]; 199 | } 200 | 201 | } // namespace executor 202 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/trans_executor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "reef/executor/executor_base.h" 3 | 4 | 5 | 6 | namespace reef { 7 | namespace executor { 8 | 9 | 10 | // TransExecutor is used for both real-time tasks and best-effort tasks. 11 | // Instead of using the raw GPU code from the DL compiler, TransExecutor executes 12 | // transformed GPU code that supports dynamic kernel padding. 13 | // There are mainly two transformations: 14 | // 1. kernel args 15 | // The original kernel looks like: 16 | // 17 | // __global__ void foo(float* a, float* b) { ... } 18 | // 19 | // The transformed kernel looks like: 20 | // 21 | // __global__ void foo( 22 | // void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l, 23 | // void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r, 24 | // int cu_partition) { ... } 25 | // 26 | // The params `param_l` and `param_r` consist of the original kernel args `float* a, float* b`. 27 | // 28 | // The execution of the transformed kernel must follow the `persistent thread` style. 29 | // 30 | // TODO: currently, dynamic kernel padding only supports 2 kernels with float params. 31 | // 32 | // 2. proxy kernel 33 | // The new transformed kernels can be called in two ways: 34 | // (1) directly launch the new kernel with new params (usually for testing). 35 | // (2) through a proxy kernel 36 | // 37 | // Currently, the proxy kernel has the same args as the transformed kernels. 
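//
// For illustration only: a minimal, hypothetical sketch of the persistent-thread
// shape such a transformed kernel takes. The real transformed code shipped with
// each model additionally handles the second padded kernel and CU partitioning
// via cu_partition, so it differs in detail.
//
// __global__ void foo_trans(
//     void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l,
//     void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r,
//     int cu_partition) {
//   float* a = param_l[0];   // original kernel args recovered from param_l
//   float* b = param_l[1];
//   // Each persistent block repeatedly claims a logical block id until all
//   // task_num_l logical blocks are done.
//   for (int task = blockIdx.x; task < task_num_l; task += gridDim.x) {
//     int logical_block = task_offset_l + task;
//     // ... original body of foo, with blockIdx.x replaced by logical_block ...
//   }
// }
//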
38 | 39 | class TransExecutor : public ExecutorBase { 40 | public: 41 | TransExecutor(); 42 | virtual ~TransExecutor(); 43 | virtual Status load_model_from_GPU_module(const char* json_file_path, GPUModule_t module) override; 44 | 45 | public: 46 | 47 | class KernelArg { 48 | public: 49 | GPUFunction_t kernel; 50 | GPUFunctionPtr_t funcion_pointer; 51 | dim3 task_dim; 52 | dim3 thread_dim; 53 | // GPUDeviceptr_t task_slots; 54 | int block_num; 55 | int block_offset; 56 | int cu_lower; 57 | int cu_upper; 58 | GPUDevicePtr_t args; 59 | 60 | int min_occupancy; // This is the minimal required occupancy for real-time task. 61 | GPUConfig::KernelResource resource; 62 | KernelProfile profile; 63 | }; 64 | 65 | protected: 66 | Status init_rt_executor(const char* json_file_path, GPUModule_t module); 67 | 68 | virtual Status launch_kernel(int kernel_offset, GPUStream_t stream) override; 69 | 70 | GPUFunction_t get_proxy_kernel(const GPUConfig::KernelResource& kr); 71 | protected: 72 | std::vector trans_args; 73 | GPUDevicePtr_t func_args_base_ptr; 74 | 75 | std::vector proxy_kernels; 76 | std::vector proxy_kernels_nostack; 77 | int max_stack_size; 78 | 79 | virtual std::string REEF_PROXY_KERNEL_PREFIX() const { 80 | return "merge_framework_"; 81 | } 82 | virtual std::string REEF_PROXY_KERNEL_NOSTACK_PREFIX() const { 83 | return "merge_framework_nostack_"; 84 | } 85 | 86 | }; 87 | 88 | } // namespace executor 89 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/protos/reef.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package reef.rpc; 4 | 5 | service REEFService { 6 | // load a DNN model 7 | rpc LoadModel (LoadModelRequest) returns (LoadModelReply) {} 8 | 9 | // create a Task queue 10 | rpc SetPriority (SetPriorityRequest) returns (SetPriorityReply) {} 11 | 12 | // register shared memory 13 | rpc RegisterBlob (RegisterBlobRequest) returns (RegisterBlobReply) {} 14 | 15 | // memcpy device to host 16 | rpc GetBlob (GetBlobRequest) returns (GetBlobReply) {} 17 | 18 | // memcpy host to device 19 | rpc SetBlob (SetBlobRequest) returns (SetBlobReply) {} 20 | 21 | // create an inference task 22 | rpc Infer (InferRequest) returns (InferReply) {} 23 | } 24 | 25 | message LoadModelRequest { 26 | string dir = 1; 27 | string name = 2; 28 | int32 qid = 3; 29 | } 30 | 31 | message LoadModelReply { 32 | bool succ = 1; // TODO: enums 33 | int32 mid = 2; 34 | } 35 | 36 | message SetPriorityRequest { 37 | bool rt = 1; 38 | } 39 | 40 | message SetPriorityReply { 41 | bool succ = 1; 42 | int32 qid = 2; 43 | } 44 | 45 | message RegisterBlobRequest { 46 | int32 mid = 1; 47 | string name = 2; 48 | } 49 | 50 | message RegisterBlobReply { 51 | bool succ = 1; 52 | string key = 2; 53 | int64 size = 3; 54 | } 55 | 56 | message GetBlobRequest { 57 | string key = 1; 58 | } 59 | 60 | message GetBlobReply { 61 | bool succ = 1; 62 | } 63 | 64 | message SetBlobRequest { 65 | string key = 1; 66 | } 67 | 68 | message SetBlobReply { 69 | bool succ = 1; 70 | } 71 | 72 | message InferRequest { 73 | int32 mid = 1; 74 | bool async = 2; 75 | } 76 | 77 | message InferReply { 78 | bool succ = 1; 79 | int32 tid = 2; 80 | } -------------------------------------------------------------------------------- /src/reef/rpc/placeholder: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SJTU-IPADS/reef/58dabe0a63fe6979349b358a78aa324cca050e4a/src/reef/rpc/placeholder -------------------------------------------------------------------------------- /src/reef/server/scheduler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "reef/util/threadsafe_queue.h" 9 | #include "reef/util/common.h" 10 | #include "reef/executor/hybrid_executor.h" 11 | 12 | namespace reef { 13 | namespace server { 14 | 15 | class REEFScheduler { 16 | class Model; 17 | public: 18 | typedef uint32_t ModelID; 19 | typedef uint32_t QueueID; 20 | typedef uint32_t TaskID; 21 | 22 | enum ScheduleMode { 23 | NoPreempt, // no preemption 24 | MultiStream, // multiple GPU streams 25 | WaitPreempt, // wait-based preemption 26 | REEF, 27 | Reset // reset-based preemption without DKP 28 | }; 29 | 30 | enum TaskQueueType { 31 | RealTimeQueue, 32 | BestEffortQueue, 33 | }; 34 | 35 | enum TaskState { 36 | Init, 37 | Waiting, 38 | Executing, 39 | Finish 40 | }; 41 | 42 | struct Task { 43 | friend REEFScheduler; 44 | private: 45 | std::shared_ptr model; 46 | QueueID qid; 47 | TaskID id; 48 | volatile TaskState state; 49 | int launch_offset; // the kernel idx that has been launched to host queue 50 | int kernel_offset; // the kernel idx that has been executed 51 | int block_offset; // for DKP 52 | std::mutex mtx; 53 | std::condition_variable cv; 54 | std::chrono::system_clock::time_point submit; // when this task is created 55 | std::chrono::system_clock::time_point start; // when this task is scheduled 56 | std::chrono::system_clock::time_point end; // when this task is completed 57 | bool preempted; 58 | bool padding; 59 | bool padding_to_finish; 60 | public: 61 | bool is_preempted() const; 62 | bool is_padded() const; 63 | bool is_padded_to_complete() const; 64 | std::vector get_timestamp() const; 65 | }; 66 | 67 | public: 68 | REEFScheduler(ScheduleMode _mode = ScheduleMode::REEF); 69 | ~REEFScheduler(); 70 | 71 | Status load_model( 72 | const std::string& model_dir, 73 | const std::string& model_name, 74 | ModelID& mid 75 | ); 76 | 77 | Status load_model( 78 | const std::string& rt_co_path, 79 | const std::string& be_co_path, 80 | const std::string& json_path, 81 | const std::string& profile_path, 82 | const std::string& param_path, 83 | ModelID& mid 84 | ); 85 | 86 | Status create_queue( 87 | const TaskQueueType& qtp, 88 | QueueID& qid 89 | ); 90 | 91 | Status bind_model_queue( 92 | const QueueID& qid, 93 | const ModelID& mid 94 | ); 95 | 96 | Status get_data_size(ModelID mid, const std::string& name, size_t& size); 97 | 98 | Status set_input(ModelID mid, const void* data, size_t len, const std::string& name="data"); 99 | 100 | Status get_output(ModelID mid, void* data, size_t len, const std::string& name="output"); 101 | 102 | Status new_task( 103 | const ModelID& mid, 104 | TaskID& tid 105 | ); 106 | 107 | Status wait_task( 108 | TaskID tid 109 | ); 110 | 111 | Status get_task( 112 | TaskID tid, 113 | std::shared_ptr& t 114 | ); 115 | 116 | ScheduleMode sche_mode() const; 117 | 118 | void set_wait_sync(bool value); 119 | 120 | void set_be_stream_cap(int value); 121 | Status run(); 122 | Status shutdown(); 123 | 124 | int64_t avg_preempt_latency() const; 125 | 126 | int64_t avg_kernel_sel_latency() const; 127 | private: 128 | ScheduleMode mode; 129 | const size_t model_pool_capacity = 1024; 130 | std::atomic_uint32_t model_pool_size; 131 | struct Model { 132 | 
executor::HybridExecutor executor; 133 | QueueID qid; 134 | }; 135 | std::vector> model_pool; 136 | 137 | 138 | std::atomic_uint32_t task_idx_pool; 139 | std::unordered_map> task_pool; 140 | std::mutex task_pool_mtx; 141 | 142 | struct TaskQueue { 143 | ThreadSafeQueue> task_queue; 144 | executor::GPUStream_t stream; 145 | }; 146 | 147 | const size_t max_num_be_queues = 32; 148 | const QueueID rt_queue_id = 32; // the same with be queue num 149 | std::mutex be_queues_mtx; 150 | std::vector> be_queues; 151 | volatile uint32_t be_queue_cnt; 152 | std::shared_ptr rt_queue; 153 | std::mutex task_cnt_mtx; 154 | std::condition_variable task_cnt_cv; // To wake up the scheduler 155 | volatile uint32_t task_cnt; 156 | bool wait_sync; 157 | 158 | std::unique_ptr scheduler; 159 | executor::GPUStream_t execute_stream, preempt_stream; 160 | executor::GPUDevicePtr_t preempt_flag; 161 | bool preempted; 162 | int be_stream_device_queue_cap; 163 | std::atomic_bool _shutdown; 164 | 165 | uint64_t preempt_count; 166 | uint64_t preempt_latency_sum; 167 | 168 | uint64_t kernel_sel_count; 169 | uint64_t kernel_sel_latency_sum; 170 | private: 171 | Status create_task_queue(std::shared_ptr& ret, bool rt); 172 | void loop_body(); 173 | void execute_be_task(std::shared_ptr& task, std::shared_ptr& tqueue); 174 | void execute_rt_task(std::shared_ptr& task); 175 | void preempt_be_tasks(); 176 | void reset_preempt_flag_async(); 177 | void preempt_reset(); 178 | void preempt_wait(); 179 | void dynamic_kernel_padding(std::shared_ptr& rt_task); 180 | executor::GPUFunction_t get_proxy_kernel( 181 | const executor::GPUConfig::KernelResource& resource, 182 | executor::HybridExecutor* rt_executor, 183 | executor::HybridExecutor* be_executor 184 | ); 185 | }; 186 | 187 | 188 | } // namespace server 189 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/server/server.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/server/server.h" 2 | #include 3 | #include 4 | 5 | namespace reef { 6 | namespace server { 7 | 8 | REEFServer::REEFServer(const std::string& addr) 9 | : server_addr(addr), rpc_server(nullptr) 10 | { 11 | scheduler.reset(new REEFScheduler()); 12 | } 13 | 14 | void REEFServer::run() { 15 | grpc::ServerBuilder builder; 16 | builder.AddListeningPort(server_addr, grpc::InsecureServerCredentials()); 17 | builder.RegisterService(this); 18 | 19 | rpc_server = builder.BuildAndStart(); 20 | scheduler->run(); 21 | } 22 | 23 | void REEFServer::wait() { 24 | ASSERT(rpc_server.get() != nullptr); 25 | rpc_server->Wait(); 26 | } 27 | 28 | void REEFServer::shutdown() { 29 | ASSERT(rpc_server.get() != nullptr); 30 | rpc_server->Shutdown(); 31 | scheduler->shutdown(); 32 | } 33 | 34 | grpc::Status REEFServer::SetPriority( 35 | grpc::ServerContext *context, 36 | const reef::rpc::SetPriorityRequest *request, 37 | reef::rpc::SetPriorityReply *reply 38 | ) { 39 | LOG(INFO) << "new client, real_time: " << request->rt(); 40 | // create queue 41 | REEFScheduler::QueueID qid; 42 | Status s = scheduler->create_queue( 43 | request->rt() ? 
44 | REEFScheduler::TaskQueueType::RealTimeQueue 45 | : REEFScheduler::TaskQueueType::BestEffortQueue, 46 | qid 47 | ); 48 | if (s != Status::Succ) 49 | reply->set_succ(false); 50 | else { 51 | reply->set_succ(true); 52 | reply->set_qid(qid); 53 | } 54 | return grpc::Status::OK; 55 | } 56 | 57 | grpc::Status REEFServer::LoadModel( 58 | grpc::ServerContext *context, 59 | const reef::rpc::LoadModelRequest *request, 60 | reef::rpc::LoadModelReply *reply 61 | ) { 62 | LOG(INFO) << "load model: " << request->name() << ", qid: " << request->qid(); 63 | std::string prefix = request->dir() + "/" + request->name(); 64 | std::string param_file = prefix + ".param"; 65 | if (access(param_file.c_str(), F_OK) == -1) { 66 | param_file = ""; 67 | LOG(INFO) << request->name() << " no param file"; 68 | } 69 | 70 | REEFScheduler::ModelID mid; 71 | Status s = scheduler->load_model( 72 | prefix + ".trans.co", 73 | prefix + ".be.co", 74 | prefix + ".json", 75 | prefix + ".profile.json", 76 | param_file, // TODO: load param 77 | mid 78 | ); 79 | if (s != Status::Succ) { 80 | reply->set_succ(false); 81 | return grpc::Status::OK; 82 | } else { 83 | reply->set_mid(mid); 84 | } 85 | s = scheduler->bind_model_queue(request->qid(), mid); 86 | if (s != Status::Succ) { 87 | reply->set_succ(false); // TODO: unload model 88 | return grpc::Status::OK; 89 | } else { 90 | reply->set_succ(true); 91 | } 92 | return grpc::Status::OK; 93 | } 94 | 95 | grpc::Status REEFServer::RegisterBlob( 96 | grpc::ServerContext *context, 97 | const reef::rpc::RegisterBlobRequest *request, 98 | reef::rpc::RegisterBlobReply *reply 99 | ) { 100 | reply->set_succ(false); 101 | size_t size; 102 | auto s = scheduler->get_data_size(request->mid(), request->name(), size); 103 | if (s != Status::Succ) return grpc::Status::OK; 104 | std::string key = std::to_string(request->mid()) + "_" + request->name(); 105 | reply->set_key(key); 106 | reply->set_size(size); 107 | reply->set_succ(true); 108 | { 109 | std::unique_lock lock(shm_mtx); 110 | auto iter = shms.find(key); 111 | if (iter == shms.end()) { 112 | auto shm = std::make_shared(key, size, true); 113 | SharedMemoryInfo shminfo; 114 | shminfo.name = request->name(); 115 | shminfo.mid = request->mid(); 116 | shminfo.shm = shm; 117 | shms.insert({key, shminfo}); 118 | } 119 | } 120 | return grpc::Status::OK; 121 | } 122 | 123 | grpc::Status REEFServer::GetBlob( 124 | grpc::ServerContext *context, 125 | const reef::rpc::GetBlobRequest *request, 126 | reef::rpc::GetBlobReply *reply 127 | ) { 128 | SharedMemoryInfo shminfo; 129 | { 130 | std::unique_lock lock(shm_mtx); 131 | auto iter = shms.find(request->key()); 132 | if (iter == shms.end()) { 133 | reply->set_succ(false); 134 | return grpc::Status::OK; 135 | } 136 | shminfo = iter->second; 137 | } 138 | auto s = scheduler->get_output(shminfo.mid, shminfo.shm->data(), shminfo.shm->size(), shminfo.name); 139 | if (s != Status::Succ) { 140 | reply->set_succ(false); 141 | } else { 142 | reply->set_succ(true); 143 | } 144 | return grpc::Status::OK; 145 | } 146 | 147 | grpc::Status REEFServer::SetBlob( 148 | grpc::ServerContext *context, 149 | const reef::rpc::SetBlobRequest *request, 150 | reef::rpc::SetBlobReply *reply 151 | ) { 152 | SharedMemoryInfo shminfo; 153 | { 154 | std::unique_lock lock(shm_mtx); 155 | auto iter = shms.find(request->key()); 156 | if (iter == shms.end()) { 157 | reply->set_succ(false); 158 | return grpc::Status::OK; 159 | } 160 | shminfo = iter->second; 161 | } 162 | auto s = scheduler->set_input(shminfo.mid, shminfo.shm->data(), 
shminfo.shm->size(), shminfo.name); 163 | if (s != Status::Succ) { 164 | reply->set_succ(false); 165 | } else { 166 | reply->set_succ(true); 167 | } 168 | return grpc::Status::OK; 169 | } 170 | 171 | grpc::Status REEFServer::Infer( 172 | grpc::ServerContext *context, 173 | const reef::rpc::InferRequest *request, 174 | reef::rpc::InferReply *reply 175 | ) { 176 | REEFScheduler::TaskID tid; 177 | auto s = scheduler->new_task(request->mid(), tid); 178 | if (s != Status::Succ) { 179 | reply->set_succ(false); 180 | } else { 181 | s = scheduler->wait_task(tid); 182 | reply->set_succ(true); 183 | if (s != Status::Succ) 184 | reply->set_succ(false); 185 | reply->set_tid(tid); 186 | } 187 | return grpc::Status::OK; 188 | } 189 | 190 | 191 | } // namespace server 192 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/server/server.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "reef/util/common.h" 4 | #include "reef/util/shared_memory.h" 5 | #include "reef/rpc/reef.grpc.pb.h" 6 | #include "reef/server/scheduler.h" 7 | #include 8 | 9 | namespace reef { 10 | namespace server { 11 | 12 | 13 | class REEFServer final : public reef::rpc::REEFService::Service { 14 | public: 15 | REEFServer(const std::string& addr); 16 | virtual ~REEFServer() {} 17 | void run(); 18 | 19 | void wait(); 20 | 21 | void shutdown(); 22 | 23 | REEFScheduler* get_scheduler() const { 24 | return scheduler.get(); 25 | } 26 | 27 | private: 28 | // RPC handles 29 | grpc::Status SetPriority( 30 | grpc::ServerContext *context, 31 | const reef::rpc::SetPriorityRequest *request, 32 | reef::rpc::SetPriorityReply *reply 33 | ) override; 34 | 35 | grpc::Status LoadModel( 36 | grpc::ServerContext *context, 37 | const reef::rpc::LoadModelRequest *request, 38 | reef::rpc::LoadModelReply *reply 39 | ) override; 40 | 41 | grpc::Status RegisterBlob( 42 | grpc::ServerContext *context, 43 | const reef::rpc::RegisterBlobRequest *request, 44 | reef::rpc::RegisterBlobReply *reply 45 | ) override; 46 | 47 | grpc::Status GetBlob( 48 | grpc::ServerContext *context, 49 | const reef::rpc::GetBlobRequest *request, 50 | reef::rpc::GetBlobReply *reply 51 | ) override; 52 | 53 | grpc::Status SetBlob( 54 | grpc::ServerContext *context, 55 | const reef::rpc::SetBlobRequest *request, 56 | reef::rpc::SetBlobReply *reply 57 | ) override; 58 | 59 | grpc::Status Infer( 60 | grpc::ServerContext *context, 61 | const reef::rpc::InferRequest *request, 62 | reef::rpc::InferReply *reply 63 | ) override; 64 | 65 | private: 66 | std::string server_addr; 67 | std::unique_ptr rpc_server; 68 | std::unique_ptr scheduler; 69 | std::mutex shm_mtx; 70 | struct SharedMemoryInfo { 71 | std::string name; 72 | std::shared_ptr shm; 73 | REEFScheduler::ModelID mid; 74 | }; 75 | std::unordered_map shms; 76 | }; 77 | 78 | } // namespace server 79 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/util/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define DEFAULT_REEF_ADDR "localhost:34543" 9 | 10 | #ifndef RESOURCE_DIR 11 | #define RESOURCE_DIR "../resource" 12 | #endif 13 | 14 | #define ASSERT(condition)\ 15 | do { \ 16 | if (! 
(condition)) { \ 17 | std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ 18 | << ":" << __LINE__ << std::endl; \ 19 | std::terminate(); \ 20 | } \ 21 | } while (false) 22 | 23 | #define ASSERT_STATUS(cmd) ASSERT(cmd == Status::Succ) 24 | 25 | #define RETURN_STATUS(cmd) \ 26 | {\ 27 | Status s = cmd;\ 28 | if (s != Status::Succ) {\ 29 | LOG(ERROR) << #cmd " error, " << __FILE__ << ":" << __LINE__; \ 30 | return s;\ 31 | }\ 32 | } 33 | 34 | #define ASSERT_MSG(condition, message) \ 35 | do { \ 36 | if (! (condition)) { \ 37 | std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ 38 | << ":" << __LINE__ << " msg: " << message << std::endl; \ 39 | std::terminate(); \ 40 | } \ 41 | } while (false) 42 | 43 | 44 | namespace reef { 45 | 46 | enum Status { 47 | Succ, 48 | Fail, 49 | NotFound, 50 | OutOfRange, 51 | Full 52 | }; 53 | 54 | template 55 | T align_up(T value, T alignment) { 56 | T temp = value % alignment; 57 | return temp == 0? value : value - temp + alignment; 58 | } 59 | 60 | template 61 | T align_down(T value, T alignment) { 62 | return value - value % alignment; 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /src/reef/util/json.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "reef/util/json.h" 5 | 6 | namespace reef { 7 | namespace util { 8 | 9 | #define IS_DIGIT(chara) ((chara) <= '9' && (chara) >= '0') 10 | #define IS_SPACE(chara) ((chara) == ' ' || (chara) == '\n' || (chara) == '\t') 11 | 12 | #define MATCH_CONDITION(iter, str, delim) \ 13 | ((iter) < (str).length() && ((str)[iter] != delim || (str)[iter-1] == '\\')) 14 | 15 | #define TOKENIZE_SEPARATEOR(iter, type, sep, tokens) \ 16 | case sep: \ 17 | tokens.push_back(JsonParser::token(type)); \ 18 | continue; 19 | 20 | #define TOKENIZE_COUPLED(iter, str, type, sep, tokens) \ 21 | case sep: \ 22 | { \ 23 | size_t _tmp = iter + 1; \ 24 | for (; MATCH_CONDITION(_tmp, str, sep); _tmp++) ; \ 25 | tokens.push_back(JsonParser::token(type, str.substr(iter+1, _tmp-iter-1))); \ 26 | iter = _tmp; \ 27 | continue; \ 28 | } 29 | 30 | const char* token_name[] = {"invalid", "string", "number", "[", "]", "{", "}", ",", ":"}; 31 | 32 | 33 | JsonObject* JsonParser::parse(std::string& str) { 34 | int top = 0; 35 | JsonObject* jobj = _parse(tokenize(str), top); 36 | return jobj; 37 | } 38 | 39 | std::vector JsonParser::tokenize(std::string& str) { 40 | std::vector tokens; 41 | 42 | while (!str.empty()) { 43 | strip_space(str); 44 | std::string token = split_by_space(str); 45 | 46 | for (size_t i = 0; i < token.length(); i++) { 47 | switch (token[i]) { 48 | TOKENIZE_COUPLED(i, token, STRING, '"', tokens); 49 | TOKENIZE_COUPLED(i, token, STRING, '\'', tokens); 50 | TOKENIZE_SEPARATEOR(i, COMMA, ',', tokens); 51 | TOKENIZE_SEPARATEOR(i, LBRACKET, '[', tokens); 52 | TOKENIZE_SEPARATEOR(i, RBRACKET, ']', tokens); 53 | TOKENIZE_SEPARATEOR(i, LBRACE, '{', tokens); 54 | TOKENIZE_SEPARATEOR(i, RBRACE, '}', tokens); 55 | TOKENIZE_SEPARATEOR(i, COLON, ':', tokens); 56 | } 57 | 58 | if (token[i] == '-' || IS_DIGIT(token[i])) { 59 | size_t tmp = (token[i] == '-') ? (i + 1) : i; 60 | bool is_float = false; 61 | for (; tmp < token.length() && (IS_DIGIT(token[tmp]) || token[tmp] == '.'); tmp++) 62 | if (token[tmp] == '.') is_float = true; 63 | tokens.push_back(JsonParser::token(is_float ? 
FLOAT : INTEGER, token.substr(i, tmp-i))); 64 | i = tmp - 1; 65 | continue; 66 | } 67 | 68 | printf("Error: unrecognizable token at %s\n", token.substr(i).c_str()); 69 | exit(1); 70 | } 71 | } 72 | 73 | return tokens; 74 | } 75 | 76 | JsonObject* JsonParser::_parse(std::vector tokens, int& top) { 77 | JsonObject* cur = new JsonObject; 78 | 79 | switch (tokens[top].type) { 80 | case LBRACE: 81 | cur->type = JsonObject::J_DICT; 82 | top++; 83 | while (tokens[top].type != RBRACE) { 84 | assert(tokens[top].type == STRING); 85 | std::string key = tokens[top].value; 86 | 87 | assert(tokens[top+1].type == COLON); 88 | top += 2; 89 | 90 | cur->mval.insert(std::pair(key, _parse(tokens, top))); 91 | if (tokens[top].type == COMMA) top++; 92 | } 93 | top++; 94 | return cur; 95 | 96 | case LBRACKET: 97 | cur->type = JsonObject::J_LIST; 98 | top++; 99 | while (tokens[top].type != RBRACKET) { 100 | cur->lval.push_back(_parse(tokens, top)); 101 | if (tokens[top].type == COMMA) top++; 102 | } 103 | top++; 104 | return cur; 105 | 106 | case INTEGER: 107 | cur->type = JsonObject::J_INT; 108 | cur->ival = atoi(tokens[top].value.c_str()); 109 | top++; 110 | return cur; 111 | case FLOAT: 112 | cur->type = JsonObject::J_FLOAT; 113 | cur->fval = (float)atof(tokens[top].value.c_str()); 114 | top++; 115 | return cur; 116 | case STRING: 117 | cur->type = JsonObject::J_STRING; 118 | cur->sval = tokens[top].value; 119 | top++; 120 | return cur; 121 | default: 122 | break; 123 | } 124 | 125 | return cur; 126 | } 127 | 128 | void JsonParser::strip_space(std::string& str) { 129 | 130 | for (size_t i = 0; i < str.length(); i++) { 131 | if (!IS_SPACE(str[i])) { 132 | str.erase(0, i); 133 | return; 134 | } 135 | } 136 | } 137 | 138 | std::string JsonParser::split_by_space(std::string& str) { 139 | std::string token; 140 | 141 | for (size_t i = 0; i < str.length(); i++) { 142 | switch (str[i]) { 143 | case '"': 144 | for (i++; MATCH_CONDITION(i, str, '"'); i++) ; 145 | break; 146 | case '\'': 147 | for (i++; MATCH_CONDITION(i, str, '\''); i++) ; 148 | break; 149 | case ' ': 150 | case '\n': 151 | case '\t': 152 | token = str.substr(0, i); 153 | str.erase(0, i); 154 | return token; 155 | default: 156 | continue; 157 | } 158 | } 159 | 160 | token = str; 161 | str.erase(0, str.length()); 162 | return token; 163 | } 164 | } // namespace util 165 | } // namespace reef 166 | 167 | -------------------------------------------------------------------------------- /src/reef/util/json.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace reef { 8 | namespace util { 9 | 10 | // A json object, which supports `string`, `int`, `float`, `list` and `dict(map)`. 11 | class JsonObject { 12 | public: 13 | std::string sval; 14 | uint32_t ival; 15 | float fval; 16 | std::vector lval; 17 | std::map mval; 18 | 19 | enum jobject_type {J_STRING, J_INT, J_FLOAT, J_LIST, J_DICT}; 20 | jobject_type type; 21 | 22 | JsonObject() {} 23 | }; 24 | 25 | // A json parser that parses a string to a JsonObject. 
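//
// Illustrative usage (mirroring how ModelProfile::from_json drives the parser;
// the JSON literal below is just an example):
//
//   std::string text = "{\"model_latency\": 42, \"kernel_latency\": {}}";
//   JsonObject* obj = JsonParser::parse(text);
//   uint32_t latency = obj->mval["model_latency"]->ival;   // 42
//   delete obj;
//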
26 | class JsonParser { 27 | public: 28 | // parses a string to a JsonObject 29 | static JsonObject* parse(std::string& str); 30 | 31 | private: 32 | enum token_type {INVAL, STRING, FLOAT, INTEGER, LBRACKET, RBRACKET, LBRACE, RBRACE, COMMA, COLON}; 33 | struct token { 34 | token_type type; 35 | std::string value; 36 | token(token_type t, std::string v="") : type(t), value(v) {} 37 | }; 38 | 39 | static std::vector tokenize(std::string& str); 40 | static JsonObject* _parse(std::vector tokens, int& top); 41 | 42 | static void strip_space(std::string& str); 43 | static std::string split_by_space(std::string& str); 44 | }; 45 | 46 | } // namespace reef 47 | } // namespace util -------------------------------------------------------------------------------- /src/reef/util/shared_memory.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/util/shared_memory.h" 2 | #include "reef/util/common.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace reef { 11 | namespace util { 12 | 13 | SharedMemory::SharedMemory( 14 | const std::string& __key, size_t __size, bool create 15 | ) : _key(__key), _size(__size), _create(create) 16 | { 17 | _fd = shm_open( 18 | __key.c_str(), 19 | create ? O_CREAT|O_RDWR : O_RDWR, 20 | 0777 21 | ); 22 | ASSERT(_fd >= 0); 23 | ASSERT(ftruncate(_fd, _size) >= 0); 24 | _data = mmap(NULL, _size, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, 0); 25 | ASSERT(_data != nullptr); 26 | } 27 | 28 | SharedMemory::~SharedMemory() { 29 | close(_fd); 30 | if (_create) { 31 | shm_unlink(_key.c_str()); 32 | } 33 | } 34 | 35 | void* SharedMemory::data() { 36 | return _data; 37 | } 38 | 39 | size_t SharedMemory::size() { 40 | return _size; 41 | } 42 | 43 | } // namespace util 44 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/util/shared_memory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace reef { 6 | namespace util { 7 | 8 | class SharedMemory { 9 | public: 10 | SharedMemory(const std::string& __key, size_t __size, bool create=false); 11 | ~SharedMemory(); 12 | 13 | void* data(); 14 | size_t size(); 15 | 16 | private: 17 | int _fd; 18 | std::string _key; 19 | size_t _size; 20 | void* _data; 21 | bool _create; 22 | 23 | }; 24 | 25 | 26 | } // namespace util 27 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/util/threadsafe_queue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | namespace reef { 9 | // TODO: replace it with a lock-free queue 10 | template 11 | class ThreadSafeQueue { 12 | public: 13 | 14 | enum { Capacity = 10000 }; 15 | 16 | ThreadSafeQueue() : _tail(0), _head(0){ 17 | _array.resize(Capacity); 18 | } 19 | 20 | virtual ~ThreadSafeQueue() { 21 | } 22 | 23 | ThreadSafeQueue(const ThreadSafeQueue &queue) = delete; 24 | 25 | ThreadSafeQueue(ThreadSafeQueue && queue) noexcept { 26 | _tail.store(queue._tail.load()); 27 | _head.store(queue._head.load()); 28 | _array = std::move(queue._array); 29 | } 30 | 31 | /* Producer only: updates tail index after setting the element in place */ 32 | bool push(const Element& item) 33 | { 34 | // quick fix: lock the producers 35 | std::unique_lock lock(mtx); 36 | auto current_tail = _tail.load(); 37 | auto next_tail = increment(current_tail); 38 | 
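// The ring buffer deliberately keeps one slot unused: head == tail means "empty",
// while advancing the tail onto the head would mean "full", so that push is rejected below.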
if(next_tail != _head.load()) 39 | { 40 | _array[current_tail] = item; 41 | _tail.store(next_tail); 42 | return true; 43 | } 44 | 45 | return false; // full queue 46 | } 47 | 48 | /* Consumer only: updates head index after retrieving the element */ 49 | void pop() 50 | { 51 | std::unique_lock lock(mtx); 52 | const auto current_head = _head.load(); 53 | assert(current_head != _tail.load()); // empty queue 54 | _head.store(increment(current_head)); 55 | } 56 | 57 | Element& front() 58 | { 59 | std::unique_lock lock(mtx); 60 | const auto current_head = _head.load(); 61 | assert(current_head != _tail.load()); // empty queue 62 | auto &item = _array[current_head]; 63 | return item; 64 | } 65 | 66 | bool empty() const { 67 | // std::unique_lock lock(mtx); 68 | return (_head.load() == _tail.load()); 69 | } 70 | 71 | bool full() const 72 | { 73 | const auto next_tail = increment(_tail.load()); 74 | return (next_tail == _head.load()); 75 | } 76 | 77 | private: 78 | size_t increment(size_t idx) const 79 | { 80 | return (idx + 1) % Capacity; 81 | } 82 | std::atomic _tail; 83 | std::vector _array; 84 | std::mutex mtx; 85 | std::atomic _head; 86 | }; 87 | } // namespace reef --------------------------------------------------------------------------------