├── CMakeLists.txt ├── CONTRIBUTORS.md ├── INSTALL.md ├── LICENSE ├── README.md ├── cmake └── common.cmake ├── env.sh ├── resource ├── Makefile ├── README.md ├── alexnet │ ├── alexnet.be.cu │ ├── alexnet.cu │ ├── alexnet.json │ ├── alexnet.profile.json │ └── alexnet.trans.cu ├── bert │ ├── bert.be.cu │ ├── bert.cu │ ├── bert.json │ ├── bert.profile.json │ └── bert.trans.cu ├── densenet │ ├── densenet.be.cu │ ├── densenet.cu │ ├── densenet.json │ ├── densenet.profile.json │ └── densenet.trans.cu ├── inception │ ├── inception.be.cu │ ├── inception.cu │ ├── inception.json │ ├── inception.profile.json │ └── inception.trans.cu ├── mobilenet │ ├── mobilenet.be.cu │ ├── mobilenet.cu │ ├── mobilenet.json │ ├── mobilenet.profile.json │ └── mobilenet.trans.cu ├── mocked_kernel │ ├── mocked_kernel.be.cu │ ├── mocked_kernel.cu │ ├── mocked_kernel.json │ ├── mocked_kernel.profile.json │ └── mocked_kernel.trans.cu ├── resnet │ ├── resnet.be.cu │ ├── resnet.cu │ ├── resnet.json │ ├── resnet.profile.json │ └── resnet.trans.cu ├── resnet152 │ ├── resnet152.be.cu │ ├── resnet152.cu │ ├── resnet152.json │ ├── resnet152.profile.json │ └── resnet152.trans.cu ├── resnet18 │ ├── resnet18.be.cu │ ├── resnet18.cu │ ├── resnet18.json │ ├── resnet18.param │ ├── resnet18.profile.json │ └── resnet18.trans.cu └── vgg │ ├── vgg.be.cu │ ├── vgg.cu │ ├── vgg.json │ ├── vgg.profile.json │ └── vgg.trans.cu ├── script ├── best_effort_kernel.py ├── estimate_max_throughput.py ├── estimate_resource_usage.py ├── generate_asm_loop.py ├── generate_final_schedule.py ├── generate_register_hint.py ├── generate_shared_memory_usage.py ├── get_kernel_descriptor.py ├── get_kernel_occupancy.py ├── replace_raw_occupancy.py ├── replace_register_usage.py ├── transform_kernel.py └── tvm_generate_model.py └── src ├── example ├── rpc_client.cpp ├── rpc_client_cont.cpp └── rpc_server.cpp └── reef ├── client ├── client.cpp └── client.h ├── executor ├── executor_base.cpp ├── executor_base.h ├── hip │ ├── hip_impl.cpp │ └── hip_impl.h ├── hybrid_executor.cpp ├── hybrid_executor.h ├── model.cpp ├── model.h ├── trans_executor.cpp └── trans_executor.h ├── protos └── reef.proto ├── rpc └── placeholder ├── server ├── scheduler.cpp ├── scheduler.h ├── server.cpp └── server.h ├── test └── test.cpp └── util ├── common.h ├── json.cpp ├── json.h ├── shared_memory.cpp ├── shared_memory.h └── threadsafe_queue.h /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(reef) 2 | cmake_minimum_required(VERSION 3.2) 3 | 4 | add_compile_options(-std=c++11) 5 | add_definitions(-D__REEF_HIP_GPU__) 6 | add_definitions(-DRESOURCE_DIR="${CMAKE_CURRENT_LIST_DIR}/resource") 7 | 8 | 9 | # GRPC and Protocol Buffers libraries location 10 | list(APPEND CMAKE_PREFIX_PATH "/opt/rocm") 11 | 12 | # Cmake find modules 13 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") 14 | 15 | # For grpc 16 | 17 | include("${CMAKE_CURRENT_LIST_DIR}/cmake/common.cmake") 18 | 19 | get_filename_component(reef_proto "${CMAKE_CURRENT_LIST_DIR}/src/reef/protos/reef.proto" ABSOLUTE) 20 | get_filename_component(reef_proto_path "${reef_proto}" PATH) 21 | set(GRPC_GENERATE_DIR "${PROJECT_SOURCE_DIR}/src/reef/rpc" ) 22 | set(reef_proto_srcs "${GRPC_GENERATE_DIR}/reef.pb.cc") 23 | set(reef_proto_hdrs "${GRPC_GENERATE_DIR}/reef.pb.h") 24 | set(reef_grpc_srcs "${GRPC_GENERATE_DIR}/reef.grpc.pb.cc") 25 | set(reef_grpc_hdrs "${GRPC_GENERATE_DIR}/reef.grpc.pb.h") 26 | add_custom_command( 27 | OUTPUT "${reef_proto_srcs}" "${reef_proto_hdrs}" 
"${reef_grpc_srcs}" "${reef_grpc_hdrs}" 28 | COMMAND ${_PROTOBUF_PROTOC} 29 | ARGS --grpc_out "${GRPC_GENERATE_DIR}" 30 | --cpp_out "${GRPC_GENERATE_DIR}" 31 | -I "${reef_proto_path}" 32 | --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}" 33 | "${reef_proto}" 34 | DEPENDS "${reef_proto}") 35 | 36 | include_directories("${GRPC_GENERATE_DIR}") 37 | 38 | # reef_grpc_proto 39 | add_library(reef_grpc_proto 40 | ${reef_grpc_srcs} 41 | ${reef_grpc_hdrs} 42 | ${reef_proto_srcs} 43 | ${reef_proto_hdrs}) 44 | target_link_libraries(reef_grpc_proto 45 | ${_REFLECTION} 46 | ${_GRPC_GRPCPP} 47 | ${_PROTOBUF_LIBPROTOBUF}) 48 | 49 | 50 | # REEF codes 51 | 52 | find_package(hip REQUIRED) 53 | 54 | find_package(GTest REQUIRED) 55 | 56 | find_package(glog REQUIRED) 57 | 58 | SET(CMAKE_CXX_COMPILER "/opt/rocm/bin/hipcc") 59 | 60 | 61 | set(CMAKE_CXX_FLAGS "-g -O0 ${CMAKE_CXX_FLAGS}") 62 | 63 | include_directories(${HIP_INCLUDE_DIRS}) 64 | 65 | include_directories("${PROJECT_SOURCE_DIR}/src") 66 | 67 | 68 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/client" client) 69 | 70 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/executor" executor) 71 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/executor/hip" hip_impl) 72 | 73 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/server" server) 74 | 75 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/util" util) 76 | 77 | AUX_SOURCE_DIRECTORY("${PROJECT_SOURCE_DIR}/src/reef/test" test) 78 | 79 | add_library(reef_util 80 | ${util} 81 | ) 82 | 83 | add_library(reef_server_lib 84 | ${server} 85 | ${executor} 86 | ${hip_impl} 87 | ) 88 | 89 | add_library(reef_client_lib 90 | ${client} 91 | ) 92 | 93 | add_executable(unit_test 94 | ${test} 95 | ) 96 | 97 | target_link_libraries(unit_test 98 | reef_util 99 | reef_server_lib 100 | reef_client_lib 101 | reef_grpc_proto 102 | glog::glog 103 | ${GTEST_BOTH_LIBRARIES} 104 | ${_REFLECTION} 105 | ${_GRPC_GRPCPP} 106 | ${_PROTOBUF_LIBPROTOBUF} 107 | pthread 108 | ) 109 | 110 | function (add_executable_app app_name app_path) 111 | add_executable(${app_name} 112 | ${app_path} 113 | ) 114 | target_link_libraries(${app_name} 115 | reef_util 116 | reef_server_lib 117 | reef_client_lib 118 | reef_grpc_proto 119 | glog::glog 120 | ${GTEST_BOTH_LIBRARIES} 121 | ${_REFLECTION} 122 | ${_GRPC_GRPCPP} 123 | ${_PROTOBUF_LIBPROTOBUF} 124 | pthread 125 | ) 126 | endfunction() 127 | 128 | add_executable_app(reef_client "${PROJECT_SOURCE_DIR}/src/example/rpc_client.cpp") 129 | add_executable_app(reef_client_cont "${PROJECT_SOURCE_DIR}/src/example/rpc_client_cont.cpp") 130 | add_executable_app(reef_server "${PROJECT_SOURCE_DIR}/src/example/rpc_server.cpp") 131 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # REEF Contributors 2 | 3 | **Mingcong Han**, mingconghan@sjtu.edu.cn, *Institute of Parallel and Distributed Systems, SEIEE, Shanghai Jiao Tong University; Shanghai AI Laboratory;* 4 | 5 | **Hanze Zhang**, hanzezhang@sjtu.edu.cn, *Institute of Parallel and Distributed Systems, SEIEE, Shanghai Jiao Tong University; Shanghai AI Laboratory;* 6 | 7 | **Rong Chen**, rongchen@sjtu.edu.cn, *Institute of Parallel and Distributed Systems, SEIEE, Shanghai Jiao Tong University; Shanghai AI Laboratory;* 8 | 9 | -------------------------------------------------------------------------------- /INSTALL.md: 
-------------------------------------------------------------------------------- 1 | # REEF Installation 2 | 3 | ## Software Version 4 | * Ubuntu 18.04 5 | * ROCm 4.3.0 6 | * CMake > 3.18 7 | * grpc 1.45 8 | * glog 0.6.0 9 | * googletest 1.11.0 10 | 11 | ## Installation Overview 12 | 13 | The installation has six major steps: 14 | 1. Install ROCm-4.3 15 | 2. Install the customized GPU kernel driver (for reset-based preemption), and reboot 16 | 3. (Recommended, but optional) Create the ROCm docker container 17 | 4. Install the customized GPU runtime (hip, rocclr) 18 | 5. Install other software dependencies (e.g., grpc) 19 | 6. Build REEF 20 | 21 | The customized GPU kernel driver and GPU runtime can be found [here](https://github.com/SJTU-IPADS/reef-artifacts/tree/master/reef-env). 22 | 23 | ## Install Dependencies 24 | 25 | ### Install ROCm-4.3 26 | ```sh 27 | # Ensure the system is up to date. 28 | $ sudo apt update 29 | $ sudo apt dist-upgrade 30 | $ sudo apt install libnuma-dev 31 | $ sudo reboot 32 | 33 | # Add the ROCm apt repository. 34 | $ sudo apt install wget gnupg2 35 | $ wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add - 36 | $ echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.3/ ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list 37 | $ sudo apt update 38 | 39 | # Install the ROCm package and reboot. 40 | $ sudo apt install rocm-dkms && sudo reboot 41 | 42 | # Add ROCm binaries to PATH. 43 | $ echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/rocprofiler/bin:/opt/rocm/opencl/bin' | sudo tee -a /etc/profile.d/rocm.sh 44 | ``` 45 | 46 | 47 | ### Build & Install the Customized Kernel Driver 48 | ```sh 49 | $ git clone https://github.com/SJTU-IPADS/reef-artifacts.git 50 | $ cd reef-artifacts/reef-env/amdgpu-dkms 51 | # Notice: The script will reboot the machine. 52 | $ ./update-kern-module.sh 53 | ``` 54 | 55 | ### Build & Install rocclr 56 | ```sh 57 | # in reef-artifacts/reef-env 58 | $ export REEF_ENV_ROOT=`pwd` 59 | $ cd rocclr 60 | $ mkdir build 61 | $ cd build 62 | $ cmake -DOPENCL_DIR="${REEF_ENV_ROOT}/ROCm-OpenCL-Runtime" -DCMAKE_INSTALL_PREFIX=/opt/rocm/rocclr .. 63 | $ sudo make install 64 | ``` 65 | 66 | ### Build & Install hip 67 | ```sh 68 | # in reef-artifacts/reef-env 69 | $ export REEF_ENV_ROOT=`pwd` 70 | $ cd hip 71 | $ mkdir build 72 | $ cd build 73 | $ cmake -DCMAKE_PREFIX_PATH="${REEF_ENV_ROOT}/rocclr/build;/opt/rocm/hip" .. 74 | $ sudo make install 75 | ``` 76 | 77 | ### Install CMake 78 | ```sh 79 | $ wget https://github.com/Kitware/CMake/releases/download/v3.22.4/cmake-3.22.4-linux-x86_64.sh 80 | $ sh cmake-3.22.4-linux-x86_64.sh 81 | # You can also add this CMake version to your PATH in ~/.bashrc 82 | $ export PATH=~/cmake-3.22.4-linux-x86_64/bin:$PATH 83 | $ cmake --version 84 | cmake version 3.22.4 85 | ``` 86 | 87 | ### Install glog 88 | ```sh 89 | $ git clone https://github.com/google/glog 90 | $ cd glog 91 | $ mkdir build; cd build 92 | $ cmake .. 93 | $ sudo make install 94 | ``` 95 | 96 | ### Install gtest 97 | ```sh 98 | $ git clone -b release-1.11.0 https://github.com/google/googletest 99 | $ cd googletest 100 | $ mkdir build; cd build 101 | $ cmake .. 102 | $ sudo make install 103 | ``` 104 | 105 | ### Install grpc + protobuf 106 | ```sh 107 | $ git clone -b v1.45.0 https://github.com/grpc/grpc 108 | $ cd grpc 109 | $ git submodule update --init 110 | $ mkdir -p cmake/build; cd cmake/build 111 | $ cmake ../..
112 | $ sudo make install 113 | ``` 114 | 115 | 116 | ## Build REEF 117 | 118 | ### Build Resource 119 | This step compiles the DNN models' device code. 120 | ```sh 121 | $ cd resource 122 | $ make 123 | ``` 124 | 125 | ### Build REEF 126 | ```sh 127 | $ mkdir build; cd build 128 | $ cmake .. 129 | $ make -j4 130 | ``` 131 | 132 | ### Run tests 133 | ```sh 134 | # in ./build 135 | $ ./unit_test 136 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # REEF - Real-time GPU-accelerated DNN Inference Scheduling System 2 | 3 | REEF is a real-time GPU-accelerated DNN inference scheduling system that supports instant kernel preemption and controlled concurrent execution in GPU scheduling. 
4 | 5 | ## Table of Contents 6 | 7 | - [Introduction](#introduction) 8 | - [Paper](#paper) 9 | - [REEF Example](#reef-example) 10 | - [Project Structure](#project-structure) 11 | - [Hardware Requirement](#hardware-requirement) 12 | - [Installation](#installation) 13 | - [Artifact Evaluation](#artifact-evaluation) 14 | 15 | 16 | ## Introduction 17 | 18 | REEF is a real-time GPU-accelerated DNN inference scheduling system. 19 | REEF divides DNN inference tasks into two priorities: *real-time tasks (RT tasks)* and *best-effort tasks (BE tasks)*. 20 | The scheduling goal of REEF is to minimize the latency of RT tasks while improving overall throughput as much as possible. 21 | 22 | REEF achieves this goal with two key techniques: 23 | 24 | * *Reset-based Preemption:* BE tasks can be preempted within a few microseconds once an RT task arrives. The preemption is achieved by killing 25 | the running BE kernels and clearing the queued BE kernels, which is based on the *idempotence* of DNN inference kernels. 26 | 27 | * *Dynamic Kernel Padding (DKP):* BE tasks can be co-executed with the RT task using only the CUs left over by the RT kernels. This approach improves throughput and avoids starvation of BE tasks with minimal latency overhead on RT tasks. 28 | 29 | ## REEF Example 30 | 31 | After [building REEF](INSTALL.md), the example below shows how REEF behaves when there are concurrent tasks (one RT and multiple BEs). 32 | 33 | First, start a REEF server. 34 | ```bash 35 | # in ./build 36 | $ ./reef_server 37 | ``` 38 | 39 | Then, start multiple BE clients. 40 | ```bash 41 | # in ./build 42 | $ for i in {1..4}; do ./reef_client_cont ../resource/resnet152 resnet152 0 0 & done 43 | ``` 44 | You can see that 4 BE clients are submitting BE tasks concurrently; each client echoes the inference latency of its tasks, e.g.: 45 | ``` 46 | client 3 inference latency: 16.567 ms 47 | client 2 inference latency: 29.347 ms 48 | client 1 inference latency: 32.506 ms 49 | client 0 inference latency: 24.848 ms 50 | ``` 51 | 52 | Then, start an RT client, which submits requests without pause. 53 | ```bash 54 | # in ./build 55 | $ ./reef_client_cont ../resource/resnet152 resnet152 1 0 56 | ``` 57 | 58 | You can see that the RT client has the lowest inference latency. 59 | ``` 60 | ... 61 | client 4 inference latency: 12.743 ms 62 | client 4 inference latency: 12.608 ms 63 | client 4 inference latency: 12.944 ms 64 | client 4 inference latency: 12.637 ms 65 | ``` 66 | 67 | Meanwhile, the BE tasks can still execute concurrently with the RT task without affecting the performance of the RT tasks. 68 | ``` 69 | ... 70 | client 2 inference latency: 48.183 ms 71 | client 1 inference latency: 68.599 ms 72 | client 0 inference latency: 34.857 ms 73 | client 3 inference latency: 43.565 ms 74 | ``` 75 | 76 | 77 | 78 | 79 | ## Project Structure 80 | ``` 81 | > tree .
82 | ├── cmake 83 | ├── resource # DNN model resources for the evaluations 84 | │ ├── resnet # DNN model for ResNet 85 | │ │ ├── resnet.json # The schedule graph (metadata) of the DNN model 86 | │ │ ├── resnet.cu # The raw GPU device code (GPU kernels) for the DNN model 87 | │ │ ├── resnet.trans.cu # The transformed GPU device code which supports dynamic kernel padding 88 | │ │ ├── resnet.be.cu # The transformed GPU device code which supports reset-based preemption 89 | │ │ └── resnet.profile.json # The profile of the kernel execution time 90 | │ ├── densenet 91 | │ └── inception 92 | ├── script # Utility scripts 93 | ├── src # Source code 94 | │ ├── example # REEF examples 95 | │ └── reef # REEF source code 96 | └── env.sh # Environment variables 97 | 98 | ``` 99 | 100 | ## Hardware Requirement 101 | 102 | Currently, REEF only supports the **AMD Radeon Instinct MI50 GPU**. 103 | 104 | 105 | ## Installation 106 | 107 | See [INSTALL](INSTALL.md). 108 | 109 | 110 | ## Artifact Evaluation 111 | 112 | For the OSDI'22 artifact evaluation, see [reef-artifacts](https://github.com/SJTU-IPADS/reef-artifacts). 113 | 114 | ## Paper 115 | If you use REEF in your research, please cite our paper: 116 | ```bibtex 117 | @inproceedings {osdi2022reef, 118 | author = {Mingcong Han and Hanze Zhang and Rong Chen and Haibo Chen}, 119 | title = {Microsecond-scale Preemption for Concurrent {GPU-accelerated} {DNN} Inferences}, 120 | booktitle = {16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)}, 121 | year = {2022}, 122 | isbn = {978-1-939133-28-1}, 123 | address = {Carlsbad, CA}, 124 | pages = {539--558}, 125 | url = {https://www.usenix.org/conference/osdi22/presentation/han}, 126 | publisher = {USENIX Association}, 127 | month = jul, 128 | } 129 | ``` 130 | 131 | ## The Team 132 | 133 | REEF is developed and maintained by members from [IPADS@SJTU](https://github.com/SJTU-IPADS) and Shanghai AI Laboratory. See [Contributors](CONTRIBUTORS.md). 134 | 135 | 136 | ## License 137 | 138 | REEF uses the [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0.html). 139 | -------------------------------------------------------------------------------- /cmake/common.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2018 gRPC authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # cmake build file for C++ route_guide example. 16 | # Assumes protobuf and gRPC have been installed using cmake. 17 | # See cmake_externalproject/CMakeLists.txt for all-in-one cmake build 18 | # that automatically builds all the dependencies before building route_guide.
19 | 20 | cmake_minimum_required(VERSION 3.5.1) 21 | 22 | set (CMAKE_CXX_STANDARD 11) 23 | 24 | if(MSVC) 25 | add_definitions(-D_WIN32_WINNT=0x600) 26 | endif() 27 | 28 | find_package(Threads REQUIRED) 29 | 30 | if(GRPC_AS_SUBMODULE) 31 | # One way to build a project that uses gRPC is to just include the 32 | # entire gRPC project tree via "add_subdirectory". 33 | # This approach is very simple to use, but there are some potential 34 | # disadvantages: 35 | # * it includes gRPC's CMakeLists.txt directly into your build script, 36 | # and that can make gRPC's internal settings interfere with your 37 | # own build. 38 | # * depending on what's installed on your system, the contents of submodules 39 | # in gRPC's third_party/* might need to be available (and there might be 40 | # additional prerequisites required to build them). Consider using 41 | # the gRPC_*_PROVIDER options to fine-tune the expected behavior. 42 | # 43 | # A more robust approach to add a dependency on gRPC is using 44 | # cmake's ExternalProject_Add (see cmake_externalproject/CMakeLists.txt). 45 | 46 | # Include the gRPC's cmake build (normally grpc source code would live 47 | # in a git submodule called "third_party/grpc", but this example lives in 48 | # the same repository as gRPC sources, so we just look a few directories up) 49 | add_subdirectory(../../.. ${CMAKE_CURRENT_BINARY_DIR}/grpc EXCLUDE_FROM_ALL) 50 | message(STATUS "Using gRPC via add_subdirectory.") 51 | 52 | # After using add_subdirectory, we can now use the grpc targets directly from 53 | # this build. 54 | set(_PROTOBUF_LIBPROTOBUF libprotobuf) 55 | set(_REFLECTION grpc++_reflection) 56 | if(CMAKE_CROSSCOMPILING) 57 | find_program(_PROTOBUF_PROTOC protoc) 58 | else() 59 | set(_PROTOBUF_PROTOC $<TARGET_FILE:protoc>) 60 | endif() 61 | set(_GRPC_GRPCPP grpc++) 62 | if(CMAKE_CROSSCOMPILING) 63 | find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) 64 | else() 65 | set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>) 66 | endif() 67 | elseif(GRPC_FETCHCONTENT) 68 | # Another way is to use CMake's FetchContent module to clone gRPC at 69 | # configure time. This makes gRPC's source code available to your project, 70 | # similar to a git submodule. 71 | message(STATUS "Using gRPC via add_subdirectory (FetchContent).") 72 | include(FetchContent) 73 | FetchContent_Declare( 74 | grpc 75 | GIT_REPOSITORY https://github.com/grpc/grpc.git 76 | # when using gRPC, you will actually set this to an existing tag, such as 77 | # v1.25.0, v1.26.0 etc.. 78 | # For the purpose of testing, we override the tag used to the commit 79 | # that's currently under test. 80 | GIT_TAG vGRPC_TAG_VERSION_OF_YOUR_CHOICE) 81 | FetchContent_MakeAvailable(grpc) 82 | 83 | # Since FetchContent uses add_subdirectory under the hood, we can use 84 | # the grpc targets directly from this build. 85 | set(_PROTOBUF_LIBPROTOBUF libprotobuf) 86 | set(_REFLECTION grpc++_reflection) 87 | set(_PROTOBUF_PROTOC $<TARGET_FILE:protoc>) 88 | set(_GRPC_GRPCPP grpc++) 89 | if(CMAKE_CROSSCOMPILING) 90 | find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) 91 | else() 92 | set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>) 93 | endif() 94 | else() 95 | # This branch assumes that gRPC and all its dependencies are already installed 96 | # on this system, so they can be located by find_package(). 97 | 98 | # Find Protobuf installation 99 | # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation.
100 | set(protobuf_MODULE_COMPATIBLE TRUE) 101 | find_package(Protobuf CONFIG REQUIRED) 102 | message(STATUS "Using protobuf ${Protobuf_VERSION}") 103 | 104 | set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) 105 | set(_REFLECTION gRPC::grpc++_reflection) 106 | if(CMAKE_CROSSCOMPILING) 107 | find_program(_PROTOBUF_PROTOC protoc) 108 | else() 109 | set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>) 110 | endif() 111 | 112 | # Find gRPC installation 113 | # Looks for gRPCConfig.cmake file installed by gRPC's cmake installation. 114 | find_package(gRPC CONFIG REQUIRED) 115 | message(STATUS "Using gRPC ${gRPC_VERSION}") 116 | 117 | set(_GRPC_GRPCPP gRPC::grpc++) 118 | if(CMAKE_CROSSCOMPILING) 119 | find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) 120 | else() 121 | set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>) 122 | endif() 123 | endif() 124 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib 2 | export PATH=$PATH:`pwd`/build 3 | export REEF_RESOURCE_DIR=`pwd`/resource 4 | -------------------------------------------------------------------------------- /resource/Makefile: -------------------------------------------------------------------------------- 1 | # NVCC= nvcc 2 | 3 | # INCLUDES=-I/usr/local/cuda/include -I/opt/rocm/hip/include 4 | 5 | # LIBS=-lcuda 6 | 7 | # ARCH=70 8 | 9 | CC=hipcc 10 | 11 | INCLUDES= -I/opt/rocm/hip/include 12 | 13 | SUBDIRS = ${shell ls -d */ | sed 's/\///' -} 14 | 15 | 16 | CURRENT_PATH = ${shell pwd} 17 | 18 | CURRENT_DIR = ${shell basename $(CURRENT_PATH)} 19 | 20 | ARCH=gfx906 21 | 22 | all: 23 | @for dir in $(SUBDIRS); do \ 24 | make -C $$dir -f ../Makefile build_subdir;\ 25 | done 26 | 27 | transform: 28 | @for dir in $(SUBDIRS); do \ 29 | make -C $$dir -f ../Makefile transform_subdir;\ 30 | done 31 | 32 | profile: 33 | for dir in $(SUBDIRS); do \ 34 | echo "profiling $$dir"; \ 35 | profiler $$dir;\ 36 | done 37 | 38 | build_subdir: ${CURRENT_DIR}.raw.co ${CURRENT_DIR}.trans.co ${CURRENT_DIR}.be.co 39 | 40 | transform_subdir: ${CURRENT_DIR}-hip-amdgcn-amd-amdhsa-gfx906.s 41 | python3 ../../script/transform_kernel.py ${CURRENT_DIR}.cu ${CURRENT_DIR}.json ${CURRENT_DIR}-hip-amdgcn-amd-amdhsa-gfx906.s 42 | python3 ../../script/best_effort_kernel.py ${CURRENT_DIR}.cu 43 | 44 | 45 | 46 | %.be.cu: %.cu 47 | python3 ../../script/best_effort_kernel.py ${CURRENT_DIR}.cu 48 | 49 | %.preempt.cu: %.cu 50 | python3 ../../script/preemptable_kernel.py ${CURRENT_DIR}.cu ${CURRENT_DIR}.json 51 | 52 | %.trans.cu: %.cu ${CURRENT_DIR}-hip-amdgcn-amd-amdhsa-gfx906.s 53 | python3 ../../script/transform_kernel.py ${CURRENT_DIR}.cu ${CURRENT_DIR}.json ${CURRENT_DIR}-hip-amdgcn-amd-amdhsa-gfx906.s 54 | 55 | %-hip-amdgcn-amd-amdhsa-gfx906.s: %.cu 56 | $(CC) $(INCLUDES) --save-temps --genco --offload-arch=${ARCH} $< -o $*.raw.co 57 | rm *.bc *.cui *.o *.out *.txt 58 | 59 | %.raw.co: %.cu 60 | $(CC) $(INCLUDES) --save-temps --genco --offload-arch=${ARCH} $< -o $@ 61 | rm *.bc *.cui *.o *.out *.txt 62 | 63 | %.be.co: %.be.cu 64 | $(CC) $(INCLUDES) --genco --offload-arch=${ARCH} $< -o $@ 65 | 66 | %.trans.co: %.trans.cu 67 | @"/opt/rocm/llvm/bin/clang" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -E -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=none
-fno-rounding-math -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -v -resource-dir /opt/rocm-4.3.0/llvm/lib/clang/13.0.0 -internal-isystem /opt/rocm-4.3.0/llvm/lib/clang/13.0.0/include/cuda_wrappers -internal-isystem /opt/rocm-4.3.0/include -include __clang_hip_runtime_wrapper.h -isystem /opt/rocm-4.3.0/llvm/lib/clang/13.0.0/include/.. -isystem /opt/rocm-4.3.0/hsa/include -isystem /opt/rocm-4.3.0/hip/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm-4.3.0/llvm/lib/clang/13.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /opt/rocm-4.3.0/llvm/lib/clang/13.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++11 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -cuid=8a385692c0a935c2 -fcuda-allow-variadic-functions -munsafe-fp-atomics -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.cui -x hip $< 68 | @"/opt/rocm/llvm/bin/clang" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -emit-llvm-bc -emit-llvm-uselists -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ocml.bc 
-mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -v -resource-dir /opt/rocm-4.3.0/llvm/lib/clang/13.0.0 -O3 -std=c++11 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -disable-llvm-passes -cuid=8a385692c0a935c2 -fcuda-allow-variadic-functions -munsafe-fp-atomics -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.bc -x hip-cpp-output $*.trans-hip-amdgcn-amd-amdhsa-gfx906.cui 69 | @"/opt/rocm/llvm/bin/clang" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -S -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-4.3.0/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -v -resource-dir /opt/rocm-4.3.0/llvm/lib/clang/13.0.0 -O3 -std=c++11 -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -cuid=8a385692c0a935c2 -fcuda-allow-variadic-functions -munsafe-fp-atomics -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s -x ir $*.trans-hip-amdgcn-amd-amdhsa-gfx906.bc 70 | @python3 ../../script/replace_register_usage.py $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 71 | python3 ../../script/generate_asm_loop.py $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s $*.json 72 | @"/opt/rocm/llvm/bin/clang" -cc1as -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -filetype obj -main-file-name $< -target-cpu gfx906 -dwarf-version=4 -mrelocation-model pic --mrelax-relocations -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 73 | @"/opt/rocm/llvm/bin/lld" -flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 
-plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false -save-temps -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.out $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o 74 | @"/opt/rocm/llvm/bin/clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906 -inputs=/dev/null,$*.trans-hip-amdgcn-amd-amdhsa-gfx906.out -outputs=$@ 75 | @echo build $@ 76 | rm *.bc *.cui *.o *.out *.txt 77 | 78 | # @"/opt/rocm/llvm/bin/clang-13" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -E -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /opt/rocm/llvm/lib/clang/12.0.0 -internal-isystem /opt/rocm/llvm/lib/clang/12.0.0/include/cuda_wrappers -internal-isystem /opt/rocm/include -include __clang_hip_runtime_wrapper.h -isystem /opt/rocm/llvm/lib/clang/12.0.0/include/.. 
-isystem /opt/rocm/hsa/include -isystem /opt/rocm/hip/include -D __HIP_ROCclr__ -D __HIP_ROCclr__ -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/llvm/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /opt/rocm/llvm/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++11 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -fcuda-allow-variadic-functions -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.cui -x hip $< 79 | # @"/opt/rocm/llvm/bin/clang-13" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -emit-llvm-bc -emit-llvm-uselists -save-temps=cwd -disable-free -disable-llvm-verifier -discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /opt/rocm/llvm/lib/clang/12.0.0 -O3 -std=c++11 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -disable-llvm-passes -fcuda-allow-variadic-functions -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.bc -x hip-cpp-output $*.trans-hip-amdgcn-amd-amdhsa-gfx906.cui 80 | # @"/opt/rocm/llvm/bin/clang-13" -cc1 -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -S -save-temps=cwd -disable-free -disable-llvm-verifier 
-discard-value-names -main-file-name $< -mrelocation-model pic -pic-level 1 -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_isa_version_906.bc -target-cpu gfx906 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /opt/rocm/llvm/lib/clang/12.0.0 -O3 -std=c++11 -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -fcuda-allow-variadic-functions -faddrsig -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s -x ir $*.trans-hip-amdgcn-amd-amdhsa-gfx906.bc 81 | # @python3 ../../script/replace_register_usage.py $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 82 | # python3 ../../script/generate_asm_loop.py $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s $*.json 83 | # @"/opt/rocm/llvm/bin/clang-13" -cc1as -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -filetype obj -main-file-name $< -target-cpu gfx906 -dwarf-version=4 -mrelocation-model pic --mrelax-relocations -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 84 | # @"/opt/rocm/llvm/bin/lld" -flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 -plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false -save-temps -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.out $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o 85 | # @"/opt/rocm/llvm/bin/clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906 -inputs=/dev/null,$*.trans-hip-amdgcn-amd-amdhsa-gfx906.out -outputs=$@ 86 | # @echo build $@ 87 | # rm *.bc *.cui *.o *.out *.txt 88 | 89 | %.asm: %-hip-amdgcn-amd-amdhsa-gfx906.s 90 | "/opt/rocm/llvm/bin/clang" -cc1as -mllvm --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -filetype obj -main-file-name $< -target-cpu gfx906 -dwarf-version=4 -mrelocation-model pic --mrelax-relocations -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.s 91 | "/opt/rocm/llvm/bin/lld" -flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 -plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false -save-temps -o $*.trans-hip-amdgcn-amd-amdhsa-gfx906.out $*.trans-hip-amdgcn-amd-amdhsa-gfx906.o 92 | "/opt/rocm/llvm/bin/clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx906 -inputs=/dev/null,$*.trans-hip-amdgcn-amd-amdhsa-gfx906.out -outputs=$*.trans.co 93 | 94 | clean_temp: 95 | rm -f */*.s 
*/*.out */*.o */*.bc */*.txt */*.cui 96 | 97 | clean: 98 | rm -f */*.co */*.bc */*.cui */*.o */*.out */*.txt */*.s */*.pointer.cu */*.prtopt.cu */*.ptrraw.cu */*.preempt.cu -------------------------------------------------------------------------------- /resource/README.md: -------------------------------------------------------------------------------- 1 | # REEF Resource 2 | 3 | This directory contains the device code of some DNN models. 4 | 5 | ## Build 6 | 7 | ``` 8 | $ make transform 9 | $ make build 10 | ``` 11 | 12 | ## DNN Model 13 | 14 | A DNN model that is loadable in REEF should contain three file: 15 | 16 | 1. GPU device code (model.cu) 17 | 2. Kernel schedule (model.json) 18 | 3. Model parameter (model.param, Optional) 19 | 20 | All of the files can be generated by TVM (a customized version). (TODO) 21 | 22 | Other files will be generated automatically, including: 23 | 24 | 1. GPU device code of RT tasks (model.trans.cu) 25 | 2. GPU device code of BE tasks (model.be.cu) 26 | 3. Kernel latency profile (model.profile.json) 27 | 28 | ## Example 29 | 30 | See the `mocked_kernel` directory. 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /resource/alexnet/alexnet.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": [ 3 | { 4 | "name": "data", 5 | "size": 150528, 6 | "stype": "float32" 7 | }, 8 | { 9 | "name": "conv_1_weight", 10 | "size": 34848, 11 | "stype": "float32" 12 | }, 13 | { 14 | "name": "conv_2_weight", 15 | "size": 614400, 16 | "stype": "float32" 17 | }, 18 | { 19 | "name": "conv_3_weight", 20 | "size": 884736, 21 | "stype": "float32" 22 | }, 23 | { 24 | "name": "conv_4_weight", 25 | "size": 1327104, 26 | "stype": "float32" 27 | }, 28 | { 29 | "name": "conv_5_weight", 30 | "size": 884736, 31 | "stype": "float32" 32 | }, 33 | { 34 | "name": "dense_1_weight", 35 | "size": 26214400, 36 | "stype": "float32" 37 | }, 38 | { 39 | "name": "bias_1_weight", 40 | "size": 4096, 41 | "stype": "float32" 42 | }, 43 | { 44 | "name": "dense_2_weight", 45 | "size": 16777216, 46 | "stype": "float32" 47 | }, 48 | { 49 | "name": "bias_2_weight", 50 | "size": 4096, 51 | "stype": "float32" 52 | }, 53 | { 54 | "name": "dense_3_weight", 55 | "size": 4096000, 56 | "stype": "float32" 57 | }, 58 | { 59 | "name": "bias_3_weight", 60 | "size": 1000, 61 | "stype": "float32" 62 | }, 63 | { 64 | "name": "null", 65 | "size": 279936, 66 | "stype": "float32" 67 | }, 68 | { 69 | "name": "null", 70 | "size": 64896, 71 | "stype": "float32" 72 | }, 73 | { 74 | "name": "output", 75 | "size": 6400, 76 | "stype": "float32" 77 | }, 78 | { 79 | "name": "null", 80 | "size": 1000, 81 | "stype": "float32" 82 | } 83 | ], 84 | "kernels": [ 85 | { 86 | "name": "fused_nn_conv2d_nn_relu_4_kernel0", 87 | "launch_params": [ 88 | 27, 89 | 9, 90 | 12, 91 | 2, 92 | 6, 93 | 8 94 | ], 95 | "args": [ 96 | 0, 97 | 1, 98 | 12 99 | ] 100 | }, 101 | { 102 | "name": "fused_nn_max_pool2d_2_kernel0", 103 | "launch_params": [ 104 | 507, 105 | 1, 106 | 1, 107 | 128, 108 | 1, 109 | 1 110 | ], 111 | "args": [ 112 | 12, 113 | 13 114 | ] 115 | }, 116 | { 117 | "name": "fused_nn_conv2d_nn_relu_3_kernel0", 118 | "launch_params": [ 119 | 13, 120 | 13, 121 | 32, 122 | 2, 123 | 2, 124 | 4 125 | ], 126 | "args": [ 127 | 13, 128 | 2, 129 | 12 130 | ] 131 | }, 132 | { 133 | "name": "fused_nn_max_pool2d_1_kernel0", 134 | "launch_params": [ 135 | 288, 136 | 1, 137 | 1, 138 | 128, 139 | 1, 140 | 1 141 | ], 142 | "args": [ 143 | 12, 144 | 
13 145 | ] 146 | }, 147 | { 148 | "name": "fused_nn_conv2d_nn_relu_2_kernel0", 149 | "launch_params": [ 150 | 3, 151 | 6, 152 | 12, 153 | 4, 154 | 2, 155 | 32 156 | ], 157 | "args": [ 158 | 13, 159 | 3, 160 | 12 161 | ] 162 | }, 163 | { 164 | "name": "fused_nn_conv2d_nn_relu_1_kernel0", 165 | "launch_params": [ 166 | 3, 167 | 6, 168 | 12, 169 | 4, 170 | 2, 171 | 32 172 | ], 173 | "args": [ 174 | 12, 175 | 4, 176 | 13 177 | ] 178 | }, 179 | { 180 | "name": "fused_nn_conv2d_nn_relu_kernel0", 181 | "launch_params": [ 182 | 3, 183 | 6, 184 | 8, 185 | 4, 186 | 2, 187 | 32 188 | ], 189 | "args": [ 190 | 13, 191 | 5, 192 | 12 193 | ] 194 | }, 195 | { 196 | "name": "fused_nn_max_pool2d_kernel0", 197 | "launch_params": [ 198 | 50, 199 | 1, 200 | 1, 201 | 128, 202 | 1, 203 | 1 204 | ], 205 | "args": [ 206 | 12, 207 | 13 208 | ] 209 | }, 210 | { 211 | "name": "fused_nn_batch_flatten_kernel0", 212 | "launch_params": [ 213 | 50, 214 | 1, 215 | 1, 216 | 128, 217 | 1, 218 | 1 219 | ], 220 | "args": [ 221 | 14, 222 | 13 223 | ] 224 | }, 225 | { 226 | "name": "fused_nn_dense_add_nn_relu_1_kernel0", 227 | "launch_params": [ 228 | 4096, 229 | 1, 230 | 1, 231 | 64, 232 | 1, 233 | 1 234 | ], 235 | "args": [ 236 | 14, 237 | 6, 238 | 13, 239 | 7 240 | ] 241 | }, 242 | { 243 | "name": "fused_nn_dense_add_nn_relu_kernel0", 244 | "launch_params": [ 245 | 4096, 246 | 1, 247 | 1, 248 | 64, 249 | 1, 250 | 1 251 | ], 252 | "args": [ 253 | 13, 254 | 8, 255 | 14, 256 | 9 257 | ] 258 | }, 259 | { 260 | "name": "fused_nn_dense_add_kernel0", 261 | "launch_params": [ 262 | 1000, 263 | 1, 264 | 1, 265 | 64, 266 | 1, 267 | 1 268 | ], 269 | "args": [ 270 | 14, 271 | 10, 272 | 15, 273 | 11 274 | ] 275 | }, 276 | { 277 | "name": "fused_nn_softmax_kernel0", 278 | "launch_params": [ 279 | 1, 280 | 1, 281 | 1, 282 | 64, 283 | 1, 284 | 1 285 | ], 286 | "args": [ 287 | 15, 288 | 14 289 | ] 290 | } 291 | ], 292 | "args": [ 293 | 0, 294 | 1, 295 | 2, 296 | 3, 297 | 4, 298 | 5, 299 | 6, 300 | 7, 301 | 8, 302 | 9, 303 | 10, 304 | 11 305 | ], 306 | "shared_memory": { 307 | "fused_nn_max_pool2d_kernel0": 4, 308 | "fused_nn_softmax_kernel0": 4, 309 | "fused_nn_conv2d_nn_relu_3_kernel0": 944, 310 | "fused_nn_conv2d_nn_relu_1_kernel0": 9984, 311 | "fused_nn_dense_add_kernel0": 4, 312 | "fused_nn_max_pool2d_2_kernel0": 4, 313 | "fused_nn_dense_add_nn_relu_1_kernel0": 4, 314 | "fused_nn_conv2d_nn_relu_4_kernel0": 1356, 315 | "fused_nn_max_pool2d_1_kernel0": 4, 316 | "fused_nn_conv2d_nn_relu_2_kernel0": 9984, 317 | "fused_nn_batch_flatten_kernel0": 4, 318 | "fused_nn_conv2d_nn_relu_kernel0": 9984, 319 | "fused_nn_dense_add_nn_relu_kernel0": 4 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /resource/alexnet/alexnet.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":8851,"kernel_latency":{"fused_nn_dense_add_kernel0":{"total_latency":76, "latency":[42,40,40,42,42,44,44,48,50,50,54,58,58,64,64,68,74,72,78,80,84,88,92,92,96,78,74,74,84,74,98,76]},"fused_nn_dense_add_nn_relu_kernel0":{"total_latency":200, "latency":[42,40,40,40,42,42,44,46,48,50,54,58,58,62,64,70,72,74,78,82,84,90,90,94,98,100,102,108,108,112,118,120]},"fused_nn_conv2d_nn_relu_kernel0":{"total_latency":664, "latency":[276,458,664,876,1082,1292,1294]},"fused_nn_softmax_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_dense_add_nn_relu_1_kernel0":{"total_latency":168, 
"latency":[54,54,54,54,54,54,54,54,56,52,52,52,52,52,52,52,52,54,54,54,54,54,56,56,56,58,58,60,60,60,60,62]},"fused_nn_max_pool2d_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"fused_nn_conv2d_nn_relu_4_kernel0":{"total_latency":3884, "latency":[148,146,160,190,248,320,382,446,502,558,610,660,722,776,838,894]},"fused_nn_conv2d_nn_relu_3_kernel0":{"total_latency":2196, "latency":[336,340,348,356,366,378,390,408,380,398,402,416,422,442,452,466,478,494,510,524]},"fused_nn_conv2d_nn_relu_1_kernel0":{"total_latency":876, "latency":[276,456,664,876,1084,1294,1294]},"fused_nn_max_pool2d_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"fused_nn_max_pool2d_2_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"fused_nn_conv2d_nn_relu_2_kernel0":{"total_latency":588, "latency":[192,304,446,588,728,866,866]}}} -------------------------------------------------------------------------------- /resource/bert/bert.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":5426,"kernel_latency":{"fused_reshape_add_multiply_erf_multiply_add_multiply_reshape_kernel0":{"total_latency":16, "latency":[14,14,14,16,16,16,16,18]},"fused_reshape_add_add_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_reshape_transpose_copy_reshape_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_nn_batch_matmul_4_kernel0":{"total_latency":84, "latency":[62,66,72,82,92,112,120,130]},"fused_reshape_5_kernel0":{"total_latency":44, "latency":[30,32,34,36,38,42,44,42]},"fused_nn_softmax_1_kernel3":{"total_latency":48, "latency":[36,38,40,44,46,48,52,54]},"fused_variance_1_kernel1":{"total_latency":12, "latency":[12,12]},"fused_mean_1_kernel0":{"total_latency":16, "latency":[16,16]},"fused_reshape_4_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_batch_matmul_3_kernel0":{"total_latency":80, "latency":[68,72,80,94]},"fused_reshape_add_reshape_transpose_reshape_transpose_kernel0":{"total_latency":24, "latency":[16,16,18,20,22,24,24,24]},"fused_full_equal_reshape_kernel0":{"total_latency":12, "latency":[16,12]},"fused_mean_1_kernel1":{"total_latency":12, "latency":[12,12]},"fused_nn_softmax_1_kernel0":{"total_latency":52, "latency":[50,94,136,178,220,258,320,364]},"fused_cast_take_broadcast_to_like_cast_take_add_1_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,16,16]},"fused_variance_1_kernel0":{"total_latency":16, "latency":[16,18]},"fused_reshape_add_reshape_transpose_divide_reshape_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,16,18]},"fused_reshape_add_reshape_transpose_transpose_reshape_transpose_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,16,18]},"fused_subtract_add_sqrt_divide_multiply_add_1_kernel0":{"total_latency":20, "latency":[16,16,16,18,18,18,20,20]},"fused_nn_softmax_1_kernel1":{"total_latency":52, "latency":[36,38,42,48,48,52,58,62]},"fused_nn_batch_matmul_5_kernel0":{"total_latency":52, "latency":[20,20,22,24]},"fused_reshape_cast_broadcast_to_like_where_kernel0":{"total_latency":60, "latency":[48,48,50,52,54,58,60,60]},"fused_nn_softmax_1_kernel2":{"total_latency":52, "latency":[50,94,136,178,218,260,314,362]}}} -------------------------------------------------------------------------------- 
/resource/densenet/densenet.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":3743,"kernel_latency":{"fused_nn_dense_add_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,16,16,16,16,16,16,16,16,16,16,16,16]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[12]},"fused_nn_avg_pool2d_kernel0":{"total_latency":20, "latency":[18]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":40, "latency":[40]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":48, "latency":[46,48,50,52,52,54,56,60,60,64,66,70,76,78,84,86,94,96,100,102,110,112,116,120,126,128,132,136,144,146,148,152]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":16, "latency":[14,16]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":36, "latency":[36,36,38,38,38,40,38,40,40,42,42,42,46,46,48,48,50,50,50,52,52,54,54,56,58,60,62,62,64,66,66,68]},"fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":36, "latency":[34,36,40,42,44,50,50,58,58,62,64,66,68,70,76,78,86,88,94,96,100,104,108,110,114,116,122,124]},"fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":20, "latency":[20,20,22,22,24,26,28]},"fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":660, "latency":[34,40,52,62,76,88,100,112]},"fused_nn_avg_pool2d_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,16,16,16,16,16]},"fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":24, "latency":[14,14,16,16,16,18,18,18]},"fused_nn_avg_pool2d_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,16]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":32, "latency":[32,32]},"fused_nn_conv2d_1_kernel0":{"total_latency":16, "latency":[14,14]},"fused_nn_conv2d_2_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,16,18]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":36, "latency":[34,36,38,42,44,50,50,56,56,58,62,64,66,70]},"fused_nn_avg_pool2d_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":40, "latency":[40,44,46,54]},"fused_nn_conv2d_kernel0":{"total_latency":16, "latency":[14,14,14,16]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":16, "latency":[14,14,14,14]}}} -------------------------------------------------------------------------------- /resource/inception/inception.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":8273,"kernel_latency":{"fused_nn_softmax_1_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_dense_add_kernel0":{"total_latency":56, "latency":[26,26,26,30,30,32,34,36,38,40,42,44,44,50,50,48,56,50,52,54,56,56,60,62,62,52,54,52,54,54,60,54]},"fused_nn_avg_pool2d_6_kernel0":{"total_latency":20, "latency":[20,30,38,46,54,62,72,80]},"fused_nn_max_pool2d_9_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_34_kernel0":{"total_latency":108, "latency":[90,90,96,108,114,122,128,132,154,158,178,184,212,216,232,240,264,270,270,270,270,270,270,270,270,270,270,270,270,270,270,270]},"fused_concatenate_6_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_nn_batch_flatten_1_kernel0":{"total_latency":16, 
"latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_36_kernel0":{"total_latency":72, "latency":[66,70,70,70,70,68,68,68,68,68,68,68,68,68,68,104]},"fused_nn_conv2d_add_nn_relu_33_kernel0":{"total_latency":48, "latency":[44,44,44,44,44,44,48,74]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":100, "latency":[94,120,162,162]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_5_kernel1":{"total_latency":44, "latency":[30,32,34,42,40,44,48,48,48,48,48,50]},"fused_nn_conv2d_add_nn_relu_32_kernel0":{"total_latency":60, "latency":[58,72,74,74]},"fused_nn_conv2d_add_nn_relu_31_kernel0":{"total_latency":60, "latency":[60,76,106,128]},"fused_nn_conv2d_add_nn_relu_29_kernel0":{"total_latency":48, "latency":[46,50,54,60,66,74,74,74,74,74,74,74,74,74,74,74]},"fused_nn_conv2d_add_nn_relu_30_kernel0":{"total_latency":60, "latency":[60,72,74,74]},"fused_concatenate_7_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_28_kernel0":{"total_latency":88, "latency":[82,100,108,124,146,148,146,146,146,146,146,146,146,146,146,146]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":104, "latency":[76,88,102,130,138,138,138,138,138,138,138,138,138,138,138,138]},"fused_nn_conv2d_add_nn_relu_26_kernel0":{"total_latency":116, "latency":[90,112,132,132]},"fused_nn_conv2d_add_nn_relu_25_kernel0":{"total_latency":92, "latency":[80,100,118]},"fused_nn_conv2d_add_nn_relu_22_kernel0":{"total_latency":188, "latency":[126,146,178,216,238,232,228,230]},"fused_concatenate_8_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16]},"fused_nn_avg_pool2d_10_kernel0":{"total_latency":20, "latency":[14,14,14,14,14,14,16,16]},"fused_nn_conv2d_add_nn_relu_19_kernel0":{"total_latency":144, "latency":[112,118,130,142,146,168,172,186,198,216,222,236,254,264,286,298,328,340,356,368,392,406,422,432]},"fused_nn_avg_pool2d_11_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_20_kernel0":{"total_latency":80, "latency":[78,78]},"fused_nn_conv2d_add_nn_relu_17_kernel0":{"total_latency":132, "latency":[104,118,136,130,130,150,170,192,222,242,266,284,308,326]},"fused_nn_conv2d_add_nn_relu_18_kernel0":{"total_latency":80, "latency":[78,80,82,86,86,96]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":76, "latency":[64,82,90]},"fused_nn_max_pool2d_8_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":32, "latency":[30,36]},"fused_nn_max_pool2d_5_kernel0":{"total_latency":28, "latency":[14,16,16,16,18,18,18,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel2":{"total_latency":16, "latency":[14,14,16,16,16,18,16,16,16,16,16,18,18,18,18,18]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":60, "latency":[56,78,104]},"fused_nn_conv2d_add_nn_relu_27_kernel0":{"total_latency":188, "latency":[142,152,162,190,190,240,234,286,266,300,304,326,350,366,400,416,460,472,506,522]},"fused_nn_avg_pool2d_9_kernel0":{"total_latency":28, "latency":[16,16,16,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel2":{"total_latency":44, "latency":[14,14,14,16,16,18,18,18,20,20,20,20,22,22,24,24]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":32, "latency":[26,28,30,30,32,36]},"fused_nn_conv2d_add_nn_relu_16_kernel0":{"total_latency":80, "latency":[78]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel0":{"total_latency":24, 
"latency":[14,14,14,16,16,16,16,16,16,18,18,18,18,20,20,20]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":48, "latency":[46,48]},"fused_nn_max_pool2d_7_kernel0":{"total_latency":20, "latency":[16,16,16,16,18,18,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_5_kernel2":{"total_latency":16, "latency":[16,16,16,18,18,18,18,20]},"fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":176, "latency":[100,106,114,136,140,156,176,190,220,234,262,274,310,322,346,358]},"fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":32, "latency":[30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_4_kernel0":{"total_latency":16, "latency":[16,16,16,16,18,18,18,18,20,20,20,22,22,22,24,24]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":32, "latency":[30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel2":{"total_latency":56, "latency":[14,14,14,16,16,16,18,18,18,18,18,18,20,20,20,22]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel0":{"total_latency":32, "latency":[14,14,14,16,16,16,16,18,18,18,18,18,18,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel2":{"total_latency":32, "latency":[14,14,14,16,16,16,16,18,18,18,18,20,20,20,20,22]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel0":{"total_latency":36, "latency":[14,16,16,16,16,18,18,18,20,20,22,22,22,24,24,26]},"fused_nn_conv2d_add_nn_relu_23_kernel0":{"total_latency":80, "latency":[76,80,82,80,86,96]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel1":{"total_latency":88, "latency":[20,22,24,26,28]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":40, "latency":[38,42]},"fused_nn_conv2d_add_nn_relu_24_kernel0":{"total_latency":168, "latency":[120,130,140,156,166,206,204,252,244,276,274,304,314,328,326,326]},"fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":44, "latency":[42,40,42,44,46]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel1":{"total_latency":100, "latency":[30,32,36,38,44,48]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_4_kernel1":{"total_latency":32, "latency":[22,24,24,26,26,30,30,32,34,34]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_4_kernel2":{"total_latency":16, "latency":[14,16,16,16,16,18,18,18,18,18,18,18,18,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,18,18,18,18,20,20,20,22,22,22,24]},"fused_concatenate_10_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel1":{"total_latency":40, "latency":[30,32,38,38,42]},"fused_nn_avg_pool2d_8_kernel0":{"total_latency":20, "latency":[14,14,14,14,14,14,16,16]},"fused_concatenate_11_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,16,18]},"fused_nn_conv2d_add_nn_relu_35_kernel0":{"total_latency":132, "latency":[94,110,122,132,132,156,160,174,182,192,180,182,198,172,174,176,198,172,180,184,172,172,180,180,168,168,168,168,174,174,176,176]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_5_kernel0":{"total_latency":20, "latency":[18,18,22,22,26,28,30,32]},"fused_nn_conv2d_add_nn_relu_13_kernel0":{"total_latency":40, "latency":[38,40,44,46,52,58]},"fused_nn_conv2d_add_nn_relu_14_kernel0":{"total_latency":36, 
"latency":[36,38,42]},"fused_nn_conv2d_add_nn_relu_21_kernel0":{"total_latency":96, "latency":[80,100,118]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel1":{"total_latency":48, "latency":[20,22,24,26,28]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":464, "latency":[328,376,430,428,476,552,652,730]},"fused_nn_max_pool2d_6_kernel0":{"total_latency":32, "latency":[14,14,16,16,16,18,18,18]},"fused_nn_avg_pool2d_7_kernel0":{"total_latency":24, "latency":[16,16,18,16,16,16,18,18]},"fused_nn_conv2d_add_nn_relu_15_kernel0":{"total_latency":84, "latency":[80,98,130]},"fused_concatenate_9_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]}}} -------------------------------------------------------------------------------- /resource/mobilenet/mobilenet.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":1188,"kernel_latency":{"fused_nn_dense_add_kernel0":{"total_latency":24, "latency":[16,16,16,16,16,16,18,18,18,18,18,18,20,20,20,20,20,20,20,22,22,22,22,22,24,20,20,22,22,20,20,20]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":128, "latency":[120,136]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":20, "latency":[16,16,16,18,18,20,20,20]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":72, "latency":[66,70]},"fused_nn_conv2d_add_nn_relu_13_kernel0":{"total_latency":20, "latency":[16,18]},"fused_nn_conv2d_add_nn_relu_14_kernel0":{"total_latency":28, "latency":[20,22,24,26,28,32]},"fused_nn_conv2d_add_nn_relu_16_kernel0":{"total_latency":24, "latency":[16,18,20,22]},"fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":24, "latency":[16,20]},"fused_nn_conv2d_add_nn_relu_18_kernel0":{"total_latency":104, "latency":[56,96]},"fused_nn_softmax_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_global_avg_pool2d_kernel0":{"total_latency":24, "latency":[22,24,22,22,26,26,24,24,24,24,28,24,28,28,24,24,28,28,26,26,26,30,30,30,30,28,28,28,30,30,30,30]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":80, "latency":[70,74,78,94]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[14,14,14,14]},"fused_nn_conv2d_add_nn_relu_15_kernel0":{"total_latency":28, "latency":[16,20]},"fused_nn_conv2d_add_nn_relu_17_kernel0":{"total_latency":20, "latency":[16,16,16,18]},"fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":36, "latency":[24,26,30,34]},"fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":32, "latency":[32,32]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":20, "latency":[16,16,18,18,18,18,20,20]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":20, "latency":[18,18,20,22]},"fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":20, "latency":[16,16,18,18]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":52, "latency":[50,50]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":44, "latency":[40,46,46,52]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,18,18,18]}}} -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.be.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define NUM_BLOCKS 64 4 | #define NUM_TREHAD_PER_BLOCK 128 5 | #define BLOCKDIM_X 4 6 | #define BLOCKDIM_Y 8 7 | #define BLOCKDIM_Z 4 8 | 9 | __device__ void multiply_device(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp){ 10 | __shared__ 
float buffer[1024 * 32 / 4]; 11 | int blockOffset = blockIdx.x; 12 | int blockSize = NUM_TREHAD_PER_BLOCK; 13 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 14 | int arrayOffset = blockOffset * blockSize + threadOffset; 15 | 16 | temp[arrayOffset] = a[threadOffset] * b[arrayOffset]; 17 | } 18 | 19 | __device__ void add_device(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp){ 20 | __shared__ float buffer[1024 * 32 / 4]; 21 | int blockOffset = blockIdx.x; 22 | int blockSize = NUM_TREHAD_PER_BLOCK; 23 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 24 | int arrayOffset = blockOffset * blockSize + threadOffset; 25 | 26 | temp[arrayOffset] = a[threadOffset] + b[arrayOffset]; 27 | } 28 | 29 | extern "C" __global__ void multiply(int* preempted, int* task_slot, float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 30 | if (*preempted) return; 31 | multiply_device(a, b, temp); 32 | if (threadIdx.x + threadIdx.y + threadIdx.z == 0) 33 | atomicAdd(task_slot, 1); 34 | } 35 | 36 | extern "C" __global__ void add(int* preempted, int* task_slot, float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 37 | if (*preempted) return; 38 | add_device(a, b, temp); 39 | if (threadIdx.x + threadIdx.y + threadIdx.z == 0) 40 | atomicAdd(task_slot, 1); 41 | } 42 | -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define NUM_BLOCKS 64 4 | #define NUM_TREHAD_PER_BLOCK 128 5 | #define BLOCKDIM_X 4 6 | #define BLOCKDIM_Y 8 7 | #define BLOCKDIM_Z 4 8 | 9 | extern "C" __global__ void multiply(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 10 | __shared__ float buffer[1024 * 32 / 4]; 11 | int blockOffset = blockIdx.x; 12 | int blockSize = NUM_TREHAD_PER_BLOCK; 13 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 14 | int arrayOffset = blockOffset * blockSize + threadOffset; 15 | 16 | temp[arrayOffset] = a[threadOffset] * b[arrayOffset]; 17 | } 18 | 19 | extern "C" __global__ void add(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 20 | __shared__ float buffer[1024 * 32 / 4]; 21 | int blockOffset = blockIdx.x; 22 | int blockSize = NUM_TREHAD_PER_BLOCK; 23 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 24 | int arrayOffset = blockOffset * blockSize + threadOffset; 25 | 26 | temp[arrayOffset] = a[threadOffset] + b[arrayOffset]; 27 | } 28 | -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": [ 3 | { 4 | "name": "a", 5 | "stype": "float32", 6 | "size": 8192 7 | }, 8 | { 9 | "name": "b", 10 | "stype": "float32", 11 | "size": 8192 12 | }, 13 | { 14 | "name": "temp", 15 | "stype": "float32", 16 | "size": 8192 17 | }, 18 | { 19 | "name": "c", 20 | "stype": "float32", 21 | "size": 8192 22 | }, 23 | { 24 | "name": "output", 25 | "stype": "float32", 26 | "size": 8192 27 | } 28 | ], 29 | "kernels": [ 30 | { 31 | "name": "add", 32 | "launch_params": [ 33 | 64, 34 | 1, 35 | 1, 36 | 4, 37 | 8, 38 | 4 39 | ], 40 | "args": [ 41 | 1, 42 | 0, 43 | 2 44 | ] 45 | }, 46 | { 47 | 
"name": "multiply", 48 | "launch_params": [ 49 | 64, 50 | 1, 51 | 1, 52 | 4, 53 | 8, 54 | 4 55 | ], 56 | "args": [ 57 | 2, 58 | 3, 59 | 4 60 | ] 61 | } 62 | ], 63 | "args": [ 64 | 0, 65 | 1, 66 | 3 67 | ], 68 | "shared_memory": { 69 | "add": 1024, 70 | "multiply": 1024 71 | } 72 | } -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":30,"kernel_latency":{"multiply":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]},"add":{"total_latency":16, "latency":[16,14,14,14,14,14,14,14,14,14,14,14,16,14,14,14]}}} -------------------------------------------------------------------------------- /resource/mocked_kernel/mocked_kernel.trans.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define NUM_BLOCKS 64 4 | #define NUM_TREHAD_PER_BLOCK 128 5 | #define BLOCKDIM_X 4 6 | #define BLOCKDIM_Y 8 7 | #define BLOCKDIM_Z 4 8 | 9 | #define CU_NUM 60 10 | 11 | __device__ __forceinline__ bool is_first_thread() { 12 | return threadIdx.x == 0; 13 | } 14 | 15 | __device__ __forceinline__ unsigned int get_cu_id() { 16 | return blockIdx.x % CU_NUM; 17 | } 18 | 19 | __device__ __forceinline__ dim3 get_3d_idx(int idx, dim3 dim) { 20 | dim3 result; 21 | result.x = idx % dim.x; 22 | result.y = idx / dim.x % dim.y; 23 | result.z = idx / (dim.x * dim.y); 24 | return result; 25 | } 26 | 27 | __device__ void multiply_device(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp){ 28 | __shared__ float buffer[1024 * 32 / 4]; 29 | int blockOffset = blockIdx.x; 30 | int blockSize = NUM_TREHAD_PER_BLOCK; 31 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 32 | int arrayOffset = blockOffset * blockSize + threadOffset; 33 | 34 | temp[arrayOffset] = a[threadOffset] * b[arrayOffset]; 35 | } 36 | 37 | __device__ void add_device(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp){ 38 | __shared__ float buffer[1024 * 32 / 4]; 39 | int blockOffset = blockIdx.x; 40 | int blockSize = NUM_TREHAD_PER_BLOCK; 41 | int threadOffset = threadIdx.x + threadIdx.y * BLOCKDIM_X + threadIdx.z * BLOCKDIM_X * BLOCKDIM_Y; 42 | int arrayOffset = blockOffset * blockSize + threadOffset; 43 | 44 | temp[arrayOffset] = a[threadOffset] + b[arrayOffset]; 45 | } 46 | 47 | extern "C" __global__ __attribute__((amdgpu_num_vgpr(25))) __attribute__((amdgpu_num_sgpr(30))) void multiply_device_wrapper(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 48 | // Force the compiler to use all the index 49 | if (threadIdx.x + threadIdx.y * 4 + threadIdx.z * 8 * 4 >= 4 * 8 * 4) return; 50 | // if (blockIdx.x + blockIdx.y * 64 + blockIdx.z * 1 * 64 >= 64 * 1 * 1) return; 51 | multiply_device((float* __restrict__)a,(float* __restrict__)b,(float* __restrict__)temp); 52 | asm volatile(";; end_flag"); // jump back to the caller 53 | } 54 | 55 | extern "C" __global__ __attribute__((amdgpu_num_vgpr(25))) __attribute__((amdgpu_num_sgpr(30))) void add_device_wrapper(float* __restrict__ a, float* __restrict__ b, float* __restrict__ temp) { 56 | // Force the compiler to use all the index 57 | if (threadIdx.x + threadIdx.y * 4 + threadIdx.z * 8 * 4 >= 4 * 8 * 4) return; 58 | // if (blockIdx.x + blockIdx.y * 64 + blockIdx.z * 1 * 64 >= 64 * 1 * 1) return; 59 | add_device((float* __restrict__)a,(float* 
__restrict__)b,(float* __restrict__)temp); 60 | asm volatile(";; end_flag"); // jump back to the caller 61 | } 62 | 63 | extern "C" __global__ void multiply( 64 | void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l, 65 | void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r, 66 | int cu_partition) { 67 | asm volatile(";; caller_flag"); 68 | return; 69 | } 70 | 71 | extern "C" __global__ void add( 72 | void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l, 73 | void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r, 74 | int cu_partition) { 75 | asm volatile(";; caller_flag"); 76 | return; 77 | } 78 | 79 | extern "C" __device__ __noinline__ dim3 get_3d_idx_64_1_1(int idx) { 80 | dim3 dim(64, 1, 1); 81 | dim3 result; 82 | result.x = idx % dim.x; 83 | result.y = idx / dim.x % dim.y; 84 | result.z = idx / (dim.x * dim.y); 85 | return result; 86 | } 87 | 88 | extern "C" __device__ __noinline__ dim3 get_3d_idx_4_8_4(int idx) { 89 | dim3 dim(4, 8, 4); 90 | dim3 result; 91 | result.x = idx % dim.x; 92 | result.y = idx / dim.x % dim.y; 93 | result.z = idx / (dim.x * dim.y); 94 | return result; 95 | } 96 | 97 | __global__ void get_3d_idx_caller(int* buf) { 98 | dim3 task_idx; 99 | 100 | task_idx = get_3d_idx_64_1_1(threadIdx.x); 101 | buf[task_idx.x] = task_idx.x; 102 | buf[task_idx.y] = task_idx.y; 103 | buf[task_idx.z] = task_idx.z; 104 | 105 | task_idx = get_3d_idx_4_8_4(threadIdx.x); 106 | buf[task_idx.x] = task_idx.x; 107 | buf[task_idx.y] = task_idx.y; 108 | buf[task_idx.z] = task_idx.z; 109 | 110 | } 111 | 112 | #define CALL_FRAMEWORK(idx) \ 113 | extern "C" __global__ void call_framework_##idx(\ 114 | void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l,\ 115 | void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r,\ 116 | int cu_partition) \ 117 | {\ 118 | asm volatile(\ 119 | " s_load_dwordx2 s[14:15], s[4:5], 0x0\n"\ 120 | " s_waitcnt lgkmcnt(0)\n"\ 121 | " s_setpc_b64 s[14:15]\n"\ 122 | " s_endpgm\n"\ 123 | );\ 124 | } 125 | 126 | CALL_FRAMEWORK(1) 127 | CALL_FRAMEWORK(2) 128 | CALL_FRAMEWORK(3) 129 | CALL_FRAMEWORK(4) 130 | CALL_FRAMEWORK(5) 131 | CALL_FRAMEWORK(6) 132 | CALL_FRAMEWORK(7) 133 | CALL_FRAMEWORK(8) 134 | CALL_FRAMEWORK(9) 135 | CALL_FRAMEWORK(10) 136 | 137 | #define MERGE_FRAMEWORK(idx) \ 138 | extern "C" __global__ void merge_framework_##idx(\ 139 | void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l,\ 140 | void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r,\ 141 | int cu_partition) \ 142 | {\ 143 | asm volatile(\ 144 | " s_load_dword s10, s[4:5], 0x40\n"\ 145 | " s_load_dwordx2 s[12:13], s[4:5], 0x0\n"\ 146 | " s_load_dwordx2 s[14:15], s[4:5], 0x20\n"\ 147 | " s_mul_hi_u32 s11, s6, 0x88888889\n"\ 148 | " s_lshr_b32 s11, s11, 5\n"\ 149 | " s_mul_i32 s11, s11, 60\n"\ 150 | " s_sub_i32 s11, s6, s11\n"\ 151 | " s_waitcnt lgkmcnt(0)\n"\ 152 | " s_cmp_ge_u32 s11, s10\n"\ 153 | " s_mov_b64 s[10:11], -1\n"\ 154 | " s_cbranch_scc1 MyBB"#idx"_3\n"\ 155 | "; %bb.1: ; %Flow\n"\ 156 | " s_andn2_b64 vcc, exec, s[10:11]\n"\ 157 | " s_cbranch_vccz MyBB"#idx"_4\n"\ 158 | " s_endpgm\n"\ 159 | "MyBB"#idx"_3:\n"\ 160 | " s_setpc_b64 s[14:15]\n"\ 161 | " s_endpgm\n"\ 162 | "MyBB"#idx"_4:\n"\ 163 | " s_setpc_b64 s[12:13]\n"\ 164 | " s_endpgm\n"\ 165 | );\ 166 | } 167 | MERGE_FRAMEWORK(1) 168 | MERGE_FRAMEWORK(2) 169 | MERGE_FRAMEWORK(3) 170 | MERGE_FRAMEWORK(4) 171 | MERGE_FRAMEWORK(5) 172 
| MERGE_FRAMEWORK(6) 173 | MERGE_FRAMEWORK(7) 174 | MERGE_FRAMEWORK(8) 175 | MERGE_FRAMEWORK(9) 176 | MERGE_FRAMEWORK(10) 177 | MERGE_FRAMEWORK(nostack_1) 178 | MERGE_FRAMEWORK(nostack_2) 179 | MERGE_FRAMEWORK(nostack_3) 180 | MERGE_FRAMEWORK(nostack_4) 181 | MERGE_FRAMEWORK(nostack_5) 182 | MERGE_FRAMEWORK(nostack_6) 183 | MERGE_FRAMEWORK(nostack_7) 184 | MERGE_FRAMEWORK(nostack_8) 185 | MERGE_FRAMEWORK(nostack_9) 186 | MERGE_FRAMEWORK(nostack_10) 187 | -------------------------------------------------------------------------------- /resource/resnet/resnet.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":10958,"kernel_latency":{"fused_nn_softmax_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_dense_add_kernel0":{"total_latency":56, "latency":[26,26,28,32,32,32,34,38,40,40,42,44,46,50,52,52,58,56,54,56,58,60,60,64,68,56,60,60,58,64,66,64]},"fused_nn_global_avg_pool2d_kernel0":{"total_latency":24, "latency":[22,24,22,22,24,24,24,24,24,26,28,28,28,24,28,24,28,24,28,26,26,30,30,28,28,30,30,30,30,30,30,30]},"fused_nn_conv2d_add_multiply_add_nn_relu_kernel0":{"total_latency":84, "latency":[76,76,84,96]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":200, "latency":[150,204]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":52, "latency":[50,56]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,18,18,18,18,18,20,20,20,20,22]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel1":{"total_latency":56, "latency":[34,36,42,46,54,64,70,70]},"fused_nn_conv2d_kernel0":{"total_latency":32, "latency":[18,20,22,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel2":{"total_latency":16, "latency":[16,16,16,16,18,18,18,18]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":44, "latency":[30,32,34,36,38,40,44,48,54,56,62,64,68,72,76,80]},"fused_nn_conv2d_2_kernel0":{"total_latency":124, "latency":[98,114]},"fused_add_nn_relu_3_kernel0":{"total_latency":20, "latency":[18,18,18,20,20,22,22,22]},"fused_nn_conv2d_add_1_kernel0":{"total_latency":60, "latency":[50,58]},"fused_nn_conv2d_add_add_nn_relu_kernel0":{"total_latency":88, "latency":[50,86]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel0":{"total_latency":20, "latency":[18,20,22,24,26,28,30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel1":{"total_latency":28, "latency":[18,20,22,24]},"fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":24, "latency":[14,14,14,16,16,18,18,18]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":24, "latency":[18,18,18,20,20,20,22,22,24,26,26,26,28,28,30,30]},"fused_nn_conv2d_add_2_kernel0":{"total_latency":40, "latency":[36,40,48,46]},"fused_nn_conv2d_add_3_kernel0":{"total_latency":36, "latency":[20,24,26,28,32,32,34]},"fused_add_14_kernel0":{"total_latency":16, "latency":[18,14,14,14,14,14,16,14]},"fused_nn_conv2d_add_add_nn_relu_1_kernel0":{"total_latency":40, "latency":[36,40,48]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":104, "latency":[102,102,110,116]},"fused_nn_conv2d_1_kernel0":{"total_latency":100, "latency":[78,98]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":136, "latency":[100,130,180,236,358,416,454,478]},"fused_add_nn_relu_2_kernel0":{"total_latency":16, 
"latency":[16,16,16,16,16,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel1":{"total_latency":48, "latency":[36,52]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":64, "latency":[62,68,72,90]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":80, "latency":[60,78,78,90]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel2":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16,16,16,16,16,16,16]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,14,16,18,18,20,18,18,20,22,22,20,22,22,24,24]},"fused_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel2":{"total_latency":16, "latency":[14,16,16,16,16,16,16,16,16,16,16,16,18,18]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":500, "latency":[192,356]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel2":{"total_latency":16, "latency":[14,14,14,14,14,16,16,16,16,16,16,16,16,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,16,18,18,18,18,18,20,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel1":{"total_latency":68, "latency":[62,66,70,74,80,88,102,116]},"fused_nn_conv2d_3_kernel0":{"total_latency":408, "latency":[240,254]},"fused_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_add_nn_relu_2_kernel0":{"total_latency":40, "latency":[20,22,24,28,30,32,36]},"fused_nn_conv2d_add_kernel0":{"total_latency":84, "latency":[76,76,82,96]}}} -------------------------------------------------------------------------------- /resource/resnet152/resnet152.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":13233,"kernel_latency":{"fused_nn_dense_add_kernel0":{"total_latency":56, "latency":[26,26,26,28,28,30,32,34,34,36,38,40,40,44,46,48,52,54,56,60,60,62,60,62,66,58,56,56,58,64,74,64]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[12,12,14,14,14,14,14,14]},"fused_nn_conv2d_add_kernel0":{"total_latency":60, "latency":[58,70,78,84,94,94,104,110,100,104,100,98,102,100,100,102]},"fused_nn_conv2d_3_kernel0":{"total_latency":136, "latency":[108,120,132,142,150,156,162,168,168,164,168,168,170,178,176,182,180,184,184,188,194,198,204,212,220,226,236,246]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":240, "latency":[240,240,256,262,282,310,310,306,308,310,310,312,310,310,314,310]},"fused_nn_softmax_kernel0":{"total_latency":24, "latency":[22]},"fused_nn_global_avg_pool2d_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,16,16,16,16,16,16,16,16]},"fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":84, "latency":[82,84,90,96,106,114,130,148]},"fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":76, "latency":[74,78,86,94,106,114,126,134]},"fused_nn_conv2d_kernel0":{"total_latency":24, "latency":[22,24,26,30]},"fused_nn_conv2d_add_add_nn_relu_2_kernel0":{"total_latency":24, "latency":[22,24,28,30,32]},"fused_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":204, 
"latency":[200,282,362,470,584,698,700,700]},"fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":84, "latency":[56,88,122,156,192,228,228]},"fused_nn_conv2d_add_3_kernel0":{"total_latency":20, "latency":[20,22,26,28]},"fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":24, "latency":[22,24,26,28]},"fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,14,16,16,16,16]},"fused_nn_conv2d_add_multiply_add_nn_relu_kernel0":{"total_latency":60, "latency":[58,70,78,86,92,104,106,108,102,102,102,102,102,100,98,100]},"fused_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_conv2d_add_add_nn_relu_kernel0":{"total_latency":36, "latency":[34,34,34,36,36,40,44,46]},"fused_nn_conv2d_add_1_kernel0":{"total_latency":36, "latency":[34,34,34,36,36,38,40,44]},"fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":36, "latency":[36,36,36,36]},"fused_nn_conv2d_2_kernel0":{"total_latency":88, "latency":[88,80,86,92,106,116,128,138]},"fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":100, "latency":[100,160,230,300,370,440,440]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":160, "latency":[154,166,162,170,166,164,158,168,156,162,156,162,170,176,186,188,206,210,228,234,262,266,282,298,314,316,324,340,356,358,366,376]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":116, "latency":[116,116,118,120,124,128,130,134,138,142,146,148,154,156,162,166,174,178,184,190,196,202,208,212]},"fused_nn_conv2d_add_add_nn_relu_1_kernel0":{"total_latency":24, "latency":[18,20,22,22,22,22,22,22]},"fused_nn_conv2d_add_2_kernel0":{"total_latency":24, "latency":[18,20,20,22,22,22,20,20]},"fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":36, "latency":[36,48,58,72,86]},"fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":52, "latency":[50,54,64,74]},"fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":32, "latency":[32,36,42,42,42,40,40,40]},"fused_add_14_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_add_nn_relu_3_kernel0":{"total_latency":16, "latency":[12,14,14,14,14,14,14,16]},"fused_nn_conv2d_1_kernel0":{"total_latency":12, "latency":[10,10,12,12]},"fused_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]}}} -------------------------------------------------------------------------------- /resource/resnet18/resnet18.param: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SJTU-IPADS/reef/58dabe0a63fe6979349b358a78aa324cca050e4a/resource/resnet18/resnet18.param -------------------------------------------------------------------------------- /resource/resnet18/resnet18.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":2189,"kernel_latency":{"fused_nn_softmax_kernel0":{"total_latency":28, "latency":[28]},"fused_nn_dense_add_kernel0":{"total_latency":16, "latency":[14,14,16,14,14,14,14,14,16,18,16,16,16,16,16,16,18,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16]},"fused_nn_global_avg_pool2d_kernel0":{"total_latency":24, "latency":[22,22,26,22,24,22,28,24,28,22,28,28,24,24,26,26,28,28,30,30,30,26,26,28,28,28,28,28,30,30,30,30]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_multiply_add_nn_re_11882905421691233276__kernel2":{"total_latency":16, "latency":[14,16,16,16,16,16,16,16,16,16,18,18,18,18,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel2":{"total_latency":16, 
"latency":[14,14,14,14,16,14,16,16,16,16,16,16,16,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel1":{"total_latency":68, "latency":[62,64,70,74,80,88,102,118]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_multiply_add_nn_re_11882905421691233276__kernel1":{"total_latency":68, "latency":[62,64,70,76,80,88,102,116]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_kernel0":{"total_latency":16, "latency":[16,14,16,16,16,16,18,18,18,18,20,18,20,20,20,20]},"fused_nn_conv2d_3_kernel0":{"total_latency":48, "latency":[44,46,56,64,62,72,68,76,76,80,80,80,78,78,78,78,78,78,78,78]},"fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":332, "latency":[250,350,402,530,650,768,884,998]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_kernel2":{"total_latency":16, "latency":[16,16,16,16,16,16,16,16,16,18,18,18,18,18]},"fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":176, "latency":[174,310]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_kernel1":{"total_latency":68, "latency":[62,66,70,74,80,88,102,116]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2_kernel1":{"total_latency":28, "latency":[18,20,22,24]},"fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":500, "latency":[188,356]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_3_kernel1":{"total_latency":28, "latency":[18,20,24,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_kernel1":{"total_latency":72, "latency":[34,38,42,46,54,64,70,70]},"fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":180, "latency":[176,284,408]},"fused_nn_conv2d_1_kernel0":{"total_latency":76, "latency":[52,74]},"fused_add_nn_relu_3_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,16]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_3_kernel0":{"total_latency":24, "latency":[18,22,22,24,26,32,32,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_2_kernel2":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16,16,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[16,14,16,16,16,18,18,18,18,18,18,20,20,20,20,22]},"fused_nn_batch_flatten_kernel0":{"total_latency":16, "latency":[14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel1":{"total_latency":56, "latency":[34,36,42,46,54,66,70,70]},"fused_nn_conv2d_kernel0":{"total_latency":24, "latency":[16,18,18,18,20,20,20,22,24,24,26,26,28,28,28,30]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel2":{"total_latency":16, "latency":[16,16,16,18,18,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_1_kernel2":{"total_latency":16, "latency":[14,16,16,16,16,16,16,18,16,16,18,18,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16,18,18,18,18,20,20,20,20]},"fused_add_10_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":24, "latency":[14,14,14,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel0":{"total_latency":20, "latency":[18,20,22,24,26,28,30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_3_kernel2":{"total_latency":20, 
"latency":[16,18,22,18,18,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3_kernel1":{"total_latency":28, "latency":[18,20,22,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_2_kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,18,18,18,20,20,20,20,22,22,22]},"fused_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,14,14,16,16,14,14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_multiply_add_nn_re_11882905421691233276__kernel0":{"total_latency":16, "latency":[14,14,16,16,16,16,16,16,18,18,18,18,20,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1_kernel1":{"total_latency":48, "latency":[38,52]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_2_kernel1":{"total_latency":48, "latency":[36,48]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2_kernel0":{"total_latency":20, "latency":[18,20,22,24,26,28,30,32]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1_kernel2":{"total_latency":16, "latency":[16,14,16,16,18,16,16,18,16,16,16,16,18,18]},"fused_nn_conv2d_2_kernel0":{"total_latency":28, "latency":[26,28,30,30]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel1":{"total_latency":48, "latency":[36,50]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,16,16,16,18,18,20,18,18,20,20,20,22,24,22,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_kernel2":{"total_latency":16, "latency":[14,14,14,16,16,16,16,18,16,16,18,18,18,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2_kernel2":{"total_latency":20, "latency":[16,18,18,20,18,20,20,20]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,18,18,18,18,18,20,20,22,20,22]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_1_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,16,18,18,18,18,18,20,22,20,20,22]},"fused_add_nn_relu_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_1_kernel1":{"total_latency":56, "latency":[34,36,42,48,54,66,70,70]},"fused_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[14,16,16,16,16,18,18,18,20,20,20,20,22,22,22,24]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2_kernel2":{"total_latency":16, "latency":[14,14,16,16,16,16,16,18,16,16,16,16,18,18]},"fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1_kernel2":{"total_latency":16, "latency":[14,14,16,14,16,16,16,16,16,16,16,16,18,18]}}} -------------------------------------------------------------------------------- /resource/vgg/vgg.profile.json: -------------------------------------------------------------------------------- 1 | {"model_latency":4824,"kernel_latency":{"tvmgen_default_fused_nn_batch_flatten_kernel0":{"total_latency":14, "latency":[14,14,14,14,14,14,14,14]},"tvmgen_default_fused_nn_global_avg_pool2d_kernel0":{"total_latency":16, "latency":[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,16,16,16,16,16,16,16,16,16,16]},"tvmgen_default_fused_nn_conv2d_add_multiply_add_nn_relu_kernel0":{"total_latency":62, 
"latency":[62,74,80,88,92,96,110,112,108,110,106,104,104,104,104,104]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_1_kernel0":{"total_latency":164, "latency":[164,170,186,202,186,184,172,188,172,172,164,174,182,186,192,196,204,208,228,236,270,274,292,318,322,344,348,368,370,380,382,406]},"tvmgen_default_fused_add_nn_relu_kernel0":{"total_latency":14, "latency":[14,14,14,14,14,14,14,14]},"tvmgen_default_fused_nn_conv2d_3_kernel0":{"total_latency":140, "latency":[114,132,142,154,160,162,176,188,188,184,184,184,188,194,194,202,194,198,194,198,206,206,212,220,228,232,244,254]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_kernel0":{"total_latency":252, "latency":[254,248,262,270,292,324,324,324,322,322,322,324,326,326,324,330]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_2_kernel0":{"total_latency":120, "latency":[120,120,120,122,126,130,136,138,142,146,150,152,158,160,164,168,176,182,188,194,200,206,210,216]},"tvmgen_default_fused_nn_conv2d_add_add_nn_relu_kernel0":{"total_latency":40, "latency":[38,38,36,38,40,42,44,46]},"tvmgen_default_fused_add_nn_relu_1_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,16,16]},"tvmgen_default_fused_nn_dense_add_kernel0":{"total_latency":60, "latency":[26,26,28,32,32,32,34,36,38,40,42,44,46,50,52,54,60,62,56,58,58,62,62,64,68,56,58,56,58,64,72,64]},"tvmgen_default_fused_nn_conv2d_add_1_kernel0":{"total_latency":40, "latency":[38,38,36,38,40,42,44,46]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_5_kernel0":{"total_latency":80, "latency":[78,82,88,100,110,120,130,142]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_3_kernel0":{"total_latency":204, "latency":[204,288,368,472,586,700,702,700]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_9_kernel0":{"total_latency":88, "latency":[60,90,124,160,194,230,230]},"tvmgen_default_fused_nn_max_pool2d_add_nn_relu_kernel0":{"total_latency":16, "latency":[16,16,22,18,18,18,18,20]},"tvmgen_default_fused_nn_conv2d_2_kernel0":{"total_latency":96, "latency":[92,82,90,98,108,118,130,140]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_11_kernel0":{"total_latency":28, "latency":[26,26,28,32]},"tvmgen_default_fused_add_kernel0":{"total_latency":14, "latency":[14,14,14,14,14,14,14,14]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_12_kernel0":{"total_latency":40, "latency":[36,48,60,76,90]},"tvmgen_default_fused_nn_conv2d_add_add_nn_relu_1_kernel0":{"total_latency":24, "latency":[20,22,24,24,24,24,24,24]},"tvmgen_default_fused_nn_conv2d_kernel0":{"total_latency":24, "latency":[24,28,30,34]},"tvmgen_default_fused_nn_conv2d_add_3_kernel0":{"total_latency":24, "latency":[24,24,28,30]},"tvmgen_default_fused_add_nn_relu_3_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,16,16]},"tvmgen_default_fused_add_nn_relu_2_kernel0":{"total_latency":16, "latency":[16,16,16,16,16,16,16,16]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_8_kernel0":{"total_latency":60, "latency":[60,60,60,58]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_7_kernel0":{"total_latency":36, "latency":[36,42,46,46,46,46,46,46]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_10_kernel0":{"total_latency":56, "latency":[54,58,66,76]},"tvmgen_default_fused_nn_conv2d_add_add_nn_relu_2_kernel0":{"total_latency":28, "latency":[26,26,30,32,36]},"tvmgen_default_fused_nn_softmax_kernel0":{"total_latency":28, "latency":[26]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_6_kernel0":{"total_latency":104, "latency":[104,164,232,302,372,442,442]},"tvmgen_default_fused_nn_conv2d_add_kernel0":{"total_latency":62, 
"latency":[62,74,80,88,96,106,122,112,106,108,104,106,106,106,104,106]},"tvmgen_default_fused_nn_conv2d_add_nn_relu_4_kernel0":{"total_latency":84, "latency":[84,88,92,100,106,118,136,150]},"tvmgen_default_fused_nn_conv2d_1_kernel0":{"total_latency":144, "latency":[60,104,110,142]},"tvmgen_default_fused_nn_conv2d_add_2_kernel0":{"total_latency":24, "latency":[20,22,24,24,24,24,24,24]}}} -------------------------------------------------------------------------------- /script/best_effort_kernel.py: -------------------------------------------------------------------------------- 1 | import sys, json 2 | from transform_kernel import replace_global_with_device,replace_blockIdx_with_task_idx, find_all_func_params 3 | from transform_kernel import add_device_function_param,add_global_definition, generate_function_declaration 4 | 5 | 6 | def generate_global_wrappers(func_params): 7 | result = [] 8 | for func_name, func_param in func_params.items(): 9 | params = [ 10 | {"type": "int*", "name": "preempted"}, 11 | {"type": "int*", "name": "task_slot"}, 12 | ] 13 | params.extend(func_param) 14 | params_name = [] 15 | params_type_name = [] 16 | for param in func_param: 17 | params_name.append(param["name"]) 18 | for param in params: 19 | params_type_name.append(param["type"] + " " + param["name"]) 20 | params_def = ", ".join(params_type_name) 21 | params_call = ", ".join(params_name) 22 | func_template = """ 23 | extern "C" __global__ void {func_name}({params_def}) {{ 24 | if (*preempted) return; 25 | {func_name}_device({params_call}); 26 | if (threadIdx.x + threadIdx.y + threadIdx.z == 0) 27 | atomicAdd(task_slot, 1); 28 | }} 29 | """.format( 30 | func_name = func_name, 31 | params_def = params_def, 32 | params_call = params_call 33 | ) 34 | result.extend(func_template.splitlines(True)) 35 | return result 36 | 37 | if __name__ == "__main__": 38 | 39 | if len(sys.argv) != 2: 40 | print("Usage: python best_effort_kernel.py input_file.cu") 41 | exit(0) 42 | 43 | f = open(sys.argv[1], "r") 44 | lines = f.readlines() 45 | f.close() 46 | 47 | 48 | func_params = find_all_func_params(lines) 49 | lines = replace_global_with_device(lines) 50 | lines.extend(generate_global_wrappers(func_params)) 51 | 52 | output_f_name = sys.argv[1][:sys.argv[1].rfind(".")] + ".be.cu" 53 | 54 | f = open(output_f_name, "w") 55 | f.writelines(lines) 56 | f.close() 57 | -------------------------------------------------------------------------------- /script/estimate_max_throughput.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | import math 4 | 5 | # Steps: 6 | # 1. profile the kernel: rocprof --hsa-trace ./raw_executor code_object schedule.json 1 7 | # 2. delete the first warm up profile result. 8 | # 3. use this file to estimate the potential throughput improvement. 
9 | 10 | 11 | if len(sys.argv) != 2: 12 | print("Usage: python3 estimate_max_throughput.py data.csv") 13 | exit(1) 14 | 15 | num_cu = 80 16 | 17 | df = pd.read_csv(sys.argv[1]) 18 | df = df[["grd", "wgr", "DurationNs"]] 19 | df['blocks'] = df['grd'] // df['wgr'] 20 | 21 | df['cus'] = df['blocks'].map(lambda x: num_cu - math.ceil(x / math.ceil((x / num_cu)))) 22 | df['remain'] = df['cus'] * df['DurationNs'] 23 | total = df['DurationNs'].sum() * num_cu 24 | remain = df['remain'].sum() 25 | 26 | print("%f%%" % (remain / total * 100)) -------------------------------------------------------------------------------- /script/estimate_resource_usage.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | import math 4 | import subprocess 5 | import json 6 | 7 | 8 | if len(sys.argv) < 2: 9 | print("Usage: python3 estimate_resource_usage.py model_dir") 10 | exit(1) 11 | 12 | for i in range(1, len(sys.argv)): 13 | model_dir = sys.argv[i] 14 | if (model_dir.find("Makefile") != -1): 15 | continue 16 | _, model_name = subprocess.getstatusoutput("basename " + model_dir) 17 | 18 | model_profile = json.loads(open(model_dir + "/" + model_name + ".profile.json", "r").read()) 19 | model_schedule = json.loads(open(model_dir + "/" + model_name + ".json", "r").read()) 20 | 21 | num_cu = 60 22 | 23 | total = 0 24 | used = 0 25 | 26 | for kernel_info in model_schedule["kernels"]: 27 | blocks = kernel_info["launch_params"][0] * kernel_info["launch_params"][1] * kernel_info["launch_params"][2] 28 | cus = math.ceil(blocks / math.ceil((blocks / num_cu))) 29 | latency = model_profile[kernel_info["name"]]["total_latency"] 30 | total += latency * num_cu 31 | used += latency * cus 32 | 33 | print("%s: %f%%" % (model_name, used / total * 100)) -------------------------------------------------------------------------------- /script/generate_final_schedule.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | ############################################################################ 5 | # 6 | # This script is used to combine the two json files generated from TVM 7 | # to a final schedule json file which can be used by the dnn_rt_scheduler. 8 | # 9 | # Usage: python generate_schedule.py raw_schedule_file.json graph_json.json 10 | # 11 | # The first json file(raw_schedule_file.json) is generated by TVM runtime, 12 | # which contains the basic kernel schedule and kernel parameters. 13 | # 14 | # The second json file(graph_json.json) is generated by TVM backend(modified version), 15 | # which contains host function information and device stroage information. 16 | # 17 | ############################################################################ 18 | def generate_final_schedule(source_code_lines, schedule_raw, graph): 19 | def split_function_declaration(line): 20 | parts = line.split("(") 21 | parameters_str = parts[1].split(")")[0] 22 | left_parts = parts[0].split(" ") 23 | name = left_parts[-1] 24 | return_type = left_parts[-2] 25 | header = " ".join(left_parts[:-2]) 26 | parameter_str_list = parameters_str.split(", ") 27 | parameters = [] 28 | for param_str in parameter_str_list: 29 | parts = param_str.split(" ") 30 | param_name = parts[-1] 31 | param_type = " ".join(parts[:-1]) 32 | parameters.append({"name": param_name, "type": param_type}) 33 | return header, return_type, name, parameters 34 | # 1. 
storage info from graph_json 35 | 36 | storage_id = graph["attrs"]["storage_id"][1] 37 | ## FIXME: a hack here 38 | ## to avoid buffer reuse, we replace storage_id to itself. 39 | for i in range(len(storage_id)): 40 | storage_id[i] = i 41 | 42 | storage = [] 43 | for i in range(max(storage_id) + 1): 44 | storage.append({"name": "null", "size": 0, "stype": "null"}) 45 | 46 | arg_idx = [] 47 | 48 | for i in range(len(storage_id)): 49 | shape = graph["attrs"]["shape"][1][i] 50 | t = graph["attrs"]["dltype"][1][i] 51 | size = 1 52 | for j in shape: 53 | size = size * j 54 | sid = storage_id[i] 55 | if storage[sid]["size"] < size: 56 | storage[sid]["size"] = size 57 | storage[sid]["stype"] = t 58 | 59 | for a in graph["arg_nodes"]: 60 | sid = storage_id[a] 61 | name = graph["nodes"][a]["name"] 62 | storage[sid]["name"] = name 63 | arg_idx.append(sid) 64 | 65 | # 2. append dynamic allocated storage 66 | temp_storage_begin = len(storage) 67 | for temp_arg in schedule_raw["temp_args"]: 68 | storage.append({"name": "temp_arg", "size": temp_arg, "stype": "byte"}) 69 | 70 | # 3. remap the kernel args 71 | i = 0 72 | kernels = [] 73 | node_row_ptr = graph["node_row_ptr"] 74 | for j in range(len(graph["nodes"])): 75 | node = graph["nodes"][j] 76 | if node["op"] == "null": 77 | continue 78 | if node["attrs"]["func_name"] == "__nop": 79 | continue 80 | 81 | schedule_func = schedule_raw["funcs"][i] 82 | while len(schedule_func["kernels"]) == 0: 83 | i = i + 1 84 | schedule_func = schedule_raw["funcs"][i] 85 | 86 | if schedule_func["name"] != node["attrs"]["func_name"]: 87 | raise Exception("schedule name != node name, %s != %s" % (schedule_func["name"],node["name"])) 88 | # if node["attrs"]["num_outputs"] != "1": 89 | # print(node["attrs"]["num_outputs"]) 90 | # raise Exception("node output != 1") 91 | host_inputs = [] 92 | for inp in node["inputs"]: 93 | host_inputs.append(node_row_ptr[inp[0]]+inp[1]) 94 | for idx in range(int(node["attrs"]["num_outputs"])): 95 | host_inputs.append(node_row_ptr[j]+idx) 96 | for kernel in schedule_func["kernels"]: 97 | new_args = [] 98 | for arg in kernel["args"]: 99 | if arg < 0: 100 | new_args.append(temp_storage_begin-arg-1) 101 | else: 102 | new_args.append(storage_id[host_inputs[arg]]) 103 | kernels.append({"name": kernel["name"], "launch_params": kernel["launch_params"], "args": new_args}) 104 | i = i+1 105 | 106 | output_idx = graph["heads"][0][0] 107 | storage[storage_id[output_idx]]["name"] = "output" 108 | 109 | schedule = { 110 | "storage": storage, 111 | "kernels": kernels, 112 | "args": arg_idx 113 | } 114 | 115 | # 4. 
generate shared memory usage 116 | 117 | func_name = "" 118 | shared_memory = 0 119 | 120 | result = {} 121 | 122 | for line in source_code_lines: 123 | if line.find("void") != -1: 124 | # save old values 125 | if func_name != "": 126 | if shared_memory < 4: 127 | shared_memory = 4 128 | result[func_name] = shared_memory 129 | 130 | _, _, curr_func_name, _ = split_function_declaration(line) 131 | func_name = curr_func_name 132 | shared_memory = 4 133 | if line.find("__shared__") != -1: 134 | # __shared__ float x[123]; 135 | size = line.split("[")[1].split("]")[0] 136 | shared_memory = shared_memory + int(size) * 4 137 | 138 | if func_name != "": 139 | if shared_memory < 4: 140 | shared_memory = 4 141 | result[func_name] = shared_memory 142 | 143 | schedule["shared_memory"] = result 144 | return schedule 145 | 146 | 147 | if __name__ == "__main__": 148 | if len(sys.argv) != 4: 149 | print("Useage: source_code raw_scheduler_file graph_json_file ") 150 | exit(0) 151 | 152 | f = open(sys.argv[1], "r") 153 | source_code_lines = f.readlines() 154 | f.close() 155 | 156 | f = open(sys.argv[2], "r") 157 | schedule_raw = json.loads(f.read()) 158 | f.close() 159 | 160 | f = open(sys.argv[3], "r") 161 | graph = json.loads(f.read()) 162 | f.close() 163 | 164 | schedule = generate_final_schedule(source_code_lines, schedule_raw, graph) 165 | 166 | print(json.dumps(schedule, indent = 4)) 167 | 168 | -------------------------------------------------------------------------------- /script/generate_register_hint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | def split_function_declaration(line): 5 | parts = line.strip().split("void") 6 | header = parts[0] 7 | return_type = "void" 8 | right_parts = parts[1].split("(") 9 | name = right_parts[0] 10 | parameters_str = right_parts[1].split(")")[0] 11 | parameter_str_list = parameters_str.split(", ") 12 | parameters = [] 13 | for param_str in parameter_str_list: 14 | parts = param_str.split(" ") 15 | param_name = parts[-1] 16 | param_type = " ".join(parts[:-1]) 17 | parameters.append({"name": param_name, "type": param_type}) 18 | print(name) 19 | return header, return_type, name, parameters 20 | 21 | def generate_function_declaration(return_type, name, params): 22 | params_str_list = [] 23 | for param in params: 24 | param_str = param["type"] + " " + param["name"] 25 | params_str_list.append(param_str) 26 | return return_type + " " + name + "(" + ", ".join(params_str_list) + ")" 27 | 28 | 29 | src = open(sys.argv[1], "r") 30 | lines = src.readlines() 31 | 32 | schedule = json.loads(open(sys.argv[2], "r").read()) 33 | 34 | new_lines = [] 35 | 36 | kernel_info = {} 37 | 38 | for kernel in schedule["kernels"]: 39 | kernel_info[kernel["name"]] = kernel["launch_params"][3] * kernel["launch_params"][4] * kernel["launch_params"][5] 40 | 41 | for line in lines: 42 | if line.find("__global__") != -1: 43 | _, _, func_name, params = split_function_declaration(line.strip()) 44 | func_name = func_name.strip() 45 | if func_name in kernel_info: 46 | new_func = line.replace("void", "__attribute__((amdgpu_flat_work_group_size(%d, %d))) void" % (kernel_info[func_name], kernel_info[func_name])) 47 | new_lines.append(new_func) 48 | continue 49 | new_lines.append(line) 50 | 51 | f = open(sys.argv[1], "w") 52 | f.writelines(new_lines) 53 | f.close() 54 | -------------------------------------------------------------------------------- /script/generate_shared_memory_usage.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | ############################################################ 5 | # 6 | # This script is used to insert shared memory usage into the 7 | # model schedule json file. 8 | # 9 | # The shared memory usage is extracted from the device source 10 | # code. 11 | ############################################################ 12 | 13 | if len(sys.argv) != 3: 14 | print("Usage: source_code.cpp schedule_file.json") 15 | exit(0) 16 | 17 | 18 | def split_function_declaration(line): 19 | parts = line.split("(") 20 | parameters_str = parts[1].split(")")[0] 21 | left_parts = parts[0].split(" ") 22 | name = left_parts[-1] 23 | return_type = left_parts[-2] 24 | header = " ".join(left_parts[:-2]) 25 | parameter_str_list = parameters_str.split(", ") 26 | parameters = [] 27 | for param_str in parameter_str_list: 28 | parts = param_str.split(" ") 29 | param_name = parts[-1] 30 | param_type = " ".join(parts[:-1]) 31 | parameters.append({"name": param_name, "type": param_type}) 32 | return header, return_type, name, parameters 33 | 34 | 35 | 36 | source_code_lines = open(sys.argv[1], "r").readlines() 37 | schedule = json.loads(open(sys.argv[2], "r").read()) 38 | 39 | func_name = "" 40 | shared_memory = 0 41 | 42 | result = {} 43 | 44 | for line in source_code_lines: 45 | if line.find("void") != -1: 46 | # save old values 47 | if func_name != "": 48 | if shared_memory < 4: 49 | shared_memory = 4 50 | result[func_name] = shared_memory 51 | 52 | _, _, curr_func_name, _ = split_function_declaration(line) 53 | func_name = curr_func_name 54 | shared_memory = 0 55 | if line.find("__shared__") != -1: 56 | # __shared__ float x[123]; 57 | size = line.split("[")[1].split("]")[0] 58 | shared_memory = shared_memory + int(size) * 4 59 | 60 | if func_name != "": 61 | if shared_memory < 4: 62 | shared_memory = 4 63 | result[func_name] = shared_memory 64 | 65 | schedule["shared_memory"] = result 66 | 67 | old_file_name = sys.argv[2].split(".json")[-2] 68 | new_file_name = old_file_name + "_sm.json" 69 | f = open(new_file_name, "w") 70 | f.write(json.dumps(schedule)) 71 | f.close() -------------------------------------------------------------------------------- /script/get_kernel_descriptor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | f = open(sys.argv[1], "r") 5 | lines = f.readlines() 6 | 7 | descriptors = {} 8 | 9 | is_descriptor = False 10 | 11 | for line in lines: 12 | if line.find(".amdhsa_kernel ") != -1: 13 | is_descriptor = True 14 | continue 15 | if line.find(".end_amdhsa_kernel") != -1: 16 | is_descriptor = False 17 | continue 18 | if is_descriptor == False: 19 | continue 20 | parts = line.strip().split(" ") 21 | key = parts[0] 22 | value = parts[1] 23 | 24 | if key in descriptors: 25 | values = descriptors[key] 26 | values.append(value) 27 | descriptors[key] = list(set(values)) 28 | else: 29 | values = [] 30 | values.append(value) 31 | descriptors[key] = values 32 | 33 | 34 | print(json.dumps(descriptors, sort_keys=True, indent=4)) 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /script/get_kernel_occupancy.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if len(sys.argv) != 3: 4 | print("Usage: python3 get_kernel_occupancy.py raw.s trans.s") 5 | exit(1) 6 | 7 | raw_asm = open(sys.argv[1], "r") 8 | trans_asm = 
open(sys.argv[2], "r") 9 | 10 | raw_lines = raw_asm.readlines() 11 | trans_lines = trans_asm.readlines() 12 | 13 | def get_occupancy(lines): 14 | current_kernel = "" 15 | occupancy = {} 16 | stack_size = {} 17 | for line in lines: 18 | if line.find(".amdhsa_kernel") != -1: 19 | kernel_name = line.strip().split(" ")[-1] 20 | current_kernel = kernel_name 21 | if line.find("Occupancy") != -1: 22 | occupancy[current_kernel] = int(line.strip().split(" ")[-1]) 23 | if line.find("ScratchSize") != -1: 24 | stack_size[current_kernel] = int(line.strip().split(" ")[-1]) 25 | 26 | return occupancy, stack_size 27 | 28 | 29 | raw_occupancy, raw_stack = get_occupancy(raw_lines) 30 | trans_occupancy, trans_stack = get_occupancy(trans_lines) 31 | 32 | print("Occupancy:") 33 | for kernel, occupancy in raw_occupancy.items(): 34 | if occupancy > trans_occupancy[kernel]: 35 | print("%s: %d, %d" %(kernel, occupancy, trans_occupancy[kernel])) 36 | 37 | print("Stack:") 38 | for kernel, stack in raw_stack.items(): 39 | if stack < trans_stack[kernel]: 40 | print("%s: %d, %d" %(kernel, stack, trans_stack[kernel])) -------------------------------------------------------------------------------- /script/replace_raw_occupancy.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | if len(sys.argv) != 3: 5 | print("Usage: python3 replace_raw_occupancy.py raw_source.cu trans_asm.s") 6 | exit(1) 7 | def get_occupancy(lines): 8 | current_kernel = "" 9 | occupancy = {} 10 | stack_size = {} 11 | for line in lines: 12 | if line.find(".amdhsa_kernel") != -1: 13 | kernel_name = line.strip().split(" ")[-1] 14 | current_kernel = kernel_name 15 | if line.find("Occupancy") != -1: 16 | occupancy[current_kernel] = int(line.strip().split(" ")[-1]) 17 | if line.find("ScratchSize") != -1: 18 | stack_size[current_kernel] = int(line.strip().split(" ")[-1]) 19 | 20 | return occupancy, stack_size 21 | 22 | asm = open(sys.argv[2], "r") 23 | asm_lines = asm.readlines() 24 | asm.close() 25 | 26 | asm_occupancy, _ = get_occupancy(asm_lines) 27 | 28 | raw_source = open(sys.argv[1], "r") 29 | raw_lines = raw_source.readlines() 30 | raw_source.close() 31 | 32 | new_lines = [] 33 | 34 | for line in raw_lines: 35 | if line.find('__global__') != -1: 36 | parts = line.split("void") 37 | right_part = parts[1] 38 | func_name = right_part.split("(")[0].strip() 39 | left_part = 'extern "C" __global__ __attribute__((amdgpu_waves_per_eu(%d,%d))) void ' % (asm_occupancy[func_name], asm_occupancy[func_name]) 40 | new_line = left_part + right_part 41 | new_lines.append(new_line) 42 | continue 43 | new_lines.append(line) 44 | 45 | raw_source = open(sys.argv[1], "w") 46 | raw_source.writelines(new_lines) 47 | raw_source.close() 48 | 49 | -------------------------------------------------------------------------------- /script/replace_register_usage.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if len(sys.argv) != 2: 4 | print("Usage: python3 replace_register_usage.py source.asm") 5 | exit(0) 6 | 7 | f = open(sys.argv[1], "r") 8 | lines = f.readlines() 9 | 10 | # collect max private segment size 11 | max_private_segment = 0 12 | for line in lines: 13 | if line.find(".amdhsa_private_segment_fixed_size") != -1: 14 | parts = line.strip().split(" ") 15 | key = parts[0] 16 | value = int(parts[1]) 17 | if value > max_private_segment: 18 | max_private_segment = value 19 | 20 | need_private_segment = 0 21 | if max_private_segment > 0: 22 | need_private_segment = 1
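# ---------------------------------------------------------------------------
# The sgprs_layers / vgprs_layers tables below give, for a target occupancy
# n (the numeric suffix of the *_framework_* / proxy_kernel_* names), the
# register count written back into the kernel descriptor so that, roughly,
# n waves can co-reside on one SIMD.  The helper below is only a sketch /
# sanity check of that budget; it assumes GCN-like per-SIMD limits (16K
# VGPRs across 64 lanes, i.e. 256 per lane; 800 SGPRs; at most 10 waves per
# SIMD), mirroring the constants defined below and
# GPUConfig::calculate_occupancy in src/reef/executor/hip/hip_impl.cpp.
def estimated_waves_per_simd(vgprs, sgprs):
    waves_by_vgpr = 256 // max(vgprs, 1)   # VGPR-limited waves per SIMD
    waves_by_sgpr = 800 // max(sgprs, 1)   # SGPR-limited waves per SIMD
    return min(waves_by_vgpr, waves_by_sgpr, 10)
# e.g. estimated_waves_per_simd(64, 102) == 4, consistent with
# vgprs_layers[4] == 64 below; the hand-tuned table entries may be slightly
# more permissive than this naive bound (28 VGPRs is kept for both 9 and 10).
# ---------------------------------------------------------------------------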
23 | 24 | 25 | max_sgprs_per_simd = 800 26 | max_vgprs_per_smid = 16 * 1024 27 | 28 | max_sgprs_per_wave = 102 29 | max_vgprs_per_wave = 256 30 | 31 | sgpr_block = 8 32 | 33 | sgprs_layers = [ 34 | 0, 35 | 102, 36 | 102, 37 | 102, 38 | 102, 39 | 102, 40 | 102, 41 | 102, 42 | 102, 43 | 88, 44 | 80 45 | ] 46 | 47 | vgprs_layers = [ 48 | 0, 49 | 256, 50 | 128, 51 | 84, 52 | 64, 53 | 48, 54 | 40, 55 | 36, 56 | 32, 57 | 28, 58 | 28 59 | ] 60 | 61 | 62 | 63 | def replace_text_segment_param(lines, key, value, values, key_word="_framework_"): 64 | new_lines = [] 65 | 66 | current_kernel = "" 67 | 68 | for line in lines: 69 | # replace amdhas_next_free_vgpr 70 | if line.find(".amdhsa_kernel") != -1: 71 | kernel_name = line.strip().split(" ")[-1] 72 | current_kernel = kernel_name 73 | new_lines.append(line) 74 | elif line.find(key) != -1 and current_kernel.find(key_word) != -1 and current_kernel.split(key_word)[-1].isnumeric(): 75 | num_layers = int(current_kernel.split(key_word)[-1]) 76 | new_value= "" 77 | if value == None and values == None: 78 | # print("remove %s %s" % (current_kernel, key)) 79 | continue 80 | if value != None: 81 | new_value = value 82 | else: 83 | new_value = values[num_layers] 84 | new_line = " %s %d\n" % (key, new_value) 85 | # print("replace %s %s to %d" % (current_kernel, key, new_value)) 86 | new_lines.append(new_line) 87 | else: 88 | new_lines.append(line) 89 | return new_lines 90 | 91 | def replace_symbol_segment_param(lines, key, value, values, key_word="_framework_"): 92 | new_lines = [] 93 | 94 | current_kernel = "" 95 | 96 | for line in lines: 97 | # replace amdhas_next_free_vgpr 98 | if line.find(".name:") != -1: 99 | kernel_name = line.strip().split(" ")[-1] 100 | current_kernel = kernel_name 101 | new_lines.append(line) 102 | elif line.find(key) != -1 and current_kernel.find(key_word) != -1 and current_kernel.split(key_word)[-1].isnumeric(): 103 | num_layers = int(current_kernel.split(key_word)[-1]) 104 | new_value= "" 105 | if value != None: 106 | new_value = value 107 | else: 108 | new_value = values[num_layers] 109 | new_line = " %s %d\n" % (key, new_value) 110 | # print("replace %s %s to %d" % (current_kernel, key, new_value)) 111 | new_lines.append(new_line) 112 | else: 113 | new_lines.append(line) 114 | return new_lines 115 | 116 | def batch_replace(lines, key_word, private_segment): 117 | lines = replace_text_segment_param(lines, ".amdhsa_next_free_vgpr", None, vgprs_layers, key_word) 118 | lines = replace_text_segment_param(lines, ".amdhsa_next_free_sgpr", None, sgprs_layers, key_word) 119 | if private_segment: 120 | lines = replace_text_segment_param(lines, ".amdhsa_private_segment_fixed_size", max_private_segment, None, key_word) 121 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_private_segment_wavefront_offset", need_private_segment, None, key_word) 122 | # lines = replace_text_segment_param(lines, ".amdhsa_user_sgpr_flat_scratch_init", 0, None) 123 | lines = replace_text_segment_param(lines, ".amdhsa_user_sgpr_dispatch_ptr", 0, None, key_word) 124 | lines = replace_text_segment_param(lines, ".amdhsa_system_vgpr_workitem_id", 0, None, key_word) 125 | lines = replace_text_segment_param(lines, ".amdhsa_reserve_flat_scratch", None, None, key_word) 126 | lines = replace_text_segment_param(lines, ".amdhsa_reserve_vcc", None, None, key_word) 127 | 128 | lines = replace_symbol_segment_param(lines, ".vgpr_count:", None, vgprs_layers, key_word) 129 | lines = replace_symbol_segment_param(lines, ".sgpr_count:", None, sgprs_layers, key_word) 
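    # private_segment is False for the *_nostack_* variants (see the
    # batch_replace calls below), so their descriptors keep whatever
    # private-segment size they already had (presumably zero) instead of
    # being forced up to max_private_segment.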
130 | if private_segment: 131 | lines = replace_symbol_segment_param(lines, ".private_segment_fixed_size:", max_private_segment, None, key_word) 132 | 133 | return lines 134 | 135 | lines = batch_replace(lines, "merge_framework_", True) 136 | lines = batch_replace(lines, "call_framework_", True) 137 | lines = batch_replace(lines, "merge_framework_nostack_", False) 138 | lines = batch_replace(lines, "proxy_kernel_", True) 139 | lines = batch_replace(lines, "proxy_kernel_nostack_", False) 140 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_workgroup_id_y", 1, None, "proxy_kernel_") 141 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_workgroup_id_z", 1, None, "proxy_kernel_") 142 | lines = replace_text_segment_param(lines, ".amdhsa_system_vgpr_workitem_id", 2, None, "proxy_kernel_") 143 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_workgroup_id_y", 1, None, "proxy_kernel_nostack_") 144 | lines = replace_text_segment_param(lines, ".amdhsa_system_sgpr_workgroup_id_z", 1, None, "proxy_kernel_nostack_") 145 | lines = replace_text_segment_param(lines, ".amdhsa_system_vgpr_workitem_id", 2, None, "proxy_kernel_nostack_") 146 | f.close() 147 | f = open(sys.argv[1], "w") 148 | f.writelines(lines) 149 | f.close() 150 | # replace vgpr_count -------------------------------------------------------------------------------- /script/tvm_generate_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from tvm import relay 4 | from tvm.relay import testing 5 | import tvm 6 | from tvm import te 7 | from tvm.contrib import graph_runtime 8 | import sys 9 | import json 10 | 11 | ##################################################### 12 | # 13 | # This is an example of how to generate source code 14 | # and schedule json from tvm. 
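# The four files written below are: the HIP device source of the compiled
# kernels, the raw per-kernel schedule json exported by the (modified) TVM
# runtime via get_schedule_json, the graph json describing storage and
# argument layout, and a packed float32 parameter blob in the simple
# "TVM_MODEL_PARAMS\0" format that REEF's ModelParamParser reads.
# A typical invocation (file names are only illustrative):
#   python3 tvm_generate_model.py resnet18.cu resnet18.raw.json resnet18.graph.json resnet18.param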
15 | # 16 | ##################################################### 17 | 18 | 19 | if len(sys.argv) != 5: 20 | print("Usage: device_source_file_name raw_schedule_file graph_json_file param_file") 21 | exit(0) 22 | 23 | source_file = open(sys.argv[1], "w") 24 | raw_schedule_file = open(sys.argv[2], "w") 25 | graph_json_file = open(sys.argv[3], "w") 26 | param_file = open(sys.argv[4], "w+b") 27 | 28 | batch_size = 1 29 | num_class = 1000 30 | image_shape = (3, 224, 224) 31 | data_shape = (batch_size,) + image_shape 32 | out_shape = (batch_size, num_class) 33 | 34 | mod, params = relay.testing.resnet.get_workload( 35 | num_layers=18, batch_size=batch_size, image_shape=image_shape 36 | ) 37 | # mod, params = relay.testing.mobilenet.get_workload( 38 | # batch_size=batch_size, image_shape=image_shape 39 | # ) 40 | 41 | opt_level = 3 42 | target = tvm.target.rocm() 43 | 44 | with tvm.transform.PassContext(opt_level=opt_level): 45 | lib = relay.build(mod, target, params=params) 46 | 47 | ctx = tvm.rocm() 48 | module = graph_runtime.GraphModule(lib["default"](ctx)) 49 | 50 | data = np.ones(data_shape).astype("float32") 51 | data = data * 10 52 | module.set_input("data", data) 53 | 54 | module.run() 55 | 56 | source_file.write(lib.get_lib().imported_modules[0].get_source("hip")) 57 | source_file.close() 58 | 59 | graph_json_file.write(lib.get_json()) 60 | graph_json_file.close() 61 | 62 | raw_schedule_file.write(module.module["get_schedule_json"]()) 63 | raw_schedule_file.close() 64 | 65 | 66 | def dump_params(params, f): 67 | import array 68 | magic = bytes("TVM_MODEL_PARAMS\0", "ascii") 69 | f.write(magic) 70 | f.write(array.array('Q',[len(params)]).tobytes()) 71 | for k in params.keys(): 72 | param = array.array('f', params[k].asnumpy().flatten().tolist()) 73 | f.write(bytes(k, "ascii")) 74 | f.write(bytes("\0", "ascii")) 75 | f.write(array.array('Q',[len(param)]).tobytes()) 76 | f.write(param.tobytes()) 77 | 78 | dump_params(params, param_file) 79 | param_file.close() 80 | 81 | out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy() 82 | print(out.flatten()[0:10]) -------------------------------------------------------------------------------- /src/example/rpc_client.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/client/client.h" 2 | 3 | int main(int argc, char** argv) { 4 | if (argc != 4) { 5 | std::cerr << "Usage: " << std::string(argv[0]) << " model_dir model_name [real_time]\n"; 6 | std::cerr << "Example: " << std::string(argv[0]) << " reef/resource/resnet18 resnet18 1\n"; 7 | return -1; 8 | } 9 | 10 | std::string model_dir(argv[1]); 11 | std::string model_name(argv[2]); 12 | int real_time = std::atoi(argv[3]); 13 | 14 | 15 | reef::client::REEFClient client(DEFAULT_REEF_ADDR); 16 | ASSERT(client.init(real_time)); // whether this client send real-time requests? 17 | 18 | std::cout << "loading '" << model_name << "' from " << "'"<< model_dir << "'\n"; 19 | auto model = client.load_model(model_dir, model_name); 20 | ASSERT(model.get() != nullptr); 21 | 22 | // Get or set the input/output data. 
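    // A typical flow (sketch; this example keeps the calls below commented
    // out): map the input blob with get_input_blob(), copy the input tensor
    // into that shared memory, call load_input() so the server picks it up,
    // run infer(), then fetch the result via get_output_blob()/get_output().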
23 | // auto input_blob = model->get_input_blob(); 24 | // model->load_input(); 25 | // auto output_blob = model->get_output_blob(); 26 | // auto output = model->get_output(); 27 | 28 | std::cout << "submit inference request\n"; 29 | 30 | auto task = model->infer(); // submit an inference request 31 | std::cout << "inference latency: " << std::chrono::duration_cast<std::chrono::microseconds>(task.finish - task.submit).count() / 1000.0 << " ms\n"; 32 | 33 | return 0; 34 | } -------------------------------------------------------------------------------- /src/example/rpc_client_cont.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/client/client.h" 2 | #include <thread> 3 | 4 | int main(int argc, char** argv) { 5 | if (argc != 5) { 6 | std::cerr << "Usage: " << std::string(argv[0]) << " model_dir model_name real_time sleep_time(ms)\n"; 7 | std::cerr << "Example: " << std::string(argv[0]) << " reef/resource/resnet18 resnet18 1 10\n"; 8 | return -1; 9 | } 10 | 11 | std::string model_dir(argv[1]); 12 | std::string model_name(argv[2]); 13 | int real_time = std::atoi(argv[3]); 14 | int sleep_time = std::atoi(argv[4]); 15 | 16 | 17 | reef::client::REEFClient client(DEFAULT_REEF_ADDR); 18 | ASSERT(client.init(real_time)); // whether this client sends real-time requests 19 | 20 | std::cout << "loading '" << model_name << "' from " << "'"<< model_dir << "'\n"; 21 | auto model = client.load_model(model_dir, model_name); 22 | ASSERT(model.get() != nullptr); 23 | 24 | // Get or set the input/output data. 25 | // auto input_blob = model->get_input_blob(); 26 | // model->load_input(); 27 | // auto output_blob = model->get_output_blob(); 28 | // auto output = model->get_output(); 29 | 30 | std::cout << "submit inference requests\n"; 31 | while (true) { 32 | auto task = model->infer(); // submit an inference request 33 | std::cout << "client " << model->get_mid() << " inference latency: " << std::chrono::duration_cast<std::chrono::microseconds>(task.finish - task.submit).count() / 1000.0 << " ms\n"; 34 | std::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); 35 | } 36 | 37 | return 0; 38 | } -------------------------------------------------------------------------------- /src/example/rpc_server.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/server/server.h" 2 | 3 | int main(int argc, char** argv) { 4 | reef::server::REEFServer server(DEFAULT_REEF_ADDR); 5 | server.run(); 6 | server.wait(); 7 | return 0; 8 | } -------------------------------------------------------------------------------- /src/reef/client/client.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/client/client.h" 2 | 3 | #include 4 | 5 | namespace reef { 6 | namespace client { 7 | 8 | REEFClient::REEFClient(const std::string &server_addr) : rpc_client(nullptr) { 9 | LOG(INFO) << "Create REEFClient to " << server_addr; 10 | 11 | rpc_client = REEFService::NewStub( 12 | grpc::CreateChannel(server_addr, grpc::InsecureChannelCredentials()) 13 | ); 14 | 15 | ASSERT_MSG(rpc_client.get() != nullptr, "cannot create rpc client"); 16 | LOG(INFO) << "Create REEFClient succeeds"; 17 | } 18 | 19 | bool REEFClient::init(bool real_time) { 20 | // set client (task queue) priority 21 | grpc::ClientContext ctx; 22 | reef::rpc::SetPriorityRequest request; 23 | reef::rpc::SetPriorityReply reply; 24 | request.set_rt(real_time); 25 | auto status = rpc_client->SetPriority(&ctx, request, &reply); 26 | ASSERT_MSG(status.ok(), status.error_message());
27 | ASSERT(reply.succ()); 28 | qid = reply.qid(); 29 | return true; 30 | } 31 | 32 | std::shared_ptr REEFClient::load_model( 33 | const std::string& model_dir, 34 | const std::string& name 35 | ) { 36 | grpc::ClientContext ctx; 37 | reef::rpc::LoadModelRequest request; 38 | reef::rpc::LoadModelReply reply; 39 | LOG(INFO) << "Loading model " << name; 40 | request.set_dir(model_dir); 41 | request.set_name(name); 42 | request.set_qid(qid); 43 | auto status = rpc_client->LoadModel(&ctx, request, &reply); 44 | ASSERT_MSG(status.ok(), status.error_message()); 45 | ASSERT(reply.succ()); 46 | std::shared_ptr model = 47 | std::make_shared( 48 | rpc_client, reply.mid(), model_dir, name 49 | ); 50 | { 51 | std::unique_lock lock(models_mtx); 52 | models.push_back(model); 53 | } 54 | return model; 55 | } 56 | 57 | ModelHandle::ModelHandle( 58 | const std::shared_ptr& _rpc_client, 59 | int32_t _mid, 60 | const std::string& _dir, 61 | const std::string& _name 62 | ) : rpc_client(_rpc_client), mid(_mid), dir(_dir), name(_name) { 63 | 64 | } 65 | 66 | // submit an inference task. wait for completion. 67 | TaskHandle ModelHandle::infer() { 68 | grpc::ClientContext ctx; 69 | reef::rpc::InferRequest request; 70 | reef::rpc::InferReply reply; 71 | request.set_mid(mid); 72 | TaskHandle t; 73 | t.submit = std::chrono::system_clock::now(); 74 | auto status = rpc_client->Infer(&ctx, request, &reply); 75 | ASSERT_MSG(status.ok(), status.error_message()); 76 | ASSERT(reply.succ()); 77 | t.finish = std::chrono::system_clock::now(); 78 | t.tid = reply.tid(); 79 | return t; 80 | } 81 | 82 | // submit an asynchronous inference task. 83 | TaskHandle ModelHandle::infer_async() { 84 | return TaskHandle(); 85 | } 86 | 87 | // get the poniter of input shared memory. 88 | std::shared_ptr ModelHandle::get_input_blob(const std::string& name) { 89 | if (input_blob.get() == nullptr) 90 | input_blob = register_blob(name, input_blob_key); 91 | return input_blob; 92 | } 93 | 94 | // get the poniter of output shared memory. 95 | std::shared_ptr ModelHandle::get_output_blob(const std::string& name) { 96 | if (output_blob.get() == nullptr) { 97 | output_blob = register_blob(name, output_blob_key); 98 | } 99 | return output_blob; 100 | } 101 | 102 | std::shared_ptr ModelHandle::register_blob(const std::string& name, std::string& key) { 103 | grpc::ClientContext ctx; 104 | reef::rpc::RegisterBlobRequest request; 105 | reef::rpc::RegisterBlobReply reply; 106 | 107 | request.set_mid(mid); 108 | request.set_name(name); 109 | auto status = rpc_client->RegisterBlob(&ctx, request, &reply); 110 | ASSERT_MSG(status.ok(), status.error_message()); 111 | ASSERT(reply.succ()); 112 | 113 | std::shared_ptr shm = 114 | std::make_shared(reply.key(), reply.size()); 115 | key = reply.key(); 116 | return shm; 117 | } 118 | 119 | // load model input in REEF server. wait for completion. 120 | void ModelHandle::load_input() { 121 | ASSERT(input_blob.get() != nullptr); 122 | grpc::ClientContext ctx; 123 | reef::rpc::SetBlobRequest request; 124 | reef::rpc::SetBlobReply reply; 125 | request.set_key(input_blob_key); 126 | auto status = rpc_client->SetBlob(&ctx, request, &reply); 127 | ASSERT_MSG(status.ok(), status.error_message()); 128 | ASSERT(reply.succ()); 129 | } 130 | 131 | // load model output in REEF server. wait for completion. 
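// (get_output() only issues a GetBlob RPC for the registered output key; the
// caller is then expected to read the bytes directly from the shared-memory
// blob returned by get_output_blob().)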
132 | void ModelHandle::get_output() { 133 | ASSERT(output_blob.get() != nullptr); 134 | grpc::ClientContext ctx; 135 | reef::rpc::GetBlobRequest request; 136 | reef::rpc::GetBlobReply reply; 137 | request.set_key(output_blob_key); 138 | auto status = rpc_client->GetBlob(&ctx, request, &reply); 139 | ASSERT_MSG(status.ok(), status.error_message()); 140 | ASSERT(reply.succ()); 141 | } 142 | 143 | int32_t ModelHandle::get_mid() const { 144 | return this->mid; 145 | } 146 | 147 | } // namespace client 148 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/client/client.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "reef/rpc/reef.grpc.pb.h" 4 | #include "reef/util/common.h" 5 | #include "reef/util/shared_memory.h" 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace reef { 14 | namespace client { 15 | 16 | using reef::rpc::REEFService; 17 | 18 | class TaskHandle { 19 | public: 20 | int32_t tid; 21 | std::chrono::system_clock::time_point submit, finish; 22 | }; 23 | 24 | // ModelHandle can be used to submit inference tasks. 25 | class ModelHandle { 26 | public: 27 | ModelHandle( 28 | const std::shared_ptr& rpc_client, 29 | int32_t _mid, 30 | const std::string& dir, 31 | const std::string& name 32 | ); 33 | // submit an inference task. wait for completion. 34 | TaskHandle infer(); 35 | 36 | // submit an asynchronous inference task. 37 | TaskHandle infer_async(); 38 | 39 | // get the pointer of input shared memory. 40 | std::shared_ptr get_input_blob(const std::string& name = "data"); 41 | 42 | // get the pointer of output shared memory. 43 | std::shared_ptr get_output_blob(const std::string& name = "output"); 44 | 45 | // load model input in REEF server. wait for completion. 46 | void load_input(); 47 | 48 | // load model output in REEF server. wait for completion. 49 | void get_output(); 50 | 51 | // TODO: FIXME 52 | void get_blob(); 53 | void set_blob(); 54 | 55 | int32_t get_mid() const; 56 | private: 57 | std::shared_ptr rpc_client; 58 | int32_t mid; 59 | std::string dir; 60 | std::string name; 61 | std::string input_blob_key, output_blob_key; 62 | std::shared_ptr input_blob; 63 | std::shared_ptr output_blob; 64 | 65 | private: 66 | std::shared_ptr register_blob(const std::string& name, std::string& key); 67 | 68 | }; 69 | 70 | 71 | // REEFClient is used to establish a connection to the REEF server 72 | // and load models into the server. 73 | class REEFClient { 74 | public: 75 | REEFClient(const std::string &server_addr); 76 | // initialize the client 77 | // Each client should be configured with a priority. 78 | // The real-time clients will share a RT task queue. 79 | // Each best-effort client will have its own BE task queue. 80 | bool init(bool real_time = false); 81 | 82 | // load a DNN model (in REEF server).
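    // The returned handle is also cached in `models` (under models_mtx) so it
    // stays alive for the lifetime of the client; the handle itself only wraps
    // the RPC stub, the server-side model id (mid) and the lazily registered
    // input/output blobs.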
83 | std::shared_ptr load_model( 84 | const std::string& model_dir, 85 | const std::string& name 86 | ); 87 | 88 | private: 89 | std::shared_ptr rpc_client; 90 | std::mutex models_mtx; 91 | std::vector> models; 92 | int32_t qid; 93 | }; 94 | 95 | 96 | } // namespace client 97 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/executor_base.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/executor/executor_base.h" 2 | #include 3 | 4 | namespace reef { 5 | namespace executor{ 6 | 7 | ExecutorBase::ExecutorBase() { 8 | 9 | } 10 | 11 | ExecutorBase::~ExecutorBase() { 12 | // TODO: free GPU memory 13 | } 14 | 15 | Status ExecutorBase::load_model_from_file( 16 | const char* json_file_path, 17 | const char* co_file_path) 18 | { 19 | GPU_RETURN_STATUS(GPUInit(0)); 20 | // CUcontext ctx; 21 | GPUDevice_t device; 22 | GPU_RETURN_STATUS(GPUDeviceGet(&device, 0)); 23 | // GPU_RETURN_STATUS(cuCtxCreate(&ctx, 0, device)); 24 | GPUModule_t mod; 25 | GPU_RETURN_STATUS(GPUModuleLoad(&mod, co_file_path)); 26 | return this->load_model_from_GPU_module(json_file_path, mod); 27 | } 28 | 29 | Status ExecutorBase::load_model_from_GPU_module( 30 | const char* json_file_path, GPUModule_t mod) { 31 | return init_executor_base(json_file_path, mod); 32 | } 33 | 34 | 35 | Status ExecutorBase::init_executor_base( 36 | const char* json_file_path, 37 | GPUModule_t mod) 38 | { 39 | base_mod = mod; 40 | 41 | // 1. load json model file 42 | model.reset(Model::from_json(json_file_path)); 43 | if (model.get() == nullptr) RETURN_STATUS(Status::NotFound); 44 | 45 | // 2. load hip kernels 46 | for (KernelInfo &kernel_info : model->kernels) { 47 | GPUFunction_t kernel; 48 | GPU_RETURN_STATUS( 49 | GPUModuleGetFunction(&kernel, mod, kernel_info.name.c_str()) 50 | ); 51 | kernels.emplace(kernel_info.name, kernel); 52 | } 53 | 54 | // 3. allocate device storage 55 | for (StorageInfo &storage_info : model->storage) { 56 | size_t stype_size = Model::get_stype_size(storage_info.stype); 57 | size_t storage_size = stype_size * storage_info.size; 58 | GPUDevicePtr_t device_ptr; 59 | std::vector temp; 60 | temp.resize(storage_size, 0); 61 | GPU_RETURN_STATUS(GPUMalloc((GPUDevicePtr_t*)&device_ptr, storage_size)); 62 | GPU_RETURN_STATUS(GPUMemcpyHtoD(device_ptr, temp.data(), storage_size)); 63 | storage.push_back(device_ptr); 64 | } 65 | 66 | // 4. 
map args to storage 67 | raw_args.reserve(model->kernels.size()); 68 | for (KernelInfo &kernel_info : model->kernels) { 69 | std::vector kernel_arg; 70 | for (size_t arg_idx : kernel_info.args) { 71 | // assert(arg_idx < storage.size()); 72 | kernel_arg.push_back(&storage[arg_idx]); 73 | } 74 | raw_args.push_back(kernel_arg); 75 | } 76 | 77 | LOG(INFO) << "create base model stream"; 78 | GPU_RETURN_STATUS(hipStreamCreateWithWindowSize(&s, 16)); 79 | return Status::Succ; 80 | } 81 | 82 | Status ExecutorBase::load_param_from_file( 83 | const char* param_file_path) { 84 | std::unique_ptr params(ModelParamParser::parse_from_file(param_file_path)); 85 | for (size_t i = 0; i < storage.size(); i++) { 86 | StorageInfo& storage_info = this->model->storage[i]; 87 | if (params->find(storage_info.name) == params->end()) 88 | continue; 89 | auto &array = params->at(storage_info.name); 90 | GPU_RETURN_STATUS(GPUMemcpyHtoD( 91 | (GPUDevicePtr_t)storage[i], array.data(), 92 | array.size() * sizeof(float))); 93 | } 94 | return Status::Succ; 95 | } 96 | 97 | Status ExecutorBase::get_data_size(const std::string& key, size_t &size) { 98 | size_t input_storage_idx; 99 | if (find_storage_idx(key, input_storage_idx) != Status::Succ) RETURN_STATUS(Status::NotFound); 100 | StorageInfo& storage_info = this->model->storage[input_storage_idx]; 101 | size = Model::get_stype_size(storage_info.stype) * storage_info.size; 102 | return Status::Succ; 103 | } 104 | 105 | Status ExecutorBase::set_input( 106 | const std::string& key, const std::vector& value) { 107 | return set_input(key, (void*)value.data(), value.size() * sizeof(float)); 108 | } 109 | 110 | Status ExecutorBase::set_input(const std::string& key, const void* value, size_t len) { 111 | size_t input_storage_idx; 112 | if (find_storage_idx(key, input_storage_idx) != Status::Succ) RETURN_STATUS(Status::NotFound); 113 | StorageInfo& storage_info = this->model->storage[input_storage_idx]; 114 | size_t storage_size = Model::get_stype_size(storage_info.stype) * storage_info.size; 115 | if (len < storage_size) RETURN_STATUS(Status::OutOfRange); 116 | GPU_RETURN_STATUS(GPUMemcpyHtoD( 117 | (GPUDevicePtr_t)this->storage[input_storage_idx], (void*)value, 118 | storage_size) 119 | ); 120 | return Status::Succ; 121 | } 122 | 123 | Status ExecutorBase::set_input(int idx, const void* value, size_t len) { 124 | if (idx >= storage.size()) RETURN_STATUS(Status::OutOfRange); 125 | StorageInfo& storage_info = this->model->storage[idx]; 126 | size_t storage_size = Model::get_stype_size(storage_info.stype) * storage_info.size; 127 | if (len < storage_size) RETURN_STATUS(Status::OutOfRange); 128 | GPU_RETURN_STATUS(GPUMemcpyHtoD( 129 | (GPUDevicePtr_t)this->storage[idx], (void*)value, 130 | storage_size) 131 | ); 132 | return Status::Succ; 133 | } 134 | 135 | Status ExecutorBase::get_output(std::vector& out) { 136 | size_t input_storage_idx; 137 | if (find_storage_idx("output", input_storage_idx) != Status::Succ) RETURN_STATUS(Status::NotFound); 138 | StorageInfo& storage_info = this->model->storage[input_storage_idx]; 139 | if (Model::get_stype_size(storage_info.stype) != sizeof(float)) RETURN_STATUS(Status::Fail); 140 | out.resize(storage_info.size); 141 | return get_data(input_storage_idx, (void*)out.data(), storage_info.size * sizeof(float)); 142 | } 143 | 144 | Status ExecutorBase::get_output(void* out, size_t len) { 145 | size_t input_storage_idx; 146 | if (find_storage_idx("output", input_storage_idx) != Status::Succ) RETURN_STATUS(Status::NotFound); 147 | StorageInfo& 
storage_info = this->model->storage[input_storage_idx]; 148 | size_t storage_size = Model::get_stype_size(storage_info.stype) * storage_info.size; 149 | if (len < storage_size) RETURN_STATUS(Status::Fail); 150 | return get_data(input_storage_idx, out, len); 151 | } 152 | 153 | Status ExecutorBase::get_data(int idx, void* out, size_t len) { 154 | if (idx >= this->storage.size()) RETURN_STATUS(Status::OutOfRange); 155 | StorageInfo& storage_info = this->model->storage[idx]; 156 | size_t storage_size = Model::get_stype_size(storage_info.stype) * storage_info.size; 157 | if (len < storage_size) RETURN_STATUS(Status::Fail); 158 | GPU_RETURN_STATUS(GPUMemcpyDtoH( 159 | out, (GPUDevicePtr_t)this->storage[idx], storage_size 160 | )); 161 | return Status::Succ; 162 | } 163 | 164 | Status ExecutorBase::find_storage_idx(const std::string& name, size_t& idx) { 165 | // TODO: O(n) -> O(1) 166 | for (size_t i = 0; i < this->storage.size(); i++) { 167 | StorageInfo& storage_info = this->model->storage[i]; 168 | if (storage_info.name == name) { 169 | idx = i; 170 | return Status::Succ; 171 | } 172 | } 173 | RETURN_STATUS(Status::NotFound); 174 | return Status::NotFound; // otherwise, the compiler thinks no return value. 175 | } 176 | 177 | size_t ExecutorBase::num_kernels() const { 178 | return model->kernels.size(); 179 | } 180 | 181 | 182 | void ExecutorBase::set_stream(GPUStream_t stream) { 183 | s = stream; 184 | } 185 | 186 | 187 | GPUStream_t ExecutorBase::stream() const { 188 | return s; 189 | } 190 | 191 | Status ExecutorBase::execute(GPUStream_t stream) { 192 | execute_to(num_kernels()); 193 | return Status::Succ; 194 | } 195 | 196 | Status ExecutorBase::execute_to(int idx, GPUStream_t stream) { 197 | for (int i = 0; i < idx; i++) { 198 | RETURN_STATUS(launch_kernel(i, stream)); 199 | } 200 | GPU_RETURN_STATUS(GPUStreamSynchronize(stream)); 201 | return Status::Succ; 202 | } 203 | 204 | Status ExecutorBase::execute_kernel(int idx, GPUStream_t stream) { 205 | if (idx >= num_kernels()) RETURN_STATUS(Status::OutOfRange); 206 | RETURN_STATUS(launch_kernel(idx, stream)); 207 | GPU_RETURN_STATUS(GPUStreamSynchronize(stream)); 208 | return Status::Succ; 209 | } 210 | 211 | Status ExecutorBase::launch_kernel(int kernel_offset, GPUStream_t stream) { 212 | int i = kernel_offset; 213 | std::string& func_name = this->model->kernels[i].name; 214 | GPUFunction_t func = this->kernels[func_name]; 215 | uint32_t *launch_params = this->model->kernels[i].launch_params; 216 | // std::cout << func_name << std::endl; 217 | GPU_RETURN_STATUS(GPUModuleLaunchKernel(func, 218 | launch_params[0], launch_params[1], launch_params[2], 219 | launch_params[3], launch_params[4], launch_params[5], 220 | 0, stream, (void **)this->raw_args[i].data(), 0 221 | )); 222 | return Status::Succ; 223 | } 224 | 225 | } // namespace executor 226 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/executor_base.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "reef/executor/model.h" 3 | #include "reef/util/common.h" 4 | 5 | 6 | #ifdef __REEF_HIP_GPU__ 7 | #include "reef/executor/hip/hip_impl.h" 8 | #endif 9 | #ifdef __REEF_CUDA_GPU__ 10 | #include "reef/executor/cuda/cuda_impl.h" 11 | #endif 12 | 13 | namespace reef { 14 | namespace executor { 15 | 16 | class ExecutorBase { 17 | public: 18 | ExecutorBase(); 19 | virtual ~ExecutorBase(); 20 | 21 | Status load_model_from_file(const char* json_file_path, const 
char* co_file_path); 22 | 23 | virtual Status load_model_from_GPU_module(const char* json_file_path, GPUModule_t module); 24 | 25 | Status load_param_from_file(const char* param_file_path); 26 | 27 | Status set_input(const std::string& key, const std::vector& value); 28 | 29 | Status set_input(int idx, const void* value, size_t len); 30 | 31 | Status set_input(const std::string& key, const void* value, size_t len); 32 | 33 | Status get_data_size(const std::string& key, size_t &size); 34 | 35 | Status get_output(std::vector& out); 36 | 37 | Status get_output(void* out, size_t len); 38 | 39 | Status get_data(int idx, void* out, size_t len); 40 | 41 | Status execute(GPUStream_t stream = GPUStreamDefault); 42 | 43 | Status execute_to(int idx, GPUStream_t stream = GPUStreamDefault); 44 | 45 | Status execute_kernel(int idx, GPUStream_t stream = GPUStreamDefault); 46 | 47 | size_t num_kernels() const; 48 | 49 | void set_stream(GPUStream_t stream); 50 | 51 | GPUStream_t stream() const; 52 | 53 | std::unique_ptr model; 54 | protected: 55 | Status init_executor_base(const char* json_file_path, GPUModule_t module); 56 | 57 | virtual Status launch_kernel(int kernel_offset, GPUStream_t stream); 58 | 59 | Status find_storage_idx(const std::string& name, size_t &idx); 60 | 61 | protected: 62 | 63 | std::vector storage; 64 | std::unordered_map kernels; 65 | std::vector> raw_args; 66 | 67 | GPUModule_t base_mod; 68 | GPUStream_t s; 69 | }; 70 | 71 | 72 | } // namespace executor 73 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/hip/hip_impl.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/executor/hip/hip_impl.h" 2 | #include "reef/util/common.h" 3 | 4 | #include 5 | 6 | namespace reef { 7 | namespace executor { 8 | 9 | uint32_t GPUConfig::get_num_cus() { 10 | // TODO: dynamic load CU nums 11 | return 60; 12 | } 13 | 14 | 15 | 16 | Status GPUConfig::get_kernel_address(const char* name, GPUModule_t mod, GPUFunctionPtr_t& ret) { 17 | hipFunction_t temp; 18 | GPU_RETURN_STATUS(hipModuleGetFunction(&temp, mod, name)); 19 | hipFunctionWGInfo_t wgInfo; 20 | GPU_RETURN_STATUS(hipFuncGetWGInfo(temp, &wgInfo)); 21 | hipDeviceptr_t temp_buf; 22 | GPU_RETURN_STATUS(hipMalloc(&temp_buf, 64)); 23 | int buf[24]; 24 | int size = 24; 25 | 26 | GPU_RETURN_STATUS(hipMemcpyDtoD(temp_buf, (hipDeviceptr_t)wgInfo.baseAddress, size)); 27 | GPU_RETURN_STATUS(hipMemcpy(buf, temp_buf, size, hipMemcpyDeviceToHost)); 28 | GPU_RETURN_STATUS(hipFree(temp_buf)); 29 | 30 | ret = wgInfo.baseAddress + *(long long int*)(&buf[4]); 31 | return Status::Succ; 32 | } 33 | 34 | Status GPUConfig::get_kernel_resource(GPUFunction_t func, KernelResource& ret) { 35 | hipFunctionWGInfo_t wg_info; 36 | GPU_RETURN_STATUS(hipFuncGetWGInfo(func, &wg_info)); 37 | ret.shared_memory = wg_info.usedLDSSize_; 38 | ret.vgprs = wg_info.usedVGPRs_; 39 | ret.sgprs = wg_info.usedSGPRs_; 40 | ret.stack_size = wg_info.privateMemSize_; 41 | return Status::Succ; 42 | } 43 | 44 | 45 | GPUConfig::KernelResource GPUConfig::max_resource( 46 | const KernelResource& kr1, const KernelResource& kr2) { 47 | KernelResource ret; 48 | ret.sgprs = std::max(kr1.sgprs, kr2.sgprs); 49 | ret.vgprs = std::max(kr1.vgprs, kr2.vgprs); 50 | ret.shared_memory = std::max(kr1.shared_memory, kr2.shared_memory); 51 | ret.stack_size = std::max(kr1.stack_size, kr2.stack_size); 52 | return ret; 53 | } 54 | 55 | int GPUConfig::calculate_occupancy(const KernelResource& 
resource, dim3 block_dim) { 56 | int vgprs = align_up(resource.vgprs, 4); 57 | int sgprs = align_up(resource.sgprs, 8); 58 | int shared_mem = align_up(resource.shared_memory, 256); 59 | int block_size = (int)align_up(block_dim.x * block_dim.y * block_dim.z, 64); 60 | 61 | int max_gpr_waves = (16 * 1024 / (vgprs * 64)) * 4; 62 | max_gpr_waves = std::min(max_gpr_waves, (800 / sgprs) * 4); 63 | max_gpr_waves = std::min(max_gpr_waves, 40); 64 | 65 | int max_gpr_blocks = max_gpr_waves * 64 / block_size; 66 | int max_shared_mem_blocks = 64 * 1024 / block_size; 67 | 68 | int max_thread_blocks = 2048 / block_size; 69 | 70 | int occupancy = std::min(max_gpr_blocks, max_shared_mem_blocks); 71 | occupancy = std::min(occupancy, max_thread_blocks); 72 | 73 | return occupancy; 74 | } 75 | 76 | 77 | bool GPUStreamEmpty(GPUStream_t s) { 78 | hipError_t res = hipStreamQuery(s); 79 | return hipSuccess == res; 80 | } 81 | 82 | 83 | } // namespace executor 84 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/hip/hip_impl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "reef/util/common.h" 4 | 5 | #define GPUInit hipInit 6 | #define GPUDeviceGet hipDeviceGet 7 | #define GPUModuleLoad hipModuleLoad 8 | #define GPUModuleGetFunction hipModuleGetFunction 9 | #define GPUMalloc hipMalloc 10 | #define GPUMemcpyHtoD hipMemcpyHtoD 11 | #define GPUMemcpyDtoH hipMemcpyDtoH 12 | #define GPUModuleLaunchKernel hipModuleLaunchKernel 13 | #define GPUStreamDefault hipStreamDefault 14 | #define GPUStreamSynchronize hipStreamSynchronize 15 | #define GPUDeviceSynchronize hipDeviceSynchronize 16 | #define GPUStreamCreate hipStreamCreate 17 | #define GPUStreamQuery hipStreamQuery 18 | #define GPUStatusOK hipSuccess 19 | #define GPUFree hipFree 20 | #define GPUWriteValue32Async hipStreamWriteValue32 21 | #define GPUClearHostQueue hipStreamClearQueue 22 | #define GPUResetCU hipResetWavefronts 23 | #define GPUMemset hipMemset 24 | 25 | #define GPU_RETURN_STATUS(cmd) \ 26 | {\ 27 | hipError_t error = cmd;\ 28 | if (error != hipSuccess) {\ 29 | LOG(ERROR) << "hip error: " << hipGetErrorString(error) << "at " << __FILE__ << ":" << __LINE__; \ 30 | return Status::Fail;\ 31 | }\ 32 | } 33 | 34 | #define ASSERT_GPU_ERROR(cmd) \ 35 | {\ 36 | hipError_t error = cmd;\ 37 | if (error != hipSuccess) {\ 38 | LOG(ERROR) << "hip error: " << hipGetErrorString(error) << "at " << __FILE__ << ":" << __LINE__; \ 39 | exit(EXIT_FAILURE);\ 40 | }\ 41 | } 42 | 43 | 44 | namespace reef { 45 | namespace executor { 46 | 47 | typedef hipDeviceptr_t GPUDevicePtr_t; 48 | typedef hipFunction_t GPUFunction_t; 49 | typedef hipDevice_t GPUDevice_t; 50 | typedef hipModule_t GPUModule_t; 51 | typedef hipError_t GPUError_t; 52 | typedef hipStream_t GPUStream_t; 53 | 54 | typedef unsigned long long int GPUFunctionPtr_t; 55 | 56 | bool GPUStreamEmpty(GPUStream_t s); 57 | 58 | class GPUConfig { 59 | public: 60 | static uint32_t get_num_cus(); 61 | static Status get_kernel_address(const char* name, GPUModule_t mod, GPUFunctionPtr_t& ret); 62 | 63 | struct KernelResource { 64 | int shared_memory; 65 | int vgprs; 66 | int sgprs; 67 | int stack_size; 68 | }; 69 | 70 | static KernelResource max_resource(const KernelResource& kr1, const KernelResource& kr2); 71 | 72 | static Status get_kernel_resource(GPUFunction_t func, KernelResource& ret); 73 | 74 | static int calculate_occupancy(const KernelResource& resource, dim3 
thread_idx); 75 | }; 76 | 77 | } // namespace executor 78 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/hybrid_executor.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/executor/hybrid_executor.h" 2 | 3 | namespace reef { 4 | namespace executor { 5 | 6 | HybridExecutor::HybridExecutor() { 7 | 8 | } 9 | 10 | HybridExecutor::~HybridExecutor() { 11 | 12 | } 13 | 14 | Status HybridExecutor::load_hybrid_model_from_file( 15 | const char* json_file_path, 16 | const char* profile_file_path, 17 | const char* trans_co_file_path, 18 | const char* preempt_co_file_path) 19 | { 20 | GPUModule_t trans_module, preempt_module; 21 | LOG(INFO) << std::string(trans_co_file_path); 22 | GPU_RETURN_STATUS(GPUModuleLoad(&trans_module, trans_co_file_path)); 23 | GPU_RETURN_STATUS(GPUModuleLoad(&preempt_module, preempt_co_file_path)); 24 | 25 | return load_hybrid_model_from_GPU_module( 26 | json_file_path, 27 | profile_file_path, 28 | trans_module, 29 | preempt_module 30 | ); 31 | } 32 | 33 | Status HybridExecutor::load_hybrid_model_from_GPU_module( 34 | const char* json_file_path, 35 | const char* profile_file_path, 36 | GPUModule_t trans_module, 37 | GPUModule_t preempt_module) 38 | { 39 | // 1. Init transformed module 40 | Status ret = load_model_from_GPU_module(json_file_path, trans_module); 41 | if (ret != Status::Succ) return ret; 42 | 43 | // 2. Init preemptable module 44 | return init_hybrid_executor( 45 | json_file_path, 46 | profile_file_path, 47 | trans_module, 48 | preempt_module 49 | ); 50 | } 51 | 52 | Status HybridExecutor::init_hybrid_executor( 53 | const char* json_file_path, 54 | const char* profile_file_path, 55 | GPUModule_t trans_module, 56 | GPUModule_t preempt_module) 57 | { 58 | // TODO: load profile 59 | preempt_mod = preempt_module; 60 | trans_mod = trans_module; 61 | 62 | // 1. load preemptable kernels 63 | size_t num_kernels = model->kernels.size(); 64 | preempt_kernels.resize(num_kernels); 65 | for (size_t i = 0; i < num_kernels; i++) { 66 | GPU_RETURN_STATUS(GPUModuleGetFunction( 67 | &preempt_kernels[i], preempt_mod, model->kernels[i].name.c_str() 68 | )); 69 | } 70 | 71 | // 2. allocate preempt flag 72 | GPU_RETURN_STATUS(GPUMalloc((GPUDevicePtr_t*)&preempt_flag, 4)); 73 | int value = 0; 74 | GPU_RETURN_STATUS(GPUMemcpyHtoD(preempt_flag, &value, 4)); 75 | // TODO: remove this 76 | GPU_RETURN_STATUS(GPUMalloc((GPUDevicePtr_t*)&task_slot_base, num_kernels*4)); 77 | GPU_RETURN_STATUS(GPUMemset(task_slot_base, 0, num_kernels*4)); 78 | 79 | // 3. prepare preemptable kernel args 80 | preempt_args.resize(num_kernels); 81 | for (int i = 0; i < num_kernels; i++) { 82 | auto &kernel_args = preempt_args[i]; 83 | kernel_args.push_back(&preempt_flag); 84 | kernel_args.push_back(&task_slot_base); 85 | auto &origin_args = raw_args[i]; 86 | for (int j = 0; j < origin_args.size(); j++) { 87 | kernel_args.push_back(origin_args[j]); 88 | } 89 | } 90 | 91 | // 4. load profiles 92 | model_profile.reset(ModelProfile::from_json(profile_file_path)); 93 | for (int i = 0; i < num_kernels; i++) { 94 | auto& kernel_arg = trans_args[i]; 95 | kernel_arg.profile = model_profile->kernel_latency[model->kernels[i].name]; 96 | } 97 | 98 | // 5. 
prepare task slots for wait-based preemption 99 | GPU_RETURN_STATUS(hipHostMalloc((void**)&task_slots_host, num_kernels * sizeof(int), hipHostMallocDefault)); 100 | GPU_RETURN_STATUS(hipHostMalloc((void**)&task_slots_host_empty, num_kernels * sizeof(int), hipHostMallocDefault)); 101 | memset(task_slots_host, 0, num_kernels * sizeof(int)); 102 | memset(task_slots_host_empty, 0, num_kernels * sizeof(int)); 103 | return Status::Succ; 104 | } 105 | 106 | 107 | Status HybridExecutor::set_preempt_flag(GPUDevicePtr_t flag) { 108 | GPU_RETURN_STATUS(GPUFree(preempt_flag)); // TODO: avoid double free 109 | preempt_flag = flag; 110 | return Status::Succ; 111 | } 112 | 113 | Status HybridExecutor::execute_preemptale(GPUStream_t stream) { 114 | for (int i = 0; i < this->model->kernels.size(); i++) { 115 | Status ret = launch_preempt_kernel(i, stream); 116 | if (ret != Status::Succ) return ret; 117 | } 118 | GPU_RETURN_STATUS(GPUStreamSynchronize(stream)); 119 | return Status::Succ; 120 | } 121 | 122 | Status HybridExecutor::reset_task_slot_async(int kernel_offset, GPUStream_t stream) { 123 | GPU_RETURN_STATUS(GPUWriteValue32Async( 124 | stream, (GPUDevicePtr_t)((char*)task_slot_base + kernel_offset * 4), 0, 0 125 | )); 126 | return Status::Succ; 127 | } 128 | 129 | Status HybridExecutor::get_reset_kernel_idx(int start_inx, int& ret) { 130 | return Status::Succ; // TODO: 131 | } 132 | 133 | void HybridExecutor::reset_task_slots(hipStream_t stream) { 134 | ASSERT_GPU_ERROR(hipMemcpyHtoDAsync(task_slot_base, task_slots_host_empty, 4 * this->num_kernels(), stream)); 135 | } 136 | 137 | void HybridExecutor::copy_be_kernel_offset( hipStream_t stream) { 138 | ASSERT_GPU_ERROR(hipMemcpyDtoHAsync(task_slots_host, task_slot_base, 4 * this->num_kernels(), stream)); 139 | } 140 | 141 | int HybridExecutor::get_be_kernel_offset(int begin) { 142 | // TODO: binary search 143 | for (int i = begin; i < this->num_kernels(); i++) { 144 | int finished_num = task_slots_host[i]; 145 | int required_num = trans_args[i].block_num; 146 | 147 | if (finished_num < required_num) return i; 148 | } 149 | // for (int i = begin; i > 0) 150 | return this->num_kernels(); 151 | } 152 | 153 | Status HybridExecutor::launch_preempt_kernel(int kernel_offset, GPUStream_t stream) { 154 | KernelArg &kernel_arg = trans_args[kernel_offset]; 155 | GPUFunction_t func = preempt_kernels[kernel_offset]; 156 | // LOG(INFO) << "launch " << kernel_offset; 157 | GPUDevicePtr_t task_slot = (GPUDevicePtr_t)((char*)task_slot_base + kernel_offset * 4); 158 | this->preempt_args[kernel_offset][1] = &task_slot; // TODO: 159 | GPU_RETURN_STATUS(GPUModuleLaunchKernel(func, 160 | kernel_arg.task_dim.x, kernel_arg.task_dim.y, kernel_arg.task_dim.z, 161 | kernel_arg.thread_dim.x, kernel_arg.thread_dim.y, kernel_arg.thread_dim.z, 162 | 0, stream, (void**)(this->preempt_args[kernel_offset].data()), 0 163 | )); 164 | return Status::Succ; 165 | } 166 | 167 | } // namespace executor 168 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/hybrid_executor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "reef/executor/trans_executor.h" 3 | 4 | 5 | namespace reef { 6 | namespace server { 7 | class REEFScheduler; 8 | } // namespace server 9 | namespace executor { 10 | 11 | // HybridExecutor contains two version of the model 12 | // (1) transformed version, which is used to perform dynamic kernel padding 13 | // (2) preemptable version, which 
is used to perform reset-based preemption (for best-effort tasks). 14 | // 15 | // The transformed version is inherited from TransExecutor. 16 | // 17 | // The preemptable version adds preemption flag based on the raw model. 18 | class HybridExecutor : public TransExecutor { 19 | 20 | friend class server::REEFScheduler; 21 | 22 | public: 23 | HybridExecutor(); 24 | virtual ~HybridExecutor(); 25 | Status load_hybrid_model_from_file( 26 | const char* json_file_path, 27 | const char* profile_file_path, 28 | const char* trans_co_file_path, 29 | const char* preempt_co_file_path); 30 | 31 | Status load_hybrid_model_from_GPU_module( 32 | const char* json_file_path, 33 | const char* profile_file_path, 34 | GPUModule_t trans_module, 35 | GPUModule_t preempt_module 36 | ); 37 | 38 | Status execute_preemptale(GPUStream_t stream = GPUStreamDefault); 39 | 40 | Status set_preempt_flag(GPUDevicePtr_t flag); 41 | 42 | Status reset_task_slot_async(int kernel_offset, GPUStream_t stream); 43 | 44 | Status get_reset_kernel_idx(int start_inx, int& ret); 45 | 46 | void reset_task_slots(hipStream_t stream); 47 | 48 | void copy_be_kernel_offset(hipStream_t stream); 49 | 50 | int get_be_kernel_offset(int begin); 51 | protected: 52 | Status init_hybrid_executor( 53 | const char* json_file_path, 54 | const char* profile_file_path, 55 | GPUModule_t trans_module, 56 | GPUModule_t preempt_module 57 | ); 58 | 59 | Status launch_preempt_kernel(int kernel_offset, GPUStream_t stream); 60 | 61 | protected: 62 | GPUModule_t preempt_mod; 63 | GPUModule_t trans_mod; 64 | 65 | std::vector preempt_kernels; 66 | std::vector> preempt_args; 67 | GPUDevicePtr_t preempt_flag; 68 | GPUDevicePtr_t task_slot_base; // TODO: remove this 69 | 70 | int* task_slots_host_empty; 71 | int* task_slots_host; 72 | 73 | std::shared_ptr model_profile; 74 | }; 75 | 76 | } // namespace executor 77 | } // namespace reef 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /src/reef/executor/model.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "reef/executor/model.h" 5 | #include "reef/util/json.h" 6 | 7 | namespace reef { 8 | namespace executor { 9 | 10 | using reef::util::JsonObject; 11 | using reef::util::JsonParser; 12 | 13 | Model* Model::from_json(const char* json_file) { 14 | std::ifstream fs(json_file); 15 | std::string tmp, str = ""; 16 | 17 | while (getline(fs, tmp)) str += tmp; 18 | fs.close(); 19 | 20 | JsonObject* jobj = JsonParser::parse(str); 21 | 22 | Model* m = new Model; 23 | for (auto sinfo : jobj->mval["storage"]->lval) { 24 | m->storage.push_back(StorageInfo{ 25 | sinfo->mval["name"]->sval, 26 | sinfo->mval["size"]->ival, 27 | sinfo->mval["stype"]->sval 28 | }); 29 | } 30 | 31 | for (auto kinfo : jobj->mval["kernels"]->lval) { 32 | KernelInfo k; 33 | 34 | k.name = kinfo->mval["name"]->sval; 35 | for (auto arg : kinfo->mval["args"]->lval) 36 | k.args.push_back(arg->ival); 37 | 38 | assert(kinfo->mval["launch_params"]->lval.size() == 6); 39 | for (int i = 0; i < 6; i++) 40 | k.launch_params[i] = kinfo->mval["launch_params"]->lval[i]->ival; 41 | 42 | m->kernels.push_back(k); 43 | } 44 | 45 | for (auto arg : jobj->mval["args"]->lval) { 46 | m->args.push_back(arg->ival); 47 | } 48 | 49 | for (auto shared_memory : jobj->mval["shared_memory"]->mval) { 50 | m->shared_memory[shared_memory.first] = shared_memory.second->ival; 51 | } 52 | delete jobj; 53 | 54 | return m; 55 | } 56 | 57 | ModelProfile* 
ModelProfile::from_json(const char* json_file) { 58 | std::ifstream fs(json_file); 59 | std::string tmp, str = ""; 60 | 61 | while (getline(fs, tmp)) str += tmp; 62 | fs.close(); 63 | 64 | JsonObject* jobj = JsonParser::parse(str); 65 | ModelProfile* model_profile = new ModelProfile; 66 | model_profile->model_latency = jobj->mval["model_latency"]->ival; 67 | 68 | for (auto &pair : jobj->mval["kernel_latency"]->mval) { 69 | const std::string& kernel_name = pair.first; 70 | KernelProfile profile; 71 | auto kernel_profile = pair.second->mval; 72 | profile.total_latency = kernel_profile["total_latency"]->ival; 73 | for (auto value : kernel_profile["latency"]->lval) { 74 | profile.latency.push_back(value->ival); 75 | } 76 | model_profile->kernel_latency.insert({kernel_name, profile}); 77 | } 78 | delete jobj; 79 | return model_profile; 80 | } 81 | 82 | #define PARAM_MAGIC "TVM_MODEL_PARAMS" 83 | 84 | ModelParam* ModelParamParser::parse_from_file(const char* param_file) { 85 | FILE* fp; 86 | fp = fopen(param_file, "rb"); 87 | char magic[sizeof(PARAM_MAGIC)]; 88 | size_t res = fread(magic, sizeof(char), sizeof(PARAM_MAGIC), fp); 89 | assert(res == sizeof(PARAM_MAGIC)); 90 | assert(std::string(magic) == PARAM_MAGIC); 91 | 92 | uint64_t params_size; 93 | res = fread(¶ms_size, sizeof(uint64_t), 1, fp); 94 | assert(res == 1); 95 | assert(params_size != 0); 96 | 97 | ModelParam* params = new ModelParam(params_size); 98 | for (uint64_t i = 0; i < params_size; i++) { 99 | char key_buf[256]; 100 | uint64_t key_len = 0; 101 | while(true) { 102 | char c; 103 | res = fread(&c, sizeof(char), 1, fp); 104 | assert(res == 1); 105 | key_buf[key_len] = c; 106 | key_len++; 107 | if (c == '\0') break; 108 | } 109 | std::string key(key_buf); 110 | uint64_t array_size; 111 | res = fread(&array_size, sizeof(uint64_t), 1, fp); 112 | assert(res == 1); 113 | assert(array_size != 0); 114 | std::vector array(array_size); 115 | array.resize(array_size); 116 | res = fread(array.data(), sizeof(float), array_size, fp); 117 | assert(res == array_size); 118 | params->insert({key, array}); 119 | } 120 | return params; 121 | } 122 | 123 | std::string ModelProfile::to_json() { 124 | std::ostringstream ss; 125 | 126 | ss << "{\"model_latency\":" << model_latency << ",\"kernel_latency\":{"; 127 | size_t i = 0; 128 | for (auto pair : this->kernel_latency) { 129 | ss << "\"" << pair.first << "\":{\"total_latency\":" << pair.second.total_latency << ", \"latency\":[" ; 130 | 131 | size_t j = 0; 132 | for (auto latency : pair.second.latency) { 133 | ss << latency; 134 | j++; 135 | if (j != pair.second.latency.size()) { 136 | ss << ","; 137 | } 138 | } 139 | ss << "]}"; 140 | i++; 141 | if (i != this->kernel_latency.size()) ss << ","; 142 | } 143 | ss << "}}"; 144 | return ss.str(); 145 | } 146 | 147 | size_t Model::get_stype_size(std::string &stype) { 148 | if (stype == "float32") return 4; 149 | if (stype == "int64") return 8; 150 | if (stype == "byte") return 1; 151 | if (stype == "uint1") return 1; 152 | if (stype == "int32") return 4; 153 | std::cout << stype << " is undefined" << std::endl; 154 | assert(false); 155 | return 0; 156 | } 157 | 158 | } // namespace executor 159 | } // namepsace reef -------------------------------------------------------------------------------- /src/reef/executor/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace reef { 12 | namespace 
executor { 13 | 14 | class StorageInfo { 15 | public: 16 | std::string name; 17 | size_t size; 18 | std::string stype; 19 | }; 20 | 21 | class KernelInfo { 22 | public: 23 | std::string name; 24 | uint32_t launch_params[6]; 25 | std::vector args; 26 | }; 27 | 28 | class Model { 29 | public: 30 | std::vector<StorageInfo> storage; 31 | std::vector<KernelInfo> kernels; 32 | std::vector args; 33 | std::unordered_map shared_memory; 34 | 35 | public: 36 | static Model* from_json(const char* json_file); 37 | static size_t get_stype_size(std::string &stype); 38 | }; 39 | 40 | class KernelProfile { 41 | public: 42 | std::vector latency; // microsecond 43 | int total_latency; 44 | int estimated_latency(int occupancy, int task_num_per_block); 45 | }; 46 | 47 | class ModelProfile { 48 | public: 49 | int model_latency; 50 | std::unordered_map<std::string, KernelProfile> kernel_latency; 51 | std::string to_json(); 52 | static ModelProfile* from_json(const char* json_file); 53 | }; 54 | 55 | typedef std::unordered_map<std::string, std::vector<float>> ModelParam; 56 | 57 | class ModelParamParser { 58 | public: 59 | static ModelParam* parse_from_file(const char* param_file); 60 | }; 61 | 62 | 63 | 64 | } // namespace executor 65 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/trans_executor.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/executor/trans_executor.h" 2 | 3 | namespace reef { 4 | namespace executor { 5 | 6 | TransExecutor::TransExecutor() {} 7 | TransExecutor::~TransExecutor() {} 8 | 9 | Status TransExecutor::load_model_from_GPU_module(const char* json_file_path, GPUModule_t module) { 10 | Status ret = init_executor_base(json_file_path, module); 11 | if (ret != Status::Succ) return ret; 12 | return init_rt_executor(json_file_path, module); 13 | } 14 | 15 | 16 | Status TransExecutor::init_rt_executor(const char* json_file_path, GPUModule_t module) { 17 | size_t num_kernel_calls = model->kernels.size(); 18 | int num_cus = GPUConfig::get_num_cus(); 19 | trans_args.resize(num_kernel_calls); 20 | 21 | bool need_load_kernels = true; // TODO: move to class config 22 | 23 | // 1. fill in the trans_args, which will be used to launch the transformed kernels 24 | for (size_t i = 0; i < num_kernel_calls; i++) { 25 | KernelArg &kernel_arg = trans_args[i]; 26 | std::string& kernel_name = model->kernels[i].name; 27 | 28 | uint32_t *launch_params = model->kernels[i].launch_params; 29 | kernel_arg.task_dim = dim3(launch_params[0],launch_params[1], launch_params[2]); 30 | kernel_arg.thread_dim = dim3(launch_params[3],launch_params[4], launch_params[5]); 31 | kernel_arg.block_num = launch_params[0] * launch_params[1] * launch_params[2]; 32 | kernel_arg.block_offset = 0; 33 | kernel_arg.cu_lower = 0; 34 | kernel_arg.cu_upper = GPUConfig::get_num_cus(); 35 | 36 | if (need_load_kernels) { 37 | RETURN_STATUS( 38 | GPUConfig::get_kernel_address( 39 | kernel_name.c_str(), module, kernel_arg.funcion_pointer 40 | ) 41 | ); 42 | kernel_arg.kernel = kernels[kernel_name]; 43 | RETURN_STATUS( 44 | GPUConfig::get_kernel_resource( 45 | kernel_arg.kernel, 46 | kernel_arg.resource 47 | ) 48 | ); 49 | } 50 | } 51 | 52 | // 2. 
prepare REAL kernel params (model params) 53 | size_t num_total_kernel_args = 0; 54 | size_t func_args_ptr_buffer_size = 0; 55 | for (size_t i = 0; i < num_kernel_calls; i++) { 56 | num_total_kernel_args += raw_args[i].size(); 57 | } 58 | 59 | func_args_ptr_buffer_size = align_up(num_total_kernel_args * sizeof(float *), (size_t)4096); 60 | 61 | GPU_RETURN_STATUS( 62 | GPUMalloc((GPUDevicePtr_t*)&func_args_base_ptr, func_args_ptr_buffer_size) 63 | ); 64 | size_t func_args_offset = 0; 65 | 66 | std::vector host_args(num_total_kernel_args); 67 | 68 | for (size_t i = 0; i < num_kernel_calls; i++) { 69 | KernelArg &kernel_arg = trans_args[i]; 70 | kernel_arg.args = (GPUDevicePtr_t)( 71 | (size_t)func_args_base_ptr + func_args_offset * sizeof(float*) 72 | ); 73 | for (size_t arg_idx : model->kernels[i].args) { 74 | host_args[func_args_offset] = storage[arg_idx]; 75 | func_args_offset ++; 76 | } 77 | } 78 | 79 | GPU_RETURN_STATUS( 80 | GPUMemcpyHtoD((GPUDevicePtr_t)func_args_base_ptr, (void*)host_args.data(), num_total_kernel_args * sizeof(float*)) 81 | ); 82 | 83 | // 3. calculate num_layers 84 | if (need_load_kernels) { 85 | for (size_t i = 0; i < num_kernel_calls; i++) { 86 | KernelArg &kernel_arg = trans_args[i]; 87 | KernelInfo &info = model->kernels[i]; 88 | std::string &kernel_name = info.name; 89 | GPUFunction_t func = kernels[kernel_name]; 90 | 91 | int max_layers = GPUConfig::calculate_occupancy( 92 | kernel_arg.resource, 93 | kernel_arg.thread_dim 94 | ); 95 | int num_layers = align_up(kernel_arg.block_num, num_cus) / num_cus; 96 | if (num_layers > max_layers) num_layers = max_layers; 97 | kernel_arg.min_occupancy = num_layers; 98 | } 99 | } 100 | 101 | // 4. prepare proxy kernels 102 | proxy_kernels.resize(10); 103 | proxy_kernels_nostack.resize(10); 104 | for (int i = 1; i <= 10; i++) { 105 | { 106 | std::stringstream kernel_name; 107 | kernel_name << REEF_PROXY_KERNEL_PREFIX() << i; 108 | 109 | GPUFunction_t proxy_kernel; 110 | GPU_RETURN_STATUS(GPUModuleGetFunction( 111 | &proxy_kernel, module, kernel_name.str().c_str()) 112 | ); 113 | proxy_kernels[i-1] = proxy_kernel; 114 | } 115 | 116 | { 117 | std::stringstream kernel_name; 118 | kernel_name << REEF_PROXY_KERNEL_NOSTACK_PREFIX() << i; 119 | 120 | GPUFunction_t proxy_kernel; 121 | GPU_RETURN_STATUS(GPUModuleGetFunction( 122 | &proxy_kernel, module, kernel_name.str().c_str())); 123 | proxy_kernels_nostack[i-1] = proxy_kernel; 124 | } 125 | } 126 | GPUConfig::KernelResource kr; 127 | RETURN_STATUS(GPUConfig::get_kernel_resource(proxy_kernels[0], kr)); 128 | max_stack_size = kr.stack_size; // TODO: move to GPU interface 129 | return Status::Succ; 130 | } 131 | 132 | Status TransExecutor::launch_kernel(int kernel_offset, GPUStream_t stream) { 133 | std::string& func_name = this->model->kernels[kernel_offset].name; 134 | GPUFunction_t func = this->kernels[func_name]; 135 | int num_cus = GPUConfig::get_num_cus(); 136 | uint32_t *launch_params = this->model->kernels[kernel_offset].launch_params; 137 | 138 | KernelArg &kernel_arg = this->trans_args[kernel_offset]; 139 | int logical_layers = align_up(kernel_arg.block_num, num_cus) / num_cus; 140 | int cu_partition = align_up(kernel_arg.block_num, logical_layers) / logical_layers; 141 | void* placeholder = nullptr; 142 | 143 | void *arg[] = { 144 | &placeholder, 145 | &(kernel_arg.min_occupancy), 146 | &(kernel_arg.block_num), 147 | &(kernel_arg.block_offset), 148 | &(kernel_arg.args), 149 | 150 | // These args are not actually used. 
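// (These fill the slots reserved for a second, padded kernel -- func_r, layers_r,
// task_num_r, task_offset_r, param_r in the transformed signature described in
// trans_executor.h. When a transformed kernel is launched on its own, as here,
// only the first set of args plus cu_upper is meaningful.)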
151 | &placeholder, 152 | &(kernel_arg.min_occupancy), 153 | &(kernel_arg.block_num), 154 | &(kernel_arg.block_offset), 155 | &(kernel_arg.args), 156 | 157 | 158 | &(kernel_arg.cu_upper), 159 | }; 160 | // assert(this->model->shared_memory.find(func_name) != this->model->shared_memory.end()); 161 | // std::cout << "shared: " << this->base_executor->model->shared_memory[func_name] << std::endl; 162 | // unsigned int logical_work_groups = launch_params[0] * launch_params[1] * launch_params[2]; 163 | // unsigned int num_layers = align_up(logical_work_groups, (unsigned int) GPU_NUM_CU) / GPU_NUM_CU; 164 | // unsigned int physical_work_groups = num_layers * GPU_NUM_CU; // align_up(logical_work_groups, num_layers) / num_layers; 165 | 166 | // std::cout << func_name << std::endl; 167 | GPU_RETURN_STATUS(GPUModuleLaunchKernel(func, 168 | num_cus * kernel_arg.min_occupancy, 1, 1, 169 | launch_params[3] * launch_params[4] * launch_params[5], 1, 1, 170 | 128, stream, arg, 0 171 | )); 172 | return Status::Succ; 173 | } 174 | 175 | GPUFunction_t TransExecutor::get_proxy_kernel(const GPUConfig::KernelResource& kr) { 176 | // FIXME: move to GPU interface 177 | static int sgpr_bound[] = { 178 | 102, 102, 102, 102, 102, 179 | 102, 102, 102, 88, 80 180 | }; 181 | 182 | static int vgpr_bound[] = { 183 | 256, 128, 84, 64, 48, 184 | 40, 36, 32, 28, 28 185 | }; 186 | int sgpr_idx = 0, vgpr_idx = 0; 187 | int occupancy = 10; 188 | for (int i = 1; i < 10; i++) { 189 | if (kr.vgprs > vgpr_bound[i] || kr.sgprs > sgpr_bound[i]) { 190 | assert(i > 0); 191 | occupancy = i; 192 | break; 193 | } 194 | } 195 | if (kr.stack_size > 0) 196 | return proxy_kernels[occupancy - 1]; 197 | else 198 | return proxy_kernels_nostack[occupancy - 1]; 199 | } 200 | 201 | } // namespace executor 202 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/executor/trans_executor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "reef/executor/executor_base.h" 3 | 4 | 5 | 6 | namespace reef { 7 | namespace executor { 8 | 9 | 10 | // TransExecutor is used for both real-time tasks and best-effort tasks. 11 | // Instead of using the raw GPU code from the DL compiler, TransExecutor executes 12 | // transformed GPU code that supports dynamic kernel padding. 13 | // There are mainly two transformations: 14 | // 1. kernel args 15 | // The original kernel looks like: 16 | // 17 | // __global__ void foo(float* a, float* b) { ... } 18 | // 19 | // The transformed kernel looks like: 20 | // 21 | // __global__ void foo( 22 | // void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l, 23 | // void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r, 24 | // int cu_partition) { ... } 25 | // 26 | // The params `param_l` and `param_r` consist of the original kernel args `float* a, float* b`. 27 | // 28 | // The execution of the transformed kernel must follow the `persistent thread` style. 29 | // 30 | // TODO: currently, dynamic kernel padding only supports 2 kernels with float params. 31 | // 32 | // 2. proxy kernel 33 | // The new transformed kernels can be called in two ways: 34 | // (1) directly launch the new kernel with new params (usually for testing). 35 | // (2) through a proxy kernel 36 | // 37 | // Currently, the proxy kernel has the same args as the transformed kernels. 
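//
// For illustration only: a minimal, hypothetical sketch of the persistent-thread
// shape such a transformed kernel takes. The real transformed code shipped with
// each model additionally handles the second padded kernel and CU partitioning
// via cu_partition, so it differs in detail.
//
// __global__ void foo_trans(
//     void* func_l, int layers_l, int task_num_l, int task_offset_l, float** param_l,
//     void* func_r, int layers_r, int task_num_r, int task_offset_r, float** param_r,
//     int cu_partition) {
//   float* a = param_l[0];   // original kernel args recovered from param_l
//   float* b = param_l[1];
//   // Each persistent block repeatedly claims a logical block id until all
//   // task_num_l logical blocks are done.
//   for (int task = blockIdx.x; task < task_num_l; task += gridDim.x) {
//     int logical_block = task_offset_l + task;
//     // ... original body of foo, with blockIdx.x replaced by logical_block ...
//   }
// }
//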
38 | 39 | class TransExecutor : public ExecutorBase { 40 | public: 41 | TransExecutor(); 42 | virtual ~TransExecutor(); 43 | virtual Status load_model_from_GPU_module(const char* json_file_path, GPUModule_t module) override; 44 | 45 | public: 46 | 47 | class KernelArg { 48 | public: 49 | GPUFunction_t kernel; 50 | GPUFunctionPtr_t funcion_pointer; 51 | dim3 task_dim; 52 | dim3 thread_dim; 53 | // GPUDeviceptr_t task_slots; 54 | int block_num; 55 | int block_offset; 56 | int cu_lower; 57 | int cu_upper; 58 | GPUDevicePtr_t args; 59 | 60 | int min_occupancy; // This is the minimal required occupancy for real-time task. 61 | GPUConfig::KernelResource resource; 62 | KernelProfile profile; 63 | }; 64 | 65 | protected: 66 | Status init_rt_executor(const char* json_file_path, GPUModule_t module); 67 | 68 | virtual Status launch_kernel(int kernel_offset, GPUStream_t stream) override; 69 | 70 | GPUFunction_t get_proxy_kernel(const GPUConfig::KernelResource& kr); 71 | protected: 72 | std::vector trans_args; 73 | GPUDevicePtr_t func_args_base_ptr; 74 | 75 | std::vector proxy_kernels; 76 | std::vector proxy_kernels_nostack; 77 | int max_stack_size; 78 | 79 | virtual std::string REEF_PROXY_KERNEL_PREFIX() const { 80 | return "merge_framework_"; 81 | } 82 | virtual std::string REEF_PROXY_KERNEL_NOSTACK_PREFIX() const { 83 | return "merge_framework_nostack_"; 84 | } 85 | 86 | }; 87 | 88 | } // namespace executor 89 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/protos/reef.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package reef.rpc; 4 | 5 | service REEFService { 6 | // load a DNN model 7 | rpc LoadModel (LoadModelRequest) returns (LoadModelReply) {} 8 | 9 | // create a Task queue 10 | rpc SetPriority (SetPriorityRequest) returns (SetPriorityReply) {} 11 | 12 | // register shared memory 13 | rpc RegisterBlob (RegisterBlobRequest) returns (RegisterBlobReply) {} 14 | 15 | // memcpy device to host 16 | rpc GetBlob (GetBlobRequest) returns (GetBlobReply) {} 17 | 18 | // memcpy host to device 19 | rpc SetBlob (SetBlobRequest) returns (SetBlobReply) {} 20 | 21 | // create an inference task 22 | rpc Infer (InferRequest) returns (InferReply) {} 23 | } 24 | 25 | message LoadModelRequest { 26 | string dir = 1; 27 | string name = 2; 28 | int32 qid = 3; 29 | } 30 | 31 | message LoadModelReply { 32 | bool succ = 1; // TODO: enums 33 | int32 mid = 2; 34 | } 35 | 36 | message SetPriorityRequest { 37 | bool rt = 1; 38 | } 39 | 40 | message SetPriorityReply { 41 | bool succ = 1; 42 | int32 qid = 2; 43 | } 44 | 45 | message RegisterBlobRequest { 46 | int32 mid = 1; 47 | string name = 2; 48 | } 49 | 50 | message RegisterBlobReply { 51 | bool succ = 1; 52 | string key = 2; 53 | int64 size = 3; 54 | } 55 | 56 | message GetBlobRequest { 57 | string key = 1; 58 | } 59 | 60 | message GetBlobReply { 61 | bool succ = 1; 62 | } 63 | 64 | message SetBlobRequest { 65 | string key = 1; 66 | } 67 | 68 | message SetBlobReply { 69 | bool succ = 1; 70 | } 71 | 72 | message InferRequest { 73 | int32 mid = 1; 74 | bool async = 2; 75 | } 76 | 77 | message InferReply { 78 | bool succ = 1; 79 | int32 tid = 2; 80 | } -------------------------------------------------------------------------------- /src/reef/rpc/placeholder: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SJTU-IPADS/reef/58dabe0a63fe6979349b358a78aa324cca050e4a/src/reef/rpc/placeholder -------------------------------------------------------------------------------- /src/reef/server/scheduler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "reef/util/threadsafe_queue.h" 9 | #include "reef/util/common.h" 10 | #include "reef/executor/hybrid_executor.h" 11 | 12 | namespace reef { 13 | namespace server { 14 | 15 | class REEFScheduler { 16 | class Model; 17 | public: 18 | typedef uint32_t ModelID; 19 | typedef uint32_t QueueID; 20 | typedef uint32_t TaskID; 21 | 22 | enum ScheduleMode { 23 | NoPreempt, // no preemption 24 | MultiStream, // multiple GPU streams 25 | WaitPreempt, // wait-based preemption 26 | REEF, 27 | Reset // reset-based preemption without DKP 28 | }; 29 | 30 | enum TaskQueueType { 31 | RealTimeQueue, 32 | BestEffortQueue, 33 | }; 34 | 35 | enum TaskState { 36 | Init, 37 | Waiting, 38 | Executing, 39 | Finish 40 | }; 41 | 42 | struct Task { 43 | friend REEFScheduler; 44 | private: 45 | std::shared_ptr model; 46 | QueueID qid; 47 | TaskID id; 48 | volatile TaskState state; 49 | int launch_offset; // the kernel idx that has been launched to host queue 50 | int kernel_offset; // the kernel idx that has been executed 51 | int block_offset; // for DKP 52 | std::mutex mtx; 53 | std::condition_variable cv; 54 | std::chrono::system_clock::time_point submit; // when this task is created 55 | std::chrono::system_clock::time_point start; // when this task is scheduled 56 | std::chrono::system_clock::time_point end; // when this task is completed 57 | bool preempted; 58 | bool padding; 59 | bool padding_to_finish; 60 | public: 61 | bool is_preempted() const; 62 | bool is_padded() const; 63 | bool is_padded_to_complete() const; 64 | std::vector get_timestamp() const; 65 | }; 66 | 67 | public: 68 | REEFScheduler(ScheduleMode _mode = ScheduleMode::REEF); 69 | ~REEFScheduler(); 70 | 71 | Status load_model( 72 | const std::string& model_dir, 73 | const std::string& model_name, 74 | ModelID& mid 75 | ); 76 | 77 | Status load_model( 78 | const std::string& rt_co_path, 79 | const std::string& be_co_path, 80 | const std::string& json_path, 81 | const std::string& profile_path, 82 | const std::string& param_path, 83 | ModelID& mid 84 | ); 85 | 86 | Status create_queue( 87 | const TaskQueueType& qtp, 88 | QueueID& qid 89 | ); 90 | 91 | Status bind_model_queue( 92 | const QueueID& qid, 93 | const ModelID& mid 94 | ); 95 | 96 | Status get_data_size(ModelID mid, const std::string& name, size_t& size); 97 | 98 | Status set_input(ModelID mid, const void* data, size_t len, const std::string& name="data"); 99 | 100 | Status get_output(ModelID mid, void* data, size_t len, const std::string& name="output"); 101 | 102 | Status new_task( 103 | const ModelID& mid, 104 | TaskID& tid 105 | ); 106 | 107 | Status wait_task( 108 | TaskID tid 109 | ); 110 | 111 | Status get_task( 112 | TaskID tid, 113 | std::shared_ptr& t 114 | ); 115 | 116 | ScheduleMode sche_mode() const; 117 | 118 | void set_wait_sync(bool value); 119 | 120 | void set_be_stream_cap(int value); 121 | Status run(); 122 | Status shutdown(); 123 | 124 | int64_t avg_preempt_latency() const; 125 | 126 | int64_t avg_kernel_sel_latency() const; 127 | private: 128 | ScheduleMode mode; 129 | const size_t model_pool_capacity = 1024; 130 | std::atomic_uint32_t model_pool_size; 131 | struct Model { 132 | 
executor::HybridExecutor executor; 133 | QueueID qid; 134 | }; 135 | std::vector> model_pool; 136 | 137 | 138 | std::atomic_uint32_t task_idx_pool; 139 | std::unordered_map> task_pool; 140 | std::mutex task_pool_mtx; 141 | 142 | struct TaskQueue { 143 | ThreadSafeQueue> task_queue; 144 | executor::GPUStream_t stream; 145 | }; 146 | 147 | const size_t max_num_be_queues = 32; 148 | const QueueID rt_queue_id = 32; // the same with be queue num 149 | std::mutex be_queues_mtx; 150 | std::vector> be_queues; 151 | volatile uint32_t be_queue_cnt; 152 | std::shared_ptr rt_queue; 153 | std::mutex task_cnt_mtx; 154 | std::condition_variable task_cnt_cv; // To wake up the scheduler 155 | volatile uint32_t task_cnt; 156 | bool wait_sync; 157 | 158 | std::unique_ptr scheduler; 159 | executor::GPUStream_t execute_stream, preempt_stream; 160 | executor::GPUDevicePtr_t preempt_flag; 161 | bool preempted; 162 | int be_stream_device_queue_cap; 163 | std::atomic_bool _shutdown; 164 | 165 | uint64_t preempt_count; 166 | uint64_t preempt_latency_sum; 167 | 168 | uint64_t kernel_sel_count; 169 | uint64_t kernel_sel_latency_sum; 170 | private: 171 | Status create_task_queue(std::shared_ptr& ret, bool rt); 172 | void loop_body(); 173 | void execute_be_task(std::shared_ptr& task, std::shared_ptr& tqueue); 174 | void execute_rt_task(std::shared_ptr& task); 175 | void preempt_be_tasks(); 176 | void reset_preempt_flag_async(); 177 | void preempt_reset(); 178 | void preempt_wait(); 179 | void dynamic_kernel_padding(std::shared_ptr& rt_task); 180 | executor::GPUFunction_t get_proxy_kernel( 181 | const executor::GPUConfig::KernelResource& resource, 182 | executor::HybridExecutor* rt_executor, 183 | executor::HybridExecutor* be_executor 184 | ); 185 | }; 186 | 187 | 188 | } // namespace server 189 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/server/server.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/server/server.h" 2 | #include 3 | #include 4 | 5 | namespace reef { 6 | namespace server { 7 | 8 | REEFServer::REEFServer(const std::string& addr) 9 | : server_addr(addr), rpc_server(nullptr) 10 | { 11 | scheduler.reset(new REEFScheduler()); 12 | } 13 | 14 | void REEFServer::run() { 15 | grpc::ServerBuilder builder; 16 | builder.AddListeningPort(server_addr, grpc::InsecureServerCredentials()); 17 | builder.RegisterService(this); 18 | 19 | rpc_server = builder.BuildAndStart(); 20 | scheduler->run(); 21 | } 22 | 23 | void REEFServer::wait() { 24 | ASSERT(rpc_server.get() != nullptr); 25 | rpc_server->Wait(); 26 | } 27 | 28 | void REEFServer::shutdown() { 29 | ASSERT(rpc_server.get() != nullptr); 30 | rpc_server->Shutdown(); 31 | scheduler->shutdown(); 32 | } 33 | 34 | grpc::Status REEFServer::SetPriority( 35 | grpc::ServerContext *context, 36 | const reef::rpc::SetPriorityRequest *request, 37 | reef::rpc::SetPriorityReply *reply 38 | ) { 39 | LOG(INFO) << "new client, real_time: " << request->rt(); 40 | // create queue 41 | REEFScheduler::QueueID qid; 42 | Status s = scheduler->create_queue( 43 | request->rt() ? 
44 | REEFScheduler::TaskQueueType::RealTimeQueue 45 | : REEFScheduler::TaskQueueType::BestEffortQueue, 46 | qid 47 | ); 48 | if (s != Status::Succ) 49 | reply->set_succ(false); 50 | else { 51 | reply->set_succ(true); 52 | reply->set_qid(qid); 53 | } 54 | return grpc::Status::OK; 55 | } 56 | 57 | grpc::Status REEFServer::LoadModel( 58 | grpc::ServerContext *context, 59 | const reef::rpc::LoadModelRequest *request, 60 | reef::rpc::LoadModelReply *reply 61 | ) { 62 | LOG(INFO) << "load model: " << request->name() << ", qid: " << request->qid(); 63 | std::string prefix = request->dir() + "/" + request->name(); 64 | std::string param_file = prefix + ".param"; 65 | if (access(param_file.c_str(), F_OK) == -1) { 66 | param_file = ""; 67 | LOG(INFO) << request->name() << " no param file"; 68 | } 69 | 70 | REEFScheduler::ModelID mid; 71 | Status s = scheduler->load_model( 72 | prefix + ".trans.co", 73 | prefix + ".be.co", 74 | prefix + ".json", 75 | prefix + ".profile.json", 76 | param_file, // TODO: load param 77 | mid 78 | ); 79 | if (s != Status::Succ) { 80 | reply->set_succ(false); 81 | return grpc::Status::OK; 82 | } else { 83 | reply->set_mid(mid); 84 | } 85 | s = scheduler->bind_model_queue(request->qid(), mid); 86 | if (s != Status::Succ) { 87 | reply->set_succ(false); // TODO: unload model 88 | return grpc::Status::OK; 89 | } else { 90 | reply->set_succ(true); 91 | } 92 | return grpc::Status::OK; 93 | } 94 | 95 | grpc::Status REEFServer::RegisterBlob( 96 | grpc::ServerContext *context, 97 | const reef::rpc::RegisterBlobRequest *request, 98 | reef::rpc::RegisterBlobReply *reply 99 | ) { 100 | reply->set_succ(false); 101 | size_t size; 102 | auto s = scheduler->get_data_size(request->mid(), request->name(), size); 103 | if (s != Status::Succ) return grpc::Status::OK; 104 | std::string key = std::to_string(request->mid()) + "_" + request->name(); 105 | reply->set_key(key); 106 | reply->set_size(size); 107 | reply->set_succ(true); 108 | { 109 | std::unique_lock lock(shm_mtx); 110 | auto iter = shms.find(key); 111 | if (iter == shms.end()) { 112 | auto shm = std::make_shared(key, size, true); 113 | SharedMemoryInfo shminfo; 114 | shminfo.name = request->name(); 115 | shminfo.mid = request->mid(); 116 | shminfo.shm = shm; 117 | shms.insert({key, shminfo}); 118 | } 119 | } 120 | return grpc::Status::OK; 121 | } 122 | 123 | grpc::Status REEFServer::GetBlob( 124 | grpc::ServerContext *context, 125 | const reef::rpc::GetBlobRequest *request, 126 | reef::rpc::GetBlobReply *reply 127 | ) { 128 | SharedMemoryInfo shminfo; 129 | { 130 | std::unique_lock lock(shm_mtx); 131 | auto iter = shms.find(request->key()); 132 | if (iter == shms.end()) { 133 | reply->set_succ(false); 134 | return grpc::Status::OK; 135 | } 136 | shminfo = iter->second; 137 | } 138 | auto s = scheduler->get_output(shminfo.mid, shminfo.shm->data(), shminfo.shm->size(), shminfo.name); 139 | if (s != Status::Succ) { 140 | reply->set_succ(false); 141 | } else { 142 | reply->set_succ(true); 143 | } 144 | return grpc::Status::OK; 145 | } 146 | 147 | grpc::Status REEFServer::SetBlob( 148 | grpc::ServerContext *context, 149 | const reef::rpc::SetBlobRequest *request, 150 | reef::rpc::SetBlobReply *reply 151 | ) { 152 | SharedMemoryInfo shminfo; 153 | { 154 | std::unique_lock lock(shm_mtx); 155 | auto iter = shms.find(request->key()); 156 | if (iter == shms.end()) { 157 | reply->set_succ(false); 158 | return grpc::Status::OK; 159 | } 160 | shminfo = iter->second; 161 | } 162 | auto s = scheduler->set_input(shminfo.mid, shminfo.shm->data(), 
shminfo.shm->size(), shminfo.name); 163 | if (s != Status::Succ) { 164 | reply->set_succ(false); 165 | } else { 166 | reply->set_succ(true); 167 | } 168 | return grpc::Status::OK; 169 | } 170 | 171 | grpc::Status REEFServer::Infer( 172 | grpc::ServerContext *context, 173 | const reef::rpc::InferRequest *request, 174 | reef::rpc::InferReply *reply 175 | ) { 176 | REEFScheduler::TaskID tid; 177 | auto s = scheduler->new_task(request->mid(), tid); 178 | if (s != Status::Succ) { 179 | reply->set_succ(false); 180 | } else { 181 | s = scheduler->wait_task(tid); 182 | reply->set_succ(true); 183 | if (s != Status::Succ) 184 | reply->set_succ(false); 185 | reply->set_tid(tid); 186 | } 187 | return grpc::Status::OK; 188 | } 189 | 190 | 191 | } // namespace server 192 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/server/server.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "reef/util/common.h" 4 | #include "reef/util/shared_memory.h" 5 | #include "reef/rpc/reef.grpc.pb.h" 6 | #include "reef/server/scheduler.h" 7 | #include 8 | 9 | namespace reef { 10 | namespace server { 11 | 12 | 13 | class REEFServer final : public reef::rpc::REEFService::Service { 14 | public: 15 | REEFServer(const std::string& addr); 16 | virtual ~REEFServer() {} 17 | void run(); 18 | 19 | void wait(); 20 | 21 | void shutdown(); 22 | 23 | REEFScheduler* get_scheduler() const { 24 | return scheduler.get(); 25 | } 26 | 27 | private: 28 | // RPC handles 29 | grpc::Status SetPriority( 30 | grpc::ServerContext *context, 31 | const reef::rpc::SetPriorityRequest *request, 32 | reef::rpc::SetPriorityReply *reply 33 | ) override; 34 | 35 | grpc::Status LoadModel( 36 | grpc::ServerContext *context, 37 | const reef::rpc::LoadModelRequest *request, 38 | reef::rpc::LoadModelReply *reply 39 | ) override; 40 | 41 | grpc::Status RegisterBlob( 42 | grpc::ServerContext *context, 43 | const reef::rpc::RegisterBlobRequest *request, 44 | reef::rpc::RegisterBlobReply *reply 45 | ) override; 46 | 47 | grpc::Status GetBlob( 48 | grpc::ServerContext *context, 49 | const reef::rpc::GetBlobRequest *request, 50 | reef::rpc::GetBlobReply *reply 51 | ) override; 52 | 53 | grpc::Status SetBlob( 54 | grpc::ServerContext *context, 55 | const reef::rpc::SetBlobRequest *request, 56 | reef::rpc::SetBlobReply *reply 57 | ) override; 58 | 59 | grpc::Status Infer( 60 | grpc::ServerContext *context, 61 | const reef::rpc::InferRequest *request, 62 | reef::rpc::InferReply *reply 63 | ) override; 64 | 65 | private: 66 | std::string server_addr; 67 | std::unique_ptr rpc_server; 68 | std::unique_ptr scheduler; 69 | std::mutex shm_mtx; 70 | struct SharedMemoryInfo { 71 | std::string name; 72 | std::shared_ptr shm; 73 | REEFScheduler::ModelID mid; 74 | }; 75 | std::unordered_map shms; 76 | }; 77 | 78 | } // namespace server 79 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/util/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define DEFAULT_REEF_ADDR "localhost:34543" 9 | 10 | #ifndef RESOURCE_DIR 11 | #define RESOURCE_DIR "../resource" 12 | #endif 13 | 14 | #define ASSERT(condition)\ 15 | do { \ 16 | if (! 
(condition)) { \ 17 | std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ 18 | << ":" << __LINE__ << std::endl; \ 19 | std::terminate(); \ 20 | } \ 21 | } while (false) 22 | 23 | #define ASSERT_STATUS(cmd) ASSERT(cmd == Status::Succ) 24 | 25 | #define RETURN_STATUS(cmd) \ 26 | {\ 27 | Status s = cmd;\ 28 | if (s != Status::Succ) {\ 29 | LOG(ERROR) << #cmd " error, " << __FILE__ << ":" << __LINE__; \ 30 | return s;\ 31 | }\ 32 | } 33 | 34 | #define ASSERT_MSG(condition, message) \ 35 | do { \ 36 | if (! (condition)) { \ 37 | std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ 38 | << ":" << __LINE__ << " msg: " << message << std::endl; \ 39 | std::terminate(); \ 40 | } \ 41 | } while (false) 42 | 43 | 44 | namespace reef { 45 | 46 | enum Status { 47 | Succ, 48 | Fail, 49 | NotFound, 50 | OutOfRange, 51 | Full 52 | }; 53 | 54 | template 55 | T align_up(T value, T alignment) { 56 | T temp = value % alignment; 57 | return temp == 0? value : value - temp + alignment; 58 | } 59 | 60 | template 61 | T align_down(T value, T alignment) { 62 | return value - value % alignment; 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /src/reef/util/json.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "reef/util/json.h" 5 | 6 | namespace reef { 7 | namespace util { 8 | 9 | #define IS_DIGIT(chara) ((chara) <= '9' && (chara) >= '0') 10 | #define IS_SPACE(chara) ((chara) == ' ' || (chara) == '\n' || (chara) == '\t') 11 | 12 | #define MATCH_CONDITION(iter, str, delim) \ 13 | ((iter) < (str).length() && ((str)[iter] != delim || (str)[iter-1] == '\\')) 14 | 15 | #define TOKENIZE_SEPARATEOR(iter, type, sep, tokens) \ 16 | case sep: \ 17 | tokens.push_back(JsonParser::token(type)); \ 18 | continue; 19 | 20 | #define TOKENIZE_COUPLED(iter, str, type, sep, tokens) \ 21 | case sep: \ 22 | { \ 23 | size_t _tmp = iter + 1; \ 24 | for (; MATCH_CONDITION(_tmp, str, sep); _tmp++) ; \ 25 | tokens.push_back(JsonParser::token(type, str.substr(iter+1, _tmp-iter-1))); \ 26 | iter = _tmp; \ 27 | continue; \ 28 | } 29 | 30 | const char* token_name[] = {"invalid", "string", "number", "[", "]", "{", "}", ",", ":"}; 31 | 32 | 33 | JsonObject* JsonParser::parse(std::string& str) { 34 | int top = 0; 35 | JsonObject* jobj = _parse(tokenize(str), top); 36 | return jobj; 37 | } 38 | 39 | std::vector JsonParser::tokenize(std::string& str) { 40 | std::vector tokens; 41 | 42 | while (!str.empty()) { 43 | strip_space(str); 44 | std::string token = split_by_space(str); 45 | 46 | for (size_t i = 0; i < token.length(); i++) { 47 | switch (token[i]) { 48 | TOKENIZE_COUPLED(i, token, STRING, '"', tokens); 49 | TOKENIZE_COUPLED(i, token, STRING, '\'', tokens); 50 | TOKENIZE_SEPARATEOR(i, COMMA, ',', tokens); 51 | TOKENIZE_SEPARATEOR(i, LBRACKET, '[', tokens); 52 | TOKENIZE_SEPARATEOR(i, RBRACKET, ']', tokens); 53 | TOKENIZE_SEPARATEOR(i, LBRACE, '{', tokens); 54 | TOKENIZE_SEPARATEOR(i, RBRACE, '}', tokens); 55 | TOKENIZE_SEPARATEOR(i, COLON, ':', tokens); 56 | } 57 | 58 | if (token[i] == '-' || IS_DIGIT(token[i])) { 59 | size_t tmp = (token[i] == '-') ? (i + 1) : i; 60 | bool is_float = false; 61 | for (; tmp < token.length() && (IS_DIGIT(token[tmp]) || token[tmp] == '.'); tmp++) 62 | if (token[tmp] == '.') is_float = true; 63 | tokens.push_back(JsonParser::token(is_float ? 
FLOAT : INTEGER, token.substr(i, tmp-i))); 64 | i = tmp - 1; 65 | continue; 66 | } 67 | 68 | printf("Error: unrecognizable token at %s\n", token.substr(i).c_str()); 69 | exit(1); 70 | } 71 | } 72 | 73 | return tokens; 74 | } 75 | 76 | JsonObject* JsonParser::_parse(std::vector tokens, int& top) { 77 | JsonObject* cur = new JsonObject; 78 | 79 | switch (tokens[top].type) { 80 | case LBRACE: 81 | cur->type = JsonObject::J_DICT; 82 | top++; 83 | while (tokens[top].type != RBRACE) { 84 | assert(tokens[top].type == STRING); 85 | std::string key = tokens[top].value; 86 | 87 | assert(tokens[top+1].type == COLON); 88 | top += 2; 89 | 90 | cur->mval.insert(std::pair(key, _parse(tokens, top))); 91 | if (tokens[top].type == COMMA) top++; 92 | } 93 | top++; 94 | return cur; 95 | 96 | case LBRACKET: 97 | cur->type = JsonObject::J_LIST; 98 | top++; 99 | while (tokens[top].type != RBRACKET) { 100 | cur->lval.push_back(_parse(tokens, top)); 101 | if (tokens[top].type == COMMA) top++; 102 | } 103 | top++; 104 | return cur; 105 | 106 | case INTEGER: 107 | cur->type = JsonObject::J_INT; 108 | cur->ival = atoi(tokens[top].value.c_str()); 109 | top++; 110 | return cur; 111 | case FLOAT: 112 | cur->type = JsonObject::J_FLOAT; 113 | cur->fval = (float)atof(tokens[top].value.c_str()); 114 | top++; 115 | return cur; 116 | case STRING: 117 | cur->type = JsonObject::J_STRING; 118 | cur->sval = tokens[top].value; 119 | top++; 120 | return cur; 121 | default: 122 | break; 123 | } 124 | 125 | return cur; 126 | } 127 | 128 | void JsonParser::strip_space(std::string& str) { 129 | 130 | for (size_t i = 0; i < str.length(); i++) { 131 | if (!IS_SPACE(str[i])) { 132 | str.erase(0, i); 133 | return; 134 | } 135 | } 136 | } 137 | 138 | std::string JsonParser::split_by_space(std::string& str) { 139 | std::string token; 140 | 141 | for (size_t i = 0; i < str.length(); i++) { 142 | switch (str[i]) { 143 | case '"': 144 | for (i++; MATCH_CONDITION(i, str, '"'); i++) ; 145 | break; 146 | case '\'': 147 | for (i++; MATCH_CONDITION(i, str, '\''); i++) ; 148 | break; 149 | case ' ': 150 | case '\n': 151 | case '\t': 152 | token = str.substr(0, i); 153 | str.erase(0, i); 154 | return token; 155 | default: 156 | continue; 157 | } 158 | } 159 | 160 | token = str; 161 | str.erase(0, str.length()); 162 | return token; 163 | } 164 | } // namespace util 165 | } // namespace reef 166 | 167 | -------------------------------------------------------------------------------- /src/reef/util/json.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace reef { 8 | namespace util { 9 | 10 | // A json object, which supports `string`, `int`, `float`, `list` and `dict(map)`. 11 | class JsonObject { 12 | public: 13 | std::string sval; 14 | uint32_t ival; 15 | float fval; 16 | std::vector lval; 17 | std::map mval; 18 | 19 | enum jobject_type {J_STRING, J_INT, J_FLOAT, J_LIST, J_DICT}; 20 | jobject_type type; 21 | 22 | JsonObject() {} 23 | }; 24 | 25 | // A json parser that parses a string to a JsonObject. 
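//
// Illustrative usage (mirroring how ModelProfile::from_json drives the parser;
// the JSON literal below is just an example):
//
//   std::string text = "{\"model_latency\": 42, \"kernel_latency\": {}}";
//   JsonObject* obj = JsonParser::parse(text);
//   uint32_t latency = obj->mval["model_latency"]->ival;   // 42
//   delete obj;
//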
26 | class JsonParser { 27 | public: 28 | // parses a string to a JsonObject 29 | static JsonObject* parse(std::string& str); 30 | 31 | private: 32 | enum token_type {INVAL, STRING, FLOAT, INTEGER, LBRACKET, RBRACKET, LBRACE, RBRACE, COMMA, COLON}; 33 | struct token { 34 | token_type type; 35 | std::string value; 36 | token(token_type t, std::string v="") : type(t), value(v) {} 37 | }; 38 | 39 | static std::vector tokenize(std::string& str); 40 | static JsonObject* _parse(std::vector tokens, int& top); 41 | 42 | static void strip_space(std::string& str); 43 | static std::string split_by_space(std::string& str); 44 | }; 45 | 46 | } // namespace reef 47 | } // namespace util -------------------------------------------------------------------------------- /src/reef/util/shared_memory.cpp: -------------------------------------------------------------------------------- 1 | #include "reef/util/shared_memory.h" 2 | #include "reef/util/common.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace reef { 11 | namespace util { 12 | 13 | SharedMemory::SharedMemory( 14 | const std::string& __key, size_t __size, bool create 15 | ) : _key(__key), _size(__size), _create(create) 16 | { 17 | _fd = shm_open( 18 | __key.c_str(), 19 | create ? O_CREAT|O_RDWR : O_RDWR, 20 | 0777 21 | ); 22 | ASSERT(_fd >= 0); 23 | ASSERT(ftruncate(_fd, _size) >= 0); 24 | _data = mmap(NULL, _size, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, 0); 25 | ASSERT(_data != nullptr); 26 | } 27 | 28 | SharedMemory::~SharedMemory() { 29 | close(_fd); 30 | if (_create) { 31 | shm_unlink(_key.c_str()); 32 | } 33 | } 34 | 35 | void* SharedMemory::data() { 36 | return _data; 37 | } 38 | 39 | size_t SharedMemory::size() { 40 | return _size; 41 | } 42 | 43 | } // namespace util 44 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/util/shared_memory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace reef { 6 | namespace util { 7 | 8 | class SharedMemory { 9 | public: 10 | SharedMemory(const std::string& __key, size_t __size, bool create=false); 11 | ~SharedMemory(); 12 | 13 | void* data(); 14 | size_t size(); 15 | 16 | private: 17 | int _fd; 18 | std::string _key; 19 | size_t _size; 20 | void* _data; 21 | bool _create; 22 | 23 | }; 24 | 25 | 26 | } // namespace util 27 | } // namespace reef -------------------------------------------------------------------------------- /src/reef/util/threadsafe_queue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | namespace reef { 9 | // TODO: replace it with a lock-free queue 10 | template 11 | class ThreadSafeQueue { 12 | public: 13 | 14 | enum { Capacity = 10000 }; 15 | 16 | ThreadSafeQueue() : _tail(0), _head(0){ 17 | _array.resize(Capacity); 18 | } 19 | 20 | virtual ~ThreadSafeQueue() { 21 | } 22 | 23 | ThreadSafeQueue(const ThreadSafeQueue &queue) = delete; 24 | 25 | ThreadSafeQueue(ThreadSafeQueue && queue) noexcept { 26 | _tail.store(queue._tail.load()); 27 | _head.store(queue._head.load()); 28 | _array = std::move(queue._array); 29 | } 30 | 31 | /* Producer only: updates tail index after setting the element in place */ 32 | bool push(const Element& item) 33 | { 34 | // quick fix: lock the producers 35 | std::unique_lock lock(mtx); 36 | auto current_tail = _tail.load(); 37 | auto next_tail = increment(current_tail); 38 | 
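// The ring buffer deliberately keeps one slot unused: head == tail means "empty",
// while advancing the tail onto the head would mean "full", so that push is rejected below.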
if(next_tail != _head.load()) 39 | { 40 | _array[current_tail] = item; 41 | _tail.store(next_tail); 42 | return true; 43 | } 44 | 45 | return false; // full queue 46 | } 47 | 48 | /* Consumer only: updates head index after retrieving the element */ 49 | void pop() 50 | { 51 | std::unique_lock lock(mtx); 52 | const auto current_head = _head.load(); 53 | assert(current_head != _tail.load()); // empty queue 54 | _head.store(increment(current_head)); 55 | } 56 | 57 | Element& front() 58 | { 59 | std::unique_lock lock(mtx); 60 | const auto current_head = _head.load(); 61 | assert(current_head != _tail.load()); // empty queue 62 | auto &item = _array[current_head]; 63 | return item; 64 | } 65 | 66 | bool empty() const { 67 | // std::unique_lock lock(mtx); 68 | return (_head.load() == _tail.load()); 69 | } 70 | 71 | bool full() const 72 | { 73 | const auto next_tail = increment(_tail.load()); 74 | return (next_tail == _head.load()); 75 | } 76 | 77 | private: 78 | size_t increment(size_t idx) const 79 | { 80 | return (idx + 1) % Capacity; 81 | } 82 | std::atomic _tail; 83 | std::vector _array; 84 | std::mutex mtx; 85 | std::atomic _head; 86 | }; 87 | } // namespace reef --------------------------------------------------------------------------------