├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── models
│   ├── __init__.py
│   └── vision
│       ├── __init__.py
│       ├── resnet.py
│       └── utils.py
├── plan.py
├── plans
│   └── V100
│       ├── bert_base
│       │   └── README.md
│       ├── bert_large
│       │   └── README.md
│       ├── gpt2
│       │   └── README.md
│       ├── gpt2_medium
│       │   └── README.md
│       ├── resnet101
│       │   └── README.md
│       ├── resnet50
│       │   └── README.md
│       ├── roberta_base
│       │   └── README.md
│       └── roberta_large
│           └── README.md
├── proto
│   ├── deepcache.proto
│   ├── deepplan.proto
│   └── deepplan_pb2.py
├── pytorch.patch
├── requirements.txt
├── scripts
│   ├── create_all_plans.sh
│   ├── download_azure_trace_dataset.sh
│   ├── fig10
│   │   ├── graph.py
│   │   └── run.sh
│   ├── fig12
│   │   ├── graph.py
│   │   └── run.sh
│   ├── fig13
│   │   ├── graph.py
│   │   └── run.sh
│   └── fig14
│       ├── graph.py
│       └── run.sh
├── src
│   ├── CMakeLists.txt
│   ├── benchmark.cpp
│   ├── client.cpp
│   ├── client
│   │   ├── azure.h
│   │   ├── client.cpp
│   │   ├── client.h
│   │   ├── workload.cpp
│   │   └── workload.h
│   ├── deepplan
│   │   ├── engine.cpp
│   │   ├── engine.h
│   │   ├── model.cpp
│   │   └── model.h
│   ├── network
│   │   ├── message.h
│   │   ├── network.cpp
│   │   ├── network.h
│   │   ├── server_api.cpp
│   │   ├── server_api.h
│   │   ├── session.cpp
│   │   └── session.h
│   ├── server.cpp
│   ├── server
│   │   ├── controller.cpp
│   │   ├── controller.h
│   │   ├── model_manager.cpp
│   │   ├── model_manager.h
│   │   ├── server.cpp
│   │   ├── server.h
│   │   ├── worker.cpp
│   │   └── worker.h
│   ├── server_api.h
│   ├── util.cpp
│   └── util.h
└── util.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tags
2 | *.swp
3 | build
4 | model_repo/*
5 | __pycache__
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.0)
2 | set(CMAKE_CXX_STANDARD 14)
3 | set(CMAKE_C_STANDARD 11)
4 |
5 | project(DeepPlan)
6 |
7 | find_package(Torch REQUIRED)
8 | find_package(Boost COMPONENTS system filesystem REQUIRED)
9 |
10 | include_directories(${Boost_INCLUDE_DIR})
11 |
12 | # Protobuf v3.11.4 is used to (de)serialize plans
13 | find_package(Protobuf REQUIRED)
14 | include_directories(${Protobuf_INCLUDE_DIR})
15 |
16 | include_directories(${CMAKE_CURRENT_BINARY_DIR})
17 | protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS proto/deepplan.proto)
18 | add_library(deepplan_proto ${PROTO_HDRS} ${PROTO_SRCS})
19 | target_link_libraries(deepplan_proto PRIVATE ${Protobuf_LIBRARIES})
20 |
21 | protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS proto/deepcache.proto)
22 | add_library(deepcache_proto ${PROTO_HDRS} ${PROTO_SRCS})
23 | target_link_libraries(deepcache_proto PRIVATE ${Protobuf_LIBRARIES})
24 |
25 | add_subdirectory(src)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Computer Systems Laboratory @ Ajou University
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to
permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepPlan
2 |
3 | Title: Fast and Efficient Model Serving Using Multi-GPUs with Direct-Host-Access
4 |
5 | ## 1. Experimental Environment
6 | ### 1.1 Hardware
7 | * AWS P3.8xlarge instance
8 | * GPU: NVIDIA V100 (16GB) x 4ea
9 | * Memory: 244GB DDR4 DRAM
10 | * CPU: Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
11 | * NVLink 2.0
12 | * PCIe 3.0
13 |
14 | For the EuroSys '23 Artifact Evaluation Committee, we can provide the AWS instance we used if you don't have a machine that satisfies the requirements. Let us know through the HotCRP portal.
15 |
16 | ### 1.2 Software requirements
17 | * Operating system: Ubuntu 18.04
18 | * CUDA v11.3
19 | * CuDNN v8.2.1
20 | * ProtoBuf v3.11.4
21 | * Boost v1.65
22 | * TBB (Threading Building Blocks) [v2017_U7](https://github.com/oneapi-src/oneTBB/tree/2017_U7)
23 | * PyTorch v1.9
24 | * Matplotlib v3.3.4 (for generating graphs)
25 |
26 | ## 2. Build software components
27 |
28 | ### 2.1 Dependent packages
29 | * build-essential
30 | ```bash
31 | $ sudo apt update
32 | $ sudo apt install build-essential
33 | ```
34 |
35 | * C++ libraries on Ubuntu
36 | ```bash
37 | $ sudo apt-get install libtbb-dev libboost1.65-all-dev
38 | ```
39 |
40 | * CUDA Toolkit v11.3 & CuDNN v8.2.1
41 |
42 | DeepPlan works with the PyTorch DL framework. To run PyTorch,
43 | we must first install its dependencies, CUDA and CuDNN.
44 |
45 | To install the CUDA Toolkit, see this link: [Download Installer for Linux Ubuntu 18.04 x86_64](https://developer.nvidia.com/cuda-11.3.0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=18.04&target_type=deb_local)
46 |
47 | To install the CuDNN library, see these links: [Installation Guide](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html) and [CuDNN Archive](https://developer.nvidia.com/rdp/cudnn-archive)
48 |
49 | * ProtoBuf v3.11.4
50 |
51 | DeepPlan uses the ProtoBuf library to serialize and deserialize plans,
52 | so ProtoBuf is required to build DeepPlan. To install ProtoBuf, see the
53 | following link: https://github.com/protocolbuffers/protobuf/blob/main/src/README.md
54 |
55 | ### 2.2 PyTorch
56 | To use DeepPlan, the PyTorch (v1.9) framework must be modified.
57 | To simplify applying these code changes to the framework, we provide a patch file for DeepPlan.
58 | The following commands apply the patch to PyTorch v1.9.0.
59 |
60 | ```bash
61 | $ cd $HOME
62 | $ # Let's first clone the DeepPlan repository and set the path
63 | $ git clone https://github.com/csl-ajou/DeepPlan/
64 | $ DEEPPLAN_HOME=$HOME/DeepPlan
65 | $
66 | $ # Let's download the PyTorch v1.9.0 package and set the path
67 | $ git clone --recursive https://github.com/pytorch/pytorch -b v1.9.0
68 | $ PYTORCH_HOME=$HOME/pytorch
69 | $
70 | $ cd $PYTORCH_HOME
71 | $ patch -p1 < $DEEPPLAN_HOME/pytorch.patch
72 | ```
73 |
74 | After applying the patch file, let's compile PyTorch.
75 |
76 | ```bash
77 | $ python3 setup.py install
78 | ```
79 |
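If you want a quick sanity check that your interpreter now picks up the freshly built PyTorch, the snippet below helps (it is ours, not part of the original instructions; it only confirms the version and CUDA visibility, not the patch itself):

```python
# Sanity check: confirm the freshly built PyTorch v1.9.0 is the one in use.
import torch

assert torch.__version__.startswith("1.9.0"), torch.__version__
print("PyTorch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
```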
80 | In addition to PyTorch, install the required pip modules using the commands below, from DeepPlan's home directory.
81 | ```bash
82 | $ cd $DEEPPLAN_HOME
83 | $ pip3 install -r requirements.txt
84 | ```
85 |
86 | ### 2.3 DeepPlan
87 |
88 | After successfully patching and building the PyTorch framework, we are
89 | ready to build DeepPlan, which generates inference execution plans, and
90 | the DL server prototype.
91 |
92 | ```bash
93 | $ cd $DEEPPLAN_HOME
94 | $ mkdir build
95 | $ cd build
96 | $ cmake -DCMAKE_PREFIX_PATH=$PYTORCH_HOME ..
97 | $ make
98 | ```
99 |
100 | ## 3. Setup execution plans
101 |
102 | You need to create a plan for a given model. In this tutorial, our target is ResNet50.
103 | The Python module `plan.py` already imports the pre-trained models evaluated in the paper, so you can simply type the name of the model.
104 | ```bash
105 | # Create Plan
106 | $ cd $DEEPPLAN_HOME
107 | $ mkdir -p plan_repo
108 | $ python3 plan.py -m resnet50 -p plan_repo
109 | # The generated plan from this command is saved in the plan_repo directory
110 | ```
111 |
112 | If you want to take a look at the generated plans (Table 3 in the paper), you can follow the link below.
113 |
114 | * [Plans](https://github.com/csl-ajou/DeepPlan/tree/main/plans/V100)
115 |
116 |
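Plans are serialized with the `ModelConfig` message defined in `proto/deepplan.proto`, so you can also peek inside a generated plan from Python. Below is a minimal sketch of ours; the file name and layout under `plan_repo` are assumptions here, so point it at whatever file `plan.py` actually wrote for your model.

```python
# Hypothetical plan inspection; adjust the path to the file plan.py produced.
import sys
sys.path.append("proto")  # make the generated deepplan_pb2 module importable
import deepplan_pb2

config = deepplan_pb2.ModelConfig()
with open("plan_repo/resnet50.plan", "rb") as f:  # assumed file name
    config.ParseFromString(f.read())

print("model:", config.model_name)
for plan in config.plans:
    # load_layers holds the indices of layers chosen for loading
    # (the "O" entries in the plan tables under plans/V100)
    print("plan type:", plan.plan_type, "load_layers:", list(plan.load_layers)[:8], "...")
```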
117 | ## 4. Run benchmarks
118 | Once DeepPlan generates the execution plan for a given model, you can run model inference with the DeepPlan engine through the commands below, from DeepPlan's home directory.
119 | Here, we have an example for ResNet50. In this section, we describe how to run the five different execution methods
120 | explained in our paper: Baseline (on-demand), PipeSwitch, DeepPlan (DHA), DeepPlan (PT), and DeepPlan (PT+DHA).
121 |
122 | Before running model inference, you have to set the `PLAN_REPO` environment variable, which specifies where plans are stored.
123 |
124 | ```bash
125 | # The plan repository should be the same path as the one specified when creating the plan above
126 | $ export PLAN_REPO=$DEEPPLAN_HOME/plan_repo
127 | $ cd $DEEPPLAN_HOME
128 | ```
129 |
130 | * Baseline (on-demand)
131 |
132 | ```bash
133 | $ ./build/benchmark -m resnet50 -e demand
134 | ```
135 | You should see output similar to the following:
136 | ```bash
137 | Benchmarking Inference resnet50
138 | Average Latency : 17.7038 ms
139 | ```
140 |
141 | * PipeSwitch (Bai et al., OSDI 2020)
142 |
143 | ```bash
144 | $ ./build/benchmark -m resnet50 -e pipeline
145 | ```
146 |
147 | You should see output similar to the following:
148 | ```bash
149 | Benchmarking Inference resnet50
150 | Average Latency : 11.981 ms
151 | ```
152 |
153 | * DeepPlan (DHA)
154 |
155 | ```bash
156 | $ ./build/benchmark -m resnet50 -e deepplan
157 | ```
158 | You should see output similar to the following:
159 | ```bash
160 | Benchmarking Inference resnet50
161 | Average Latency : 11.2345 ms
162 | ```
163 |
164 | * DeepPlan (PT)
165 |
166 | ```bash
167 | $ ./build/benchmark -m resnet50 -e pipeline -d 0 2 # the -d option lists the devices used for loading
168 | ```
169 | You should see output similar to the following:
170 | ```bash
171 | Benchmarking Inference resnet50
172 | Average Latency : 9.39064 ms
173 | ```
174 |
175 | * DeepPlan (PT+DHA)
176 |
177 | ```bash
178 | $ ./build/benchmark -m resnet50 -e deepplan -d 0 2 # the -d option lists the devices used for loading
179 | ```
180 | You should see output similar to the following:
181 | ```bash
182 | Benchmarking Inference resnet50
183 | Average Latency : 8.36423 ms
184 | ```
185 |
186 | ## 5. Reproduce results in the paper
187 | To reproduce the experimental results presented in the paper, we need the model plans. To simplify creating them,
188 | we provide the `create_all_plans.sh` shell script, which generates all model plans used in the experiments.
189 |
190 | ```bash
191 | $ cd $DEEPPLAN_HOME/scripts
192 | $ mkdir -p $DEEPPLAN_HOME/plan_repo/V100
193 | $ export PLAN_REPO=$DEEPPLAN_HOME/plan_repo/V100
194 | $ source create_all_plans.sh # the plan repository is created in the PLAN_REPO path
195 | ```
196 | For all shell scripts, the `PLAN_REPO` variable, which points to the plan repository, must be set.
197 | We provide experiment scripts for Figures 10, 12, 13, and 14.
198 | Run `run.sh` in the corresponding `$DEEPPLAN_HOME/scripts/fig#` directory and the results will be logged in
199 | the same directory. If the Matplotlib library is installed on your machine,
200 | the graph will be drawn in `fig#.pdf`.
201 |
202 | ### 5.1 Figure 10: Performance comparison of DeepPlan and previous studies
203 | We evaluate the inference latency with a single batch for On-Demand, PipeSwitch, DeepPlan (DHA),
204 | DeepPlan (PT), and DeepPlan (PT+DHA). The results are normalized to Baseline (on-demand).
205 |
206 | ```bash
207 | $ cd $DEEPPLAN_HOME/scripts/fig10
208 | $ source run.sh
209 | ```
210 |
211 | ### 5.2 Figure 12: 99% latency, goodput, and cold-start rate for BERT-Base (Synthetic workloads)
212 | We perform this experiment on a four-GPU server in an AWS instance.
213 | This experiment measures the 99% latency, goodput, and cold-start rate for BERT-Base
214 | while increasing the number of model instances concurrently running on the GPUs.
215 |
216 | ```bash
217 | $ cd $DEEPPLAN_HOME/scripts/fig12
218 | $ source run.sh
219 | ```
220 |
221 | ### 5.3 Figure 13: 99% latency for BERT-Large and GPT2 (Synthetic workloads)
222 | This experiment is similar to the experiment above (Figure 12), except that
223 | the evaluated models are BERT-Large and GPT2 instead of BERT-Base.
224 | ```bash
225 | $ cd $DEEPPLAN_HOME/scripts/fig13
226 | $ source run.sh
227 | ```
228 |
229 | ### 5.4 Figure 14: Performance of real-world trace (Real-world workloads)
230 | This experiment is also performed on a four-GPU server in an AWS instance.
231 | The above experiments (Figures 12 and 13) run with synthetic traces, but
232 | this experiment runs with a real-world trace derived from Microsoft Azure Functions.
233 | In this experiment, we evaluate three workloads of three hours each (nine hours in total).
234 |
235 | To run this experiment, you should prepare the Azure trace dataset:
236 | https://github.com/Azure/AzurePublicDataset/blob/master/AzureFunctionsDataset2019.md
237 |
238 | The following command downloads the Azure trace dataset.
239 | ```bash
240 | $ cd $DEEPPLAN_HOME/scripts
241 | $ source download_azure_trace_dataset.sh
242 |
243 | # For the client to locate the trace files, the `AZURE_TRACE_DIR` variable must be set
244 | $ export AZURE_TRACE_DIR=$DEEPPLAN_HOME/scripts/azure-functions
245 | ```
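If you are curious what the client will replay, the downloaded invocation counts are plain CSV and easy to inspect from Python. A small sketch of ours follows; it assumes the standard file names from AzureFunctionsDataset2019 (e.g. `invocations_per_function_md.anon.d01.csv`) and that the download script keeps them in `AZURE_TRACE_DIR`, so adjust the name if the script stores them differently.

```python
# Peek at day 1 of the Azure Functions trace (file name assumed from the public dataset).
import csv
import os

path = os.path.join(os.environ["AZURE_TRACE_DIR"],
                    "invocations_per_function_md.anon.d01.csv")
with open(path, newline="") as f:
    rows = list(csv.DictReader(f))

# Columns "1".."1440" hold per-minute invocation counts for one day.
total = sum(int(row[str(minute)]) for row in rows for minute in range(1, 1441))
print(f"{len(rows)} functions, {total} invocations on day 1")
```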
246 |
247 | ```bash
248 | $ cd $DEEPPLAN_HOME/scripts/fig14
249 | $ source run.sh
250 | ```
251 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from . import vision
3 | from transformers import AutoModel
4 | from enum import Enum
5 | from typing import Union, List
6 |
7 | class ModelType(Enum):
8 |     CNN = 0
9 |     TRANSFORMER = 1
10 |
11 | class ModelConfig:
12 |     def __init__(
13 |         self,
14 |         model_name: str,
15 |         model_type: Union[str, ModelType],
16 |         input_shape: List[int],
17 |         generate_input_func,
18 |         max_num: int = -1,
19 |         num_layers: int = 0,
20 |     ):
21 |         self.model_name = model_name
22 |
23 |         if model_type in ("CNN", ModelType.CNN):
24 |             self.model_type = ModelType.CNN
25 |         elif model_type in ("Transformer", ModelType.TRANSFORMER):
26 |             self.model_type = ModelType.TRANSFORMER
27 |
28 |         self.input_shape = input_shape
29 |         self.generate_input_func = generate_input_func
30 |         self.max_num = max_num
31 |         self.num_layers = num_layers
32 |
33 |
34 |     def generateInput(self, batch_size):
35 |         t = self.generate_input_func(batch_size, self.input_shape, self.max_num)
36 |         return t
37 |
38 |     def generateModel(self):
39 |         model = None
40 |         if self.model_type == ModelType.CNN:
41 |             model = getattr(vision, self.model_name)()
42 |         elif self.model_type == ModelType.TRANSFORMER:
43 |             model = AutoModel.from_pretrained(self.model_name, torchscript=True)
44 |         model.num_layers = self.num_layers
45 |         model.is_parallel = False
46 |
47 |         return model
48 |
49 |
50 | def generate_CNN_input(batch_size, shape, max_num=0):
51 |     input_shape = [batch_size] + shape
52 |     x = torch.randn(input_shape)
53 |     return x
54 |
55 | def generate_TRS_input(batch_size, shape, max_num):
56 |     input_shape = [batch_size] + shape
57 |     x = torch.randint(max_num, input_shape, dtype=torch.long)
58 |     return x
59 |
60 | def generate_T5_input(batch_size, shape, max_num):
61 |     x = generate_TRS_input(batch_size, shape, max_num)
62 |     return {"input_ids": x, "decoder_input_ids": x}
63 |
64 |
65 | model_list = {
66 |     'resnet50': ModelConfig('resnet50', ModelType.CNN, [3, 224, 224], generate_CNN_input),
67 |     'resnet101': ModelConfig('resnet101', ModelType.CNN, [3, 224, 224], generate_CNN_input),
68 |     'resnext50': ModelConfig('resnext50_32x4d', ModelType.CNN, [3, 224, 224], generate_CNN_input),
69 |     'resnext101': ModelConfig('resnext101_32x8d', ModelType.CNN, [3, 224, 224], generate_CNN_input),
70 |     'wide_resnet50': ModelConfig('wide_resnet50_2', ModelType.CNN, [3, 224, 224], generate_CNN_input),
71 |     'wide_resnet101': ModelConfig('wide_resnet101_2', ModelType.CNN, [3, 224, 224], generate_CNN_input),
72 |     'bert_base': ModelConfig('bert-base-uncased', ModelType.TRANSFORMER, [384], generate_TRS_input, max_num=30522,
num_layers=12),
73 |     'bert_large': ModelConfig('bert-large-uncased', ModelType.TRANSFORMER, [384], generate_TRS_input, max_num=30522, num_layers=24),
74 |     'gpt2': ModelConfig('gpt2', ModelType.TRANSFORMER, [1024], generate_TRS_input, max_num=50257, num_layers=12),
75 |     'gpt2_384': ModelConfig('gpt2', ModelType.TRANSFORMER, [384], generate_TRS_input, max_num=50257, num_layers=12),
76 |     'gpt2_medium': ModelConfig('gpt2-medium', ModelType.TRANSFORMER, [1024], generate_TRS_input, max_num=50257, num_layers=24),
77 |     'bart_base': ModelConfig('facebook/bart-base', ModelType.TRANSFORMER, [384], generate_TRS_input, max_num=50265),
78 |     't5_base': ModelConfig('t5-base', ModelType.TRANSFORMER, [300], generate_T5_input, max_num=32129, num_layers=12),
79 |     't5_small': ModelConfig('t5-small', ModelType.TRANSFORMER, [300], generate_T5_input, max_num=32129, num_layers=6),
80 |     't5_large': ModelConfig('t5-large', ModelType.TRANSFORMER, [300], generate_T5_input, max_num=32129, num_layers=24),
81 |     'roberta_base': ModelConfig('roberta-base', ModelType.TRANSFORMER, [384], generate_TRS_input, max_num=50265, num_layers=12),
82 |     'roberta_large': ModelConfig('roberta-large', ModelType.TRANSFORMER, [384], generate_TRS_input, max_num=50265, num_layers=24),
83 | }
84 |
85 | def import_model(model_name):
86 |     model = None
87 |
88 |     if model_name in model_list:
89 |         model = model_list[model_name].generateModel()
90 |     else:
91 |         raise RuntimeError(f"[Error] Model '{model_name}' not found")
92 |
93 |     return model
94 |
95 | def import_data(model_name, batch_size):
96 |     data = None
97 |
98 |     if model_name in model_list:
99 |         data = model_list[model_name].generateInput(batch_size)
100 |     else:
101 |         raise RuntimeError(f"[Error] Model '{model_name}' not found")
102 |
103 |     return data
104 |
--------------------------------------------------------------------------------
/models/vision/__init__.py:
--------------------------------------------------------------------------------
1 | from .resnet import *
2 |
--------------------------------------------------------------------------------
/models/vision/utils.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from torch.hub import load_state_dict_from_url
3 | except ImportError:
4 |     from torch.utils.model_zoo import load_url as load_state_dict_from_url
5 |
--------------------------------------------------------------------------------
/plans/V100/bert_base/README.md:
--------------------------------------------------------------------------------
1 | |Layer |Initial approach |DeepPlan (DHA) 2 | |-------------------------|-------------------------|------------------------- 3 | |0-Embedding (89.420 MB) |X (direct-host-access) |X (direct-host-access) 4 | |1-Embedding (1.500 MB) |X (direct-host-access) |X (direct-host-access) 5 | |2-Embedding (0.006 MB) |X (direct-host-access) |O 6 | |3-LayerNorm (0.006 MB) |O |O 7 | |4-Linear (2.253 MB) |O |O 8 | |5-Linear (2.253 MB) |O |O 9 | |6-Linear (2.253 MB) |O |O 10 | |7-Linear (2.253 MB) |O |O 11 | |8-LayerNorm (0.006 MB) |O |O 12 | |9-Linear (9.012 MB) |O |O 13 | |10-Linear (9.003 MB) |O |O 14 | |11-LayerNorm (0.006 MB) |O |O 15 | |12-Linear (2.253 MB) |O |O 16 | |13-Linear (2.253 MB) |O |O 17 | |14-Linear (2.253 MB) |O |O 18 | |15-Linear (2.253 MB) |O |O 19 | |16-LayerNorm (0.006 MB) |O |O 20 | |17-Linear (9.012 MB) |O |O 21 | |18-Linear (9.003 MB) |O |O 22 | |19-LayerNorm (0.006 MB) |O |O 23 | |20-Linear (2.253 MB) |O |O 24 | |21-Linear (2.253 MB) |O |O 25 | |22-Linear (2.253 MB) |O |O 26
| |23-Linear (2.253 MB) |O |O 27 | |24-LayerNorm (0.006 MB) |O |O 28 | |25-Linear (9.012 MB) |O |O 29 | |26-Linear (9.003 MB) |O |O 30 | |27-LayerNorm (0.006 MB) |O |O 31 | |28-Linear (2.253 MB) |O |O 32 | |29-Linear (2.253 MB) |O |O 33 | |30-Linear (2.253 MB) |O |O 34 | |31-Linear (2.253 MB) |O |O 35 | |32-LayerNorm (0.006 MB) |O |O 36 | |33-Linear (9.012 MB) |O |O 37 | |34-Linear (9.003 MB) |O |O 38 | |35-LayerNorm (0.006 MB) |O |O 39 | |36-Linear (2.253 MB) |O |O 40 | |37-Linear (2.253 MB) |O |O 41 | |38-Linear (2.253 MB) |O |O 42 | |39-Linear (2.253 MB) |O |O 43 | |40-LayerNorm (0.006 MB) |O |O 44 | |41-Linear (9.012 MB) |O |O 45 | |42-Linear (9.003 MB) |O |O 46 | |43-LayerNorm (0.006 MB) |O |O 47 | |44-Linear (2.253 MB) |O |O 48 | |45-Linear (2.253 MB) |O |O 49 | |46-Linear (2.253 MB) |O |O 50 | |47-Linear (2.253 MB) |O |O 51 | |48-LayerNorm (0.006 MB) |O |O 52 | |49-Linear (9.012 MB) |O |O 53 | |50-Linear (9.003 MB) |O |O 54 | |51-LayerNorm (0.006 MB) |O |O 55 | |52-Linear (2.253 MB) |O |O 56 | |53-Linear (2.253 MB) |O |O 57 | |54-Linear (2.253 MB) |O |O 58 | |55-Linear (2.253 MB) |O |O 59 | |56-LayerNorm (0.006 MB) |O |O 60 | |57-Linear (9.012 MB) |O |O 61 | |58-Linear (9.003 MB) |O |O 62 | |59-LayerNorm (0.006 MB) |O |O 63 | |60-Linear (2.253 MB) |O |O 64 | |61-Linear (2.253 MB) |O |O 65 | |62-Linear (2.253 MB) |O |O 66 | |63-Linear (2.253 MB) |O |O 67 | |64-LayerNorm (0.006 MB) |O |O 68 | |65-Linear (9.012 MB) |O |O 69 | |66-Linear (9.003 MB) |O |O 70 | |67-LayerNorm (0.006 MB) |O |O 71 | |68-Linear (2.253 MB) |O |O 72 | |69-Linear (2.253 MB) |O |O 73 | |70-Linear (2.253 MB) |O |O 74 | |71-Linear (2.253 MB) |O |O 75 | |72-LayerNorm (0.006 MB) |O |O 76 | |73-Linear (9.012 MB) |O |O 77 | |74-Linear (9.003 MB) |O |O 78 | |75-LayerNorm (0.006 MB) |O |O 79 | |76-Linear (2.253 MB) |O |O 80 | |77-Linear (2.253 MB) |O |O 81 | |78-Linear (2.253 MB) |O |O 82 | |79-Linear (2.253 MB) |O |O 83 | |80-LayerNorm (0.006 MB) |O |O 84 | |81-Linear (9.012 MB) |O |O 85 | |82-Linear (9.003 MB) |O |O 86 | |83-LayerNorm (0.006 MB) |O |O 87 | |84-Linear (2.253 MB) |O |O 88 | |85-Linear (2.253 MB) |O |O 89 | |86-Linear (2.253 MB) |O |O 90 | |87-Linear (2.253 MB) |O |O 91 | |88-LayerNorm (0.006 MB) |O |O 92 | |89-Linear (9.012 MB) |O |O 93 | |90-Linear (9.003 MB) |O |O 94 | |91-LayerNorm (0.006 MB) |O |O 95 | |92-Linear (2.253 MB) |O |O 96 | |93-Linear (2.253 MB) |O |O 97 | |94-Linear (2.253 MB) |O |O 98 | |95-Linear (2.253 MB) |O |O 99 | |96-LayerNorm (0.006 MB) |O |O 100 | |97-Linear (9.012 MB) |O |O 101 | |98-Linear (9.003 MB) |O |O 102 | |99-LayerNorm (0.006 MB) |O |O 103 | |100-Linear (2.253 MB) |X (direct-host-access) |O 104 | |101-Tanh (0.000 MB) |X |X 105 | -------------------------------------------------------------------------------- /plans/V100/bert_large/README.md: -------------------------------------------------------------------------------- 1 | |Layer |Initial approach |DeepPlan (DHA) 2 | |-------------------------|-------------------------|------------------------- 3 | |0-Embedding (119.227 MB) |X (direct-host-access) |X (direct-host-access) 4 | |1-Embedding (2.000 MB) |X (direct-host-access) |O 5 | |2-Embedding (0.008 MB) |X (direct-host-access) |O 6 | |3-LayerNorm (0.008 MB) |O |O 7 | |4-Linear (4.004 MB) |O |O 8 | |5-Linear (4.004 MB) |O |O 9 | |6-Linear (4.004 MB) |O |O 10 | |7-Linear (4.004 MB) |O |O 11 | |8-LayerNorm (0.008 MB) |O |O 12 | |9-Linear (16.016 MB) |O |O 13 | |10-Linear (16.004 MB) |O |O 14 | |11-LayerNorm (0.008 MB) |O |O 15 | |12-Linear (4.004 MB) |O |O 16 | |13-Linear 
(4.004 MB) |O |O 17 | |14-Linear (4.004 MB) |O |O 18 | |15-Linear (4.004 MB) |O |O 19 | |16-LayerNorm (0.008 MB) |O |O 20 | |17-Linear (16.016 MB) |O |O 21 | |18-Linear (16.004 MB) |O |O 22 | |19-LayerNorm (0.008 MB) |O |O 23 | |20-Linear (4.004 MB) |O |O 24 | |21-Linear (4.004 MB) |O |O 25 | |22-Linear (4.004 MB) |O |O 26 | |23-Linear (4.004 MB) |O |O 27 | |24-LayerNorm (0.008 MB) |O |O 28 | |25-Linear (16.016 MB) |O |O 29 | |26-Linear (16.004 MB) |O |O 30 | |27-LayerNorm (0.008 MB) |O |O 31 | |28-Linear (4.004 MB) |O |O 32 | |29-Linear (4.004 MB) |O |O 33 | |30-Linear (4.004 MB) |O |O 34 | |31-Linear (4.004 MB) |O |O 35 | |32-LayerNorm (0.008 MB) |O |O 36 | |33-Linear (16.016 MB) |O |O 37 | |34-Linear (16.004 MB) |O |O 38 | |35-LayerNorm (0.008 MB) |O |O 39 | |36-Linear (4.004 MB) |O |O 40 | |37-Linear (4.004 MB) |O |O 41 | |38-Linear (4.004 MB) |O |O 42 | |39-Linear (4.004 MB) |O |O 43 | |40-LayerNorm (0.008 MB) |O |O 44 | |41-Linear (16.016 MB) |O |O 45 | |42-Linear (16.004 MB) |O |O 46 | |43-LayerNorm (0.008 MB) |O |O 47 | |44-Linear (4.004 MB) |O |O 48 | |45-Linear (4.004 MB) |O |O 49 | |46-Linear (4.004 MB) |O |O 50 | |47-Linear (4.004 MB) |O |O 51 | |48-LayerNorm (0.008 MB) |O |O 52 | |49-Linear (16.016 MB) |O |O 53 | |50-Linear (16.004 MB) |O |O 54 | |51-LayerNorm (0.008 MB) |O |O 55 | |52-Linear (4.004 MB) |O |O 56 | |53-Linear (4.004 MB) |O |O 57 | |54-Linear (4.004 MB) |O |O 58 | |55-Linear (4.004 MB) |O |O 59 | |56-LayerNorm (0.008 MB) |O |O 60 | |57-Linear (16.016 MB) |O |O 61 | |58-Linear (16.004 MB) |O |O 62 | |59-LayerNorm (0.008 MB) |O |O 63 | |60-Linear (4.004 MB) |O |O 64 | |61-Linear (4.004 MB) |O |O 65 | |62-Linear (4.004 MB) |O |O 66 | |63-Linear (4.004 MB) |O |O 67 | |64-LayerNorm (0.008 MB) |O |O 68 | |65-Linear (16.016 MB) |O |O 69 | |66-Linear (16.004 MB) |O |O 70 | |67-LayerNorm (0.008 MB) |O |O 71 | |68-Linear (4.004 MB) |O |O 72 | |69-Linear (4.004 MB) |O |O 73 | |70-Linear (4.004 MB) |O |O 74 | |71-Linear (4.004 MB) |O |O 75 | |72-LayerNorm (0.008 MB) |O |O 76 | |73-Linear (16.016 MB) |O |O 77 | |74-Linear (16.004 MB) |O |O 78 | |75-LayerNorm (0.008 MB) |O |O 79 | |76-Linear (4.004 MB) |O |O 80 | |77-Linear (4.004 MB) |O |O 81 | |78-Linear (4.004 MB) |O |O 82 | |79-Linear (4.004 MB) |O |O 83 | |80-LayerNorm (0.008 MB) |O |O 84 | |81-Linear (16.016 MB) |O |O 85 | |82-Linear (16.004 MB) |O |O 86 | |83-LayerNorm (0.008 MB) |O |O 87 | |84-Linear (4.004 MB) |O |O 88 | |85-Linear (4.004 MB) |O |O 89 | |86-Linear (4.004 MB) |O |O 90 | |87-Linear (4.004 MB) |O |O 91 | |88-LayerNorm (0.008 MB) |O |O 92 | |89-Linear (16.016 MB) |O |O 93 | |90-Linear (16.004 MB) |O |O 94 | |91-LayerNorm (0.008 MB) |O |O 95 | |92-Linear (4.004 MB) |O |O 96 | |93-Linear (4.004 MB) |O |O 97 | |94-Linear (4.004 MB) |O |O 98 | |95-Linear (4.004 MB) |O |O 99 | |96-LayerNorm (0.008 MB) |O |O 100 | |97-Linear (16.016 MB) |O |O 101 | |98-Linear (16.004 MB) |O |O 102 | |99-LayerNorm (0.008 MB) |O |O 103 | |100-Linear (4.004 MB) |O |O 104 | |101-Linear (4.004 MB) |O |O 105 | |102-Linear (4.004 MB) |O |O 106 | |103-Linear (4.004 MB) |O |O 107 | |104-LayerNorm (0.008 MB) |O |O 108 | |105-Linear (16.016 MB) |O |O 109 | |106-Linear (16.004 MB) |O |O 110 | |107-LayerNorm (0.008 MB) |O |O 111 | |108-Linear (4.004 MB) |O |O 112 | |109-Linear (4.004 MB) |O |O 113 | |110-Linear (4.004 MB) |O |O 114 | |111-Linear (4.004 MB) |O |O 115 | |112-LayerNorm (0.008 MB) |O |O 116 | |113-Linear (16.016 MB) |O |O 117 | |114-Linear (16.004 MB) |O |O 118 | |115-LayerNorm (0.008 MB) |O |O 119 | |116-Linear (4.004 MB) |O 
|O 120 | |117-Linear (4.004 MB) |O |O 121 | |118-Linear (4.004 MB) |O |O 122 | |119-Linear (4.004 MB) |O |O 123 | |120-LayerNorm (0.008 MB) |O |O 124 | |121-Linear (16.016 MB) |O |O 125 | |122-Linear (16.004 MB) |O |O 126 | |123-LayerNorm (0.008 MB) |O |O 127 | |124-Linear (4.004 MB) |O |O 128 | |125-Linear (4.004 MB) |O |O 129 | |126-Linear (4.004 MB) |O |O 130 | |127-Linear (4.004 MB) |O |O 131 | |128-LayerNorm (0.008 MB) |O |O 132 | |129-Linear (16.016 MB) |O |O 133 | |130-Linear (16.004 MB) |O |O 134 | |131-LayerNorm (0.008 MB) |O |O 135 | |132-Linear (4.004 MB) |O |O 136 | |133-Linear (4.004 MB) |O |O 137 | |134-Linear (4.004 MB) |O |O 138 | |135-Linear (4.004 MB) |O |O 139 | |136-LayerNorm (0.008 MB) |O |O 140 | |137-Linear (16.016 MB) |O |O 141 | |138-Linear (16.004 MB) |O |O 142 | |139-LayerNorm (0.008 MB) |O |O 143 | |140-Linear (4.004 MB) |O |O 144 | |141-Linear (4.004 MB) |O |O 145 | |142-Linear (4.004 MB) |O |O 146 | |143-Linear (4.004 MB) |O |O 147 | |144-LayerNorm (0.008 MB) |O |O 148 | |145-Linear (16.016 MB) |O |O 149 | |146-Linear (16.004 MB) |O |O 150 | |147-LayerNorm (0.008 MB) |O |O 151 | |148-Linear (4.004 MB) |O |O 152 | |149-Linear (4.004 MB) |O |O 153 | |150-Linear (4.004 MB) |O |O 154 | |151-Linear (4.004 MB) |O |O 155 | |152-LayerNorm (0.008 MB) |O |O 156 | |153-Linear (16.016 MB) |O |O 157 | |154-Linear (16.004 MB) |O |O 158 | |155-LayerNorm (0.008 MB) |O |O 159 | |156-Linear (4.004 MB) |O |O 160 | |157-Linear (4.004 MB) |O |O 161 | |158-Linear (4.004 MB) |O |O 162 | |159-Linear (4.004 MB) |O |O 163 | |160-LayerNorm (0.008 MB) |O |O 164 | |161-Linear (16.016 MB) |O |O 165 | |162-Linear (16.004 MB) |O |O 166 | |163-LayerNorm (0.008 MB) |O |O 167 | |164-Linear (4.004 MB) |O |O 168 | |165-Linear (4.004 MB) |O |O 169 | |166-Linear (4.004 MB) |O |O 170 | |167-Linear (4.004 MB) |O |O 171 | |168-LayerNorm (0.008 MB) |O |O 172 | |169-Linear (16.016 MB) |O |O 173 | |170-Linear (16.004 MB) |O |O 174 | |171-LayerNorm (0.008 MB) |O |O 175 | |172-Linear (4.004 MB) |O |O 176 | |173-Linear (4.004 MB) |O |O 177 | |174-Linear (4.004 MB) |O |O 178 | |175-Linear (4.004 MB) |O |O 179 | |176-LayerNorm (0.008 MB) |O |O 180 | |177-Linear (16.016 MB) |O |O 181 | |178-Linear (16.004 MB) |O |O 182 | |179-LayerNorm (0.008 MB) |O |O 183 | |180-Linear (4.004 MB) |O |O 184 | |181-Linear (4.004 MB) |O |O 185 | |182-Linear (4.004 MB) |O |O 186 | |183-Linear (4.004 MB) |O |O 187 | |184-LayerNorm (0.008 MB) |O |O 188 | |185-Linear (16.016 MB) |O |O 189 | |186-Linear (16.004 MB) |O |O 190 | |187-LayerNorm (0.008 MB) |O |O 191 | |188-Linear (4.004 MB) |O |O 192 | |189-Linear (4.004 MB) |O |O 193 | |190-Linear (4.004 MB) |O |O 194 | |191-Linear (4.004 MB) |O |O 195 | |192-LayerNorm (0.008 MB) |O |O 196 | |193-Linear (16.016 MB) |O |O 197 | |194-Linear (16.004 MB) |O |O 198 | |195-LayerNorm (0.008 MB) |O |O 199 | |196-Linear (4.004 MB) |O |O 200 | |197-Tanh (0.000 MB) |X |X 201 | -------------------------------------------------------------------------------- /plans/V100/gpt2/README.md: -------------------------------------------------------------------------------- 1 | |Layer |Initial approach |DeepPlan (DHA) 2 | |-------------------------|-------------------------|------------------------- 3 | |0-Embedding (147.237 MB) |X (direct-host-access) |X (direct-host-access) 4 | |1-Embedding (3.000 MB) |X (direct-host-access) |O 5 | |2-LayerNorm (0.006 MB) |O |O 6 | |3-Conv1D (6.759 MB) |O |O 7 | |4-Conv1D (2.253 MB) |O |O 8 | |5-LayerNorm (0.006 MB) |O |O 9 | |6-Conv1D (9.012 MB) |O |O 10 | |7-Conv1D 
(9.003 MB) |O |O 11 | |8-LayerNorm (0.006 MB) |O |O 12 | |9-Conv1D (6.759 MB) |O |O 13 | |10-Conv1D (2.253 MB) |O |O 14 | |11-LayerNorm (0.006 MB) |O |O 15 | |12-Conv1D (9.012 MB) |O |O 16 | |13-Conv1D (9.003 MB) |O |O 17 | |14-LayerNorm (0.006 MB) |O |O 18 | |15-Conv1D (6.759 MB) |O |O 19 | |16-Conv1D (2.253 MB) |O |O 20 | |17-LayerNorm (0.006 MB) |O |O 21 | |18-Conv1D (9.012 MB) |O |O 22 | |19-Conv1D (9.003 MB) |O |O 23 | |20-LayerNorm (0.006 MB) |O |O 24 | |21-Conv1D (6.759 MB) |O |O 25 | |22-Conv1D (2.253 MB) |O |O 26 | |23-LayerNorm (0.006 MB) |O |O 27 | |24-Conv1D (9.012 MB) |O |O 28 | |25-Conv1D (9.003 MB) |O |O 29 | |26-LayerNorm (0.006 MB) |O |O 30 | |27-Conv1D (6.759 MB) |O |O 31 | |28-Conv1D (2.253 MB) |O |O 32 | |29-LayerNorm (0.006 MB) |O |O 33 | |30-Conv1D (9.012 MB) |O |O 34 | |31-Conv1D (9.003 MB) |O |O 35 | |32-LayerNorm (0.006 MB) |O |O 36 | |33-Conv1D (6.759 MB) |O |O 37 | |34-Conv1D (2.253 MB) |O |O 38 | |35-LayerNorm (0.006 MB) |O |O 39 | |36-Conv1D (9.012 MB) |O |O 40 | |37-Conv1D (9.003 MB) |O |O 41 | |38-LayerNorm (0.006 MB) |O |O 42 | |39-Conv1D (6.759 MB) |O |O 43 | |40-Conv1D (2.253 MB) |O |O 44 | |41-LayerNorm (0.006 MB) |O |O 45 | |42-Conv1D (9.012 MB) |O |O 46 | |43-Conv1D (9.003 MB) |O |O 47 | |44-LayerNorm (0.006 MB) |O |O 48 | |45-Conv1D (6.759 MB) |O |O 49 | |46-Conv1D (2.253 MB) |O |O 50 | |47-LayerNorm (0.006 MB) |O |O 51 | |48-Conv1D (9.012 MB) |O |O 52 | |49-Conv1D (9.003 MB) |O |O 53 | |50-LayerNorm (0.006 MB) |O |O 54 | |51-Conv1D (6.759 MB) |O |O 55 | |52-Conv1D (2.253 MB) |O |O 56 | |53-LayerNorm (0.006 MB) |O |O 57 | |54-Conv1D (9.012 MB) |O |O 58 | |55-Conv1D (9.003 MB) |O |O 59 | |56-LayerNorm (0.006 MB) |O |O 60 | |57-Conv1D (6.759 MB) |O |O 61 | |58-Conv1D (2.253 MB) |O |O 62 | |59-LayerNorm (0.006 MB) |O |O 63 | |60-Conv1D (9.012 MB) |O |O 64 | |61-Conv1D (9.003 MB) |O |O 65 | |62-LayerNorm (0.006 MB) |O |O 66 | |63-Conv1D (6.759 MB) |O |O 67 | |64-Conv1D (2.253 MB) |O |O 68 | |65-LayerNorm (0.006 MB) |O |O 69 | |66-Conv1D (9.012 MB) |O |O 70 | |67-Conv1D (9.003 MB) |O |O 71 | |68-LayerNorm (0.006 MB) |O |O 72 | |69-Conv1D (6.759 MB) |O |O 73 | |70-Conv1D (2.253 MB) |O |O 74 | |71-LayerNorm (0.006 MB) |O |O 75 | |72-Conv1D (9.012 MB) |O |O 76 | |73-Conv1D (9.003 MB) |O |O 77 | |74-LayerNorm (0.006 MB) |O |O 78 | -------------------------------------------------------------------------------- /plans/V100/gpt2_medium/README.md: -------------------------------------------------------------------------------- 1 | |Layer |Initial approach |DeepPlan (DHA) 2 | |-------------------------|-------------------------|------------------------- 3 | |0-Embedding (196.316 MB) |X (direct-host-access) |X (direct-host-access) 4 | |1-Embedding (4.000 MB) |X (direct-host-access) |O 5 | |2-LayerNorm (0.008 MB) |O |O 6 | |3-Conv1D (12.012 MB) |O |O 7 | |4-Conv1D (4.004 MB) |O |O 8 | |5-LayerNorm (0.008 MB) |O |O 9 | |6-Conv1D (16.016 MB) |O |O 10 | |7-Conv1D (16.004 MB) |O |O 11 | |8-LayerNorm (0.008 MB) |O |O 12 | |9-Conv1D (12.012 MB) |O |O 13 | |10-Conv1D (4.004 MB) |O |O 14 | |11-LayerNorm (0.008 MB) |O |O 15 | |12-Conv1D (16.016 MB) |O |O 16 | |13-Conv1D (16.004 MB) |O |O 17 | |14-LayerNorm (0.008 MB) |O |O 18 | |15-Conv1D (12.012 MB) |O |O 19 | |16-Conv1D (4.004 MB) |O |O 20 | |17-LayerNorm (0.008 MB) |O |O 21 | |18-Conv1D (16.016 MB) |O |O 22 | |19-Conv1D (16.004 MB) |O |O 23 | |20-LayerNorm (0.008 MB) |O |O 24 | |21-Conv1D (12.012 MB) |O |O 25 | |22-Conv1D (4.004 MB) |O |O 26 | |23-LayerNorm (0.008 MB) |O |O 27 | |24-Conv1D (16.016 MB) |O |O 28 | |25-Conv1D 
(16.004 MB) |O |O 29 | |26-LayerNorm (0.008 MB) |O |O 30 | |27-Conv1D (12.012 MB) |O |O 31 | |28-Conv1D (4.004 MB) |O |O 32 | |29-LayerNorm (0.008 MB) |O |O 33 | |30-Conv1D (16.016 MB) |O |O 34 | |31-Conv1D (16.004 MB) |O |O 35 | |32-LayerNorm (0.008 MB) |O |O 36 | |33-Conv1D (12.012 MB) |O |O 37 | |34-Conv1D (4.004 MB) |O |O 38 | |35-LayerNorm (0.008 MB) |O |O 39 | |36-Conv1D (16.016 MB) |O |O 40 | |37-Conv1D (16.004 MB) |O |O 41 | |38-LayerNorm (0.008 MB) |O |O 42 | |39-Conv1D (12.012 MB) |O |O 43 | |40-Conv1D (4.004 MB) |O |O 44 | |41-LayerNorm (0.008 MB) |O |O 45 | |42-Conv1D (16.016 MB) |O |O 46 | |43-Conv1D (16.004 MB) |O |O 47 | |44-LayerNorm (0.008 MB) |O |O 48 | |45-Conv1D (12.012 MB) |O |O 49 | |46-Conv1D (4.004 MB) |O |O 50 | |47-LayerNorm (0.008 MB) |O |O 51 | |48-Conv1D (16.016 MB) |O |O 52 | |49-Conv1D (16.004 MB) |O |O 53 | |50-LayerNorm (0.008 MB) |O |O 54 | |51-Conv1D (12.012 MB) |O |O 55 | |52-Conv1D (4.004 MB) |O |O 56 | |53-LayerNorm (0.008 MB) |O |O 57 | |54-Conv1D (16.016 MB) |O |O 58 | |55-Conv1D (16.004 MB) |O |O 59 | |56-LayerNorm (0.008 MB) |O |O 60 | |57-Conv1D (12.012 MB) |O |O 61 | |58-Conv1D (4.004 MB) |O |O 62 | |59-LayerNorm (0.008 MB) |O |O 63 | |60-Conv1D (16.016 MB) |O |O 64 | |61-Conv1D (16.004 MB) |O |O 65 | |62-LayerNorm (0.008 MB) |O |O 66 | |63-Conv1D (12.012 MB) |O |O 67 | |64-Conv1D (4.004 MB) |O |O 68 | |65-LayerNorm (0.008 MB) |O |O 69 | |66-Conv1D (16.016 MB) |O |O 70 | |67-Conv1D (16.004 MB) |O |O 71 | |68-LayerNorm (0.008 MB) |O |O 72 | |69-Conv1D (12.012 MB) |O |O 73 | |70-Conv1D (4.004 MB) |O |O 74 | |71-LayerNorm (0.008 MB) |O |O 75 | |72-Conv1D (16.016 MB) |O |O 76 | |73-Conv1D (16.004 MB) |O |O 77 | |74-LayerNorm (0.008 MB) |O |O 78 | |75-Conv1D (12.012 MB) |O |O 79 | |76-Conv1D (4.004 MB) |O |O 80 | |77-LayerNorm (0.008 MB) |O |O 81 | |78-Conv1D (16.016 MB) |O |O 82 | |79-Conv1D (16.004 MB) |O |O 83 | |80-LayerNorm (0.008 MB) |O |O 84 | |81-Conv1D (12.012 MB) |O |O 85 | |82-Conv1D (4.004 MB) |O |O 86 | |83-LayerNorm (0.008 MB) |O |O 87 | |84-Conv1D (16.016 MB) |O |O 88 | |85-Conv1D (16.004 MB) |O |O 89 | |86-LayerNorm (0.008 MB) |O |O 90 | |87-Conv1D (12.012 MB) |O |O 91 | |88-Conv1D (4.004 MB) |O |O 92 | |89-LayerNorm (0.008 MB) |O |O 93 | |90-Conv1D (16.016 MB) |O |O 94 | |91-Conv1D (16.004 MB) |O |O 95 | |92-LayerNorm (0.008 MB) |O |O 96 | |93-Conv1D (12.012 MB) |O |O 97 | |94-Conv1D (4.004 MB) |O |O 98 | |95-LayerNorm (0.008 MB) |O |O 99 | |96-Conv1D (16.016 MB) |O |O 100 | |97-Conv1D (16.004 MB) |O |O 101 | |98-LayerNorm (0.008 MB) |O |O 102 | |99-Conv1D (12.012 MB) |O |O 103 | |100-Conv1D (4.004 MB) |O |O 104 | |101-LayerNorm (0.008 MB) |O |O 105 | |102-Conv1D (16.016 MB) |O |O 106 | |103-Conv1D (16.004 MB) |O |O 107 | |104-LayerNorm (0.008 MB) |O |O 108 | |105-Conv1D (12.012 MB) |O |O 109 | |106-Conv1D (4.004 MB) |O |O 110 | |107-LayerNorm (0.008 MB) |O |O 111 | |108-Conv1D (16.016 MB) |O |O 112 | |109-Conv1D (16.004 MB) |O |O 113 | |110-LayerNorm (0.008 MB) |O |O 114 | |111-Conv1D (12.012 MB) |O |O 115 | |112-Conv1D (4.004 MB) |O |O 116 | |113-LayerNorm (0.008 MB) |O |O 117 | |114-Conv1D (16.016 MB) |O |O 118 | |115-Conv1D (16.004 MB) |O |O 119 | |116-LayerNorm (0.008 MB) |O |O 120 | |117-Conv1D (12.012 MB) |O |O 121 | |118-Conv1D (4.004 MB) |O |O 122 | |119-LayerNorm (0.008 MB) |O |O 123 | |120-Conv1D (16.016 MB) |O |O 124 | |121-Conv1D (16.004 MB) |O |O 125 | |122-LayerNorm (0.008 MB) |O |O 126 | |123-Conv1D (12.012 MB) |O |O 127 | |124-Conv1D (4.004 MB) |O |O 128 | |125-LayerNorm (0.008 MB) |O |O 129 | |126-Conv1D (16.016 MB) 
|O |O 130 | |127-Conv1D (16.004 MB) |O |O 131 | |128-LayerNorm (0.008 MB) |O |O 132 | |129-Conv1D (12.012 MB) |O |O 133 | |130-Conv1D (4.004 MB) |O |O 134 | |131-LayerNorm (0.008 MB) |O |O 135 | |132-Conv1D (16.016 MB) |O |O 136 | |133-Conv1D (16.004 MB) |O |O 137 | |134-LayerNorm (0.008 MB) |O |O 138 | |135-Conv1D (12.012 MB) |O |O 139 | |136-Conv1D (4.004 MB) |O |O 140 | |137-LayerNorm (0.008 MB) |O |O 141 | |138-Conv1D (16.016 MB) |O |O 142 | |139-Conv1D (16.004 MB) |O |O 143 | |140-LayerNorm (0.008 MB) |O |O 144 | |141-Conv1D (12.012 MB) |O |O 145 | |142-Conv1D (4.004 MB) |O |O 146 | |143-LayerNorm (0.008 MB) |O |O 147 | |144-Conv1D (16.016 MB) |O |O 148 | |145-Conv1D (16.004 MB) |O |O 149 | |146-LayerNorm (0.008 MB) |O |O 150 | -------------------------------------------------------------------------------- /plans/V100/resnet50/README.md: -------------------------------------------------------------------------------- 1 | |Layer |Initial approach |DeepPlan (DHA) 2 | |-------------------------|-------------------------|------------------------- 3 | |0-Conv2d (0.036 MB) |O |O 4 | |1-BatchNorm2d (0.001 MB) |X (direct-host-access) |X (direct-host-access) 5 | |2-ReLU (0.000 MB) |X |X 6 | |3-MaxPool2d (0.000 MB) |X |X 7 | |4-Conv2d (0.016 MB) |O |O 8 | |5-BatchNorm2d (0.001 MB) |X (direct-host-access) |X (direct-host-access) 9 | |6-ReLU (0.000 MB) |X |X 10 | |7-Conv2d (0.141 MB) |X (direct-host-access) |X (direct-host-access) 11 | |8-BatchNorm2d (0.001 MB) |X (direct-host-access) |X (direct-host-access) 12 | |9-ReLU (0.000 MB) |X |X 13 | |10-Conv2d (0.062 MB) |O |O 14 | |11-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 15 | |12-Conv2d (0.062 MB) |O |O 16 | |13-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 17 | |14-ReLU (0.000 MB) |X |X 18 | |15-Conv2d (0.062 MB) |O |O 19 | |16-BatchNorm2d (0.001 MB) |X (direct-host-access) |X (direct-host-access) 20 | |17-ReLU (0.000 MB) |X |X 21 | |18-Conv2d (0.141 MB) |X (direct-host-access) |X (direct-host-access) 22 | |19-BatchNorm2d (0.001 MB) |X (direct-host-access) |X (direct-host-access) 23 | |20-ReLU (0.000 MB) |X |X 24 | |21-Conv2d (0.062 MB) |O |O 25 | |22-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 26 | |23-ReLU (0.000 MB) |X |X 27 | |24-Conv2d (0.062 MB) |O |O 28 | |25-BatchNorm2d (0.001 MB) |X (direct-host-access) |X (direct-host-access) 29 | |26-ReLU (0.000 MB) |X |X 30 | |27-Conv2d (0.141 MB) |X (direct-host-access) |X (direct-host-access) 31 | |28-BatchNorm2d (0.001 MB) |X (direct-host-access) |X (direct-host-access) 32 | |29-ReLU (0.000 MB) |X |X 33 | |30-Conv2d (0.062 MB) |O |O 34 | |31-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 35 | |32-ReLU (0.000 MB) |X |X 36 | |33-Conv2d (0.125 MB) |O |O 37 | |34-BatchNorm2d (0.002 MB) |X (direct-host-access) |X (direct-host-access) 38 | |35-ReLU (0.000 MB) |X |X 39 | |36-Conv2d (0.562 MB) |X (direct-host-access) |X (direct-host-access) 40 | |37-BatchNorm2d (0.002 MB) |X (direct-host-access) |X (direct-host-access) 41 | |38-ReLU (0.000 MB) |X |X 42 | |39-Conv2d (0.250 MB) |O |O 43 | |40-BatchNorm2d (0.010 MB) |X (direct-host-access) |X (direct-host-access) 44 | |41-Conv2d (0.500 MB) |O |O 45 | |42-BatchNorm2d (0.010 MB) |X (direct-host-access) |X (direct-host-access) 46 | |43-ReLU (0.000 MB) |X |X 47 | |44-Conv2d (0.250 MB) |X (direct-host-access) |O 48 | |45-BatchNorm2d (0.002 MB) |X (direct-host-access) |X (direct-host-access) 49 | |46-ReLU (0.000 MB) |X |X 50 | |47-Conv2d (0.562 MB) |X 
(direct-host-access) |X (direct-host-access) 51 | |48-BatchNorm2d (0.002 MB) |X (direct-host-access) |X (direct-host-access) 52 | |49-ReLU (0.000 MB) |X |X 53 | |50-Conv2d (0.250 MB) |O |O 54 | |51-BatchNorm2d (0.010 MB) |X (direct-host-access) |X (direct-host-access) 55 | |52-ReLU (0.000 MB) |X |X 56 | |53-Conv2d (0.250 MB) |X (direct-host-access) |O 57 | |54-BatchNorm2d (0.002 MB) |X (direct-host-access) |X (direct-host-access) 58 | |55-ReLU (0.000 MB) |X |X 59 | |56-Conv2d (0.562 MB) |X (direct-host-access) |X (direct-host-access) 60 | |57-BatchNorm2d (0.002 MB) |X (direct-host-access) |X (direct-host-access) 61 | |58-ReLU (0.000 MB) |X |X 62 | |59-Conv2d (0.250 MB) |O |O 63 | |60-BatchNorm2d (0.010 MB) |X (direct-host-access) |X (direct-host-access) 64 | |61-ReLU (0.000 MB) |X |X 65 | |62-Conv2d (0.250 MB) |X (direct-host-access) |O 66 | |63-BatchNorm2d (0.002 MB) |X (direct-host-access) |X (direct-host-access) 67 | |64-ReLU (0.000 MB) |X |X 68 | |65-Conv2d (0.562 MB) |X (direct-host-access) |X (direct-host-access) 69 | |66-BatchNorm2d (0.002 MB) |X (direct-host-access) |X (direct-host-access) 70 | |67-ReLU (0.000 MB) |X |X 71 | |68-Conv2d (0.250 MB) |O |O 72 | |69-BatchNorm2d (0.010 MB) |X (direct-host-access) |X (direct-host-access) 73 | |70-ReLU (0.000 MB) |X |X 74 | |71-Conv2d (0.500 MB) |O |O 75 | |72-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 76 | |73-ReLU (0.000 MB) |X |X 77 | |74-Conv2d (2.250 MB) |O |O 78 | |75-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 79 | |76-ReLU (0.000 MB) |X |X 80 | |77-Conv2d (1.000 MB) |X (direct-host-access) |O 81 | |78-BatchNorm2d (0.020 MB) |X (direct-host-access) |O 82 | |79-Conv2d (2.000 MB) |O |O 83 | |80-BatchNorm2d (0.020 MB) |X (direct-host-access) |O 84 | |81-ReLU (0.000 MB) |X |X 85 | |82-Conv2d (1.000 MB) |X (direct-host-access) |O 86 | |83-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 87 | |84-ReLU (0.000 MB) |X |X 88 | |85-Conv2d (2.250 MB) |O |O 89 | |86-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 90 | |87-ReLU (0.000 MB) |X |X 91 | |88-Conv2d (1.000 MB) |X (direct-host-access) |O 92 | |89-BatchNorm2d (0.020 MB) |X (direct-host-access) |X (direct-host-access) 93 | |90-ReLU (0.000 MB) |X |X 94 | |91-Conv2d (1.000 MB) |X (direct-host-access) |O 95 | |92-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 96 | |93-ReLU (0.000 MB) |X |X 97 | |94-Conv2d (2.250 MB) |O |O 98 | |95-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 99 | |96-ReLU (0.000 MB) |X |X 100 | |97-Conv2d (1.000 MB) |X (direct-host-access) |O 101 | |98-BatchNorm2d (0.020 MB) |X (direct-host-access) |O 102 | |99-ReLU (0.000 MB) |X |X 103 | |100-Conv2d (1.000 MB) |X (direct-host-access) |O 104 | |101-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 105 | |102-ReLU (0.000 MB) |X |X 106 | |103-Conv2d (2.250 MB) |O |O 107 | |104-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 108 | |105-ReLU (0.000 MB) |X |X 109 | |106-Conv2d (1.000 MB) |X (direct-host-access) |O 110 | |107-BatchNorm2d (0.020 MB) |X (direct-host-access) |O 111 | |108-ReLU (0.000 MB) |X |X 112 | |109-Conv2d (1.000 MB) |X (direct-host-access) |O 113 | |110-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 114 | |111-ReLU (0.000 MB) |X |X 115 | |112-Conv2d (2.250 MB) |O |O 116 | |113-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 117 | |114-ReLU (0.000 MB) |X |X 118 | |115-Conv2d 
(1.000 MB) |X (direct-host-access) |O 119 | |116-BatchNorm2d (0.020 MB) |X (direct-host-access) |O 120 | |117-ReLU (0.000 MB) |X |X 121 | |118-Conv2d (1.000 MB) |X (direct-host-access) |O 122 | |119-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 123 | |120-ReLU (0.000 MB) |X |X 124 | |121-Conv2d (2.250 MB) |O |O 125 | |122-BatchNorm2d (0.005 MB) |X (direct-host-access) |X (direct-host-access) 126 | |123-ReLU (0.000 MB) |X |X 127 | |124-Conv2d (1.000 MB) |X (direct-host-access) |O 128 | |125-BatchNorm2d (0.020 MB) |X (direct-host-access) |O 129 | |126-ReLU (0.000 MB) |X |X 130 | |127-Conv2d (2.000 MB) |X (direct-host-access) |O 131 | |128-BatchNorm2d (0.010 MB) |X (direct-host-access) |X (direct-host-access) 132 | |129-ReLU (0.000 MB) |X |X 133 | |130-Conv2d (9.000 MB) |O |O 134 | |131-BatchNorm2d (0.010 MB) |X (direct-host-access) |O 135 | |132-ReLU (0.000 MB) |X |X 136 | |133-Conv2d (4.000 MB) |X (direct-host-access) |O 137 | |134-BatchNorm2d (0.039 MB) |X (direct-host-access) |O 138 | |135-Conv2d (8.000 MB) |X (direct-host-access) |O 139 | |136-BatchNorm2d (0.039 MB) |X (direct-host-access) |O 140 | |137-ReLU (0.000 MB) |X |X 141 | |138-Conv2d (4.000 MB) |X (direct-host-access) |O 142 | |139-BatchNorm2d (0.010 MB) |X (direct-host-access) |X (direct-host-access) 143 | |140-ReLU (0.000 MB) |X |X 144 | |141-Conv2d (9.000 MB) |O |O 145 | |142-BatchNorm2d (0.010 MB) |X (direct-host-access) |O 146 | |143-ReLU (0.000 MB) |X |X 147 | |144-Conv2d (4.000 MB) |X (direct-host-access) |O 148 | |145-BatchNorm2d (0.039 MB) |X (direct-host-access) |O 149 | |146-ReLU (0.000 MB) |X |X 150 | |147-Conv2d (4.000 MB) |X (direct-host-access) |O 151 | |148-BatchNorm2d (0.010 MB) |X (direct-host-access) |X (direct-host-access) 152 | |149-ReLU (0.000 MB) |X |X 153 | |150-Conv2d (9.000 MB) |O |O 154 | |151-BatchNorm2d (0.010 MB) |X (direct-host-access) |O 155 | |152-ReLU (0.000 MB) |X |X 156 | |153-Conv2d (4.000 MB) |X (direct-host-access) |O 157 | |154-BatchNorm2d (0.039 MB) |X (direct-host-access) |O 158 | |155-ReLU (0.000 MB) |X |X 159 | |156-AdaptiveAvgPool2d (0.000 MB) |X |X 160 | |157-Linear (7.816 MB) |X (direct-host-access) |O 161 | -------------------------------------------------------------------------------- /plans/V100/roberta_base/README.md: -------------------------------------------------------------------------------- 1 | |Layer |Initial approach |DeepPlan (DHA) 2 | |-------------------------|-------------------------|------------------------- 3 | |0-Embedding (147.261 MB) |X (direct-host-access) |X (direct-host-access) 4 | |1-Embedding (1.506 MB) |X (direct-host-access) |O 5 | |2-Embedding (0.003 MB) |X (direct-host-access) |O 6 | |3-LayerNorm (0.006 MB) |O |O 7 | |4-Linear (2.253 MB) |O |O 8 | |5-Linear (2.253 MB) |O |O 9 | |6-Linear (2.253 MB) |O |O 10 | |7-Linear (2.253 MB) |O |O 11 | |8-LayerNorm (0.006 MB) |O |O 12 | |9-Linear (9.012 MB) |O |O 13 | |10-Linear (9.003 MB) |O |O 14 | |11-LayerNorm (0.006 MB) |O |O 15 | |12-Linear (2.253 MB) |O |O 16 | |13-Linear (2.253 MB) |O |O 17 | |14-Linear (2.253 MB) |O |O 18 | |15-Linear (2.253 MB) |O |O 19 | |16-LayerNorm (0.006 MB) |O |O 20 | |17-Linear (9.012 MB) |O |O 21 | |18-Linear (9.003 MB) |O |O 22 | |19-LayerNorm (0.006 MB) |O |O 23 | |20-Linear (2.253 MB) |O |O 24 | |21-Linear (2.253 MB) |O |O 25 | |22-Linear (2.253 MB) |O |O 26 | |23-Linear (2.253 MB) |O |O 27 | |24-LayerNorm (0.006 MB) |O |O 28 | |25-Linear (9.012 MB) |O |O 29 | |26-Linear (9.003 MB) |O |O 30 | |27-LayerNorm (0.006 MB) |O |O 31 | |28-Linear (2.253 MB) 
|O |O 32 | |29-Linear (2.253 MB) |O |O 33 | |30-Linear (2.253 MB) |O |O 34 | |31-Linear (2.253 MB) |O |O 35 | |32-LayerNorm (0.006 MB) |O |O 36 | |33-Linear (9.012 MB) |O |O 37 | |34-Linear (9.003 MB) |O |O 38 | |35-LayerNorm (0.006 MB) |O |O 39 | |36-Linear (2.253 MB) |O |O 40 | |37-Linear (2.253 MB) |O |O 41 | |38-Linear (2.253 MB) |O |O 42 | |39-Linear (2.253 MB) |O |O 43 | |40-LayerNorm (0.006 MB) |O |O 44 | |41-Linear (9.012 MB) |O |O 45 | |42-Linear (9.003 MB) |O |O 46 | |43-LayerNorm (0.006 MB) |O |O 47 | |44-Linear (2.253 MB) |O |O 48 | |45-Linear (2.253 MB) |O |O 49 | |46-Linear (2.253 MB) |O |O 50 | |47-Linear (2.253 MB) |O |O 51 | |48-LayerNorm (0.006 MB) |O |O 52 | |49-Linear (9.012 MB) |O |O 53 | |50-Linear (9.003 MB) |O |O 54 | |51-LayerNorm (0.006 MB) |O |O 55 | |52-Linear (2.253 MB) |O |O 56 | |53-Linear (2.253 MB) |O |O 57 | |54-Linear (2.253 MB) |O |O 58 | |55-Linear (2.253 MB) |O |O 59 | |56-LayerNorm (0.006 MB) |O |O 60 | |57-Linear (9.012 MB) |O |O 61 | |58-Linear (9.003 MB) |O |O 62 | |59-LayerNorm (0.006 MB) |O |O 63 | |60-Linear (2.253 MB) |O |O 64 | |61-Linear (2.253 MB) |O |O 65 | |62-Linear (2.253 MB) |O |O 66 | |63-Linear (2.253 MB) |O |O 67 | |64-LayerNorm (0.006 MB) |O |O 68 | |65-Linear (9.012 MB) |O |O 69 | |66-Linear (9.003 MB) |O |O 70 | |67-LayerNorm (0.006 MB) |O |O 71 | |68-Linear (2.253 MB) |O |O 72 | |69-Linear (2.253 MB) |O |O 73 | |70-Linear (2.253 MB) |O |O 74 | |71-Linear (2.253 MB) |O |O 75 | |72-LayerNorm (0.006 MB) |O |O 76 | |73-Linear (9.012 MB) |O |O 77 | |74-Linear (9.003 MB) |O |O 78 | |75-LayerNorm (0.006 MB) |O |O 79 | |76-Linear (2.253 MB) |O |O 80 | |77-Linear (2.253 MB) |O |O 81 | |78-Linear (2.253 MB) |O |O 82 | |79-Linear (2.253 MB) |O |O 83 | |80-LayerNorm (0.006 MB) |O |O 84 | |81-Linear (9.012 MB) |O |O 85 | |82-Linear (9.003 MB) |O |O 86 | |83-LayerNorm (0.006 MB) |O |O 87 | |84-Linear (2.253 MB) |O |O 88 | |85-Linear (2.253 MB) |O |O 89 | |86-Linear (2.253 MB) |O |O 90 | |87-Linear (2.253 MB) |O |O 91 | |88-LayerNorm (0.006 MB) |O |O 92 | |89-Linear (9.012 MB) |O |O 93 | |90-Linear (9.003 MB) |O |O 94 | |91-LayerNorm (0.006 MB) |O |O 95 | |92-Linear (2.253 MB) |O |O 96 | |93-Linear (2.253 MB) |O |O 97 | |94-Linear (2.253 MB) |O |O 98 | |95-Linear (2.253 MB) |O |O 99 | |96-LayerNorm (0.006 MB) |O |O 100 | |97-Linear (9.012 MB) |O |O 101 | |98-Linear (9.003 MB) |O |O 102 | |99-LayerNorm (0.006 MB) |O |O 103 | |100-Linear (2.253 MB) |X (direct-host-access) |O 104 | |101-Tanh (0.000 MB) |X |X 105 | -------------------------------------------------------------------------------- /plans/V100/roberta_large/README.md: -------------------------------------------------------------------------------- 1 | |Layer |Initial approach |DeepPlan (DHA) 2 | |-------------------------|-------------------------|------------------------- 3 | |0-Embedding (196.348 MB) |X (direct-host-access) |X (direct-host-access) 4 | |1-Embedding (2.008 MB) |X (direct-host-access) |O 5 | |2-Embedding (0.004 MB) |X (direct-host-access) |O 6 | |3-LayerNorm (0.008 MB) |O |O 7 | |4-Linear (4.004 MB) |O |O 8 | |5-Linear (4.004 MB) |O |O 9 | |6-Linear (4.004 MB) |O |O 10 | |7-Linear (4.004 MB) |O |O 11 | |8-LayerNorm (0.008 MB) |O |O 12 | |9-Linear (16.016 MB) |O |O 13 | |10-Linear (16.004 MB) |O |O 14 | |11-LayerNorm (0.008 MB) |O |O 15 | |12-Linear (4.004 MB) |O |O 16 | |13-Linear (4.004 MB) |O |O 17 | |14-Linear (4.004 MB) |O |O 18 | |15-Linear (4.004 MB) |O |O 19 | |16-LayerNorm (0.008 MB) |O |O 20 | |17-Linear (16.016 MB) |O |O 21 | |18-Linear (16.004 MB) |O |O 22 | 
|19-LayerNorm (0.008 MB) |O |O 23 | |20-Linear (4.004 MB) |O |O 24 | |21-Linear (4.004 MB) |O |O 25 | |22-Linear (4.004 MB) |O |O 26 | |23-Linear (4.004 MB) |O |O 27 | |24-LayerNorm (0.008 MB) |O |O 28 | |25-Linear (16.016 MB) |O |O 29 | |26-Linear (16.004 MB) |O |O 30 | |27-LayerNorm (0.008 MB) |O |O 31 | |28-Linear (4.004 MB) |O |O 32 | |29-Linear (4.004 MB) |O |O 33 | |30-Linear (4.004 MB) |O |O 34 | |31-Linear (4.004 MB) |O |O 35 | |32-LayerNorm (0.008 MB) |O |O 36 | |33-Linear (16.016 MB) |O |O 37 | |34-Linear (16.004 MB) |O |O 38 | |35-LayerNorm (0.008 MB) |O |O 39 | |36-Linear (4.004 MB) |O |O 40 | |37-Linear (4.004 MB) |O |O 41 | |38-Linear (4.004 MB) |O |O 42 | |39-Linear (4.004 MB) |O |O 43 | |40-LayerNorm (0.008 MB) |O |O 44 | |41-Linear (16.016 MB) |O |O 45 | |42-Linear (16.004 MB) |O |O 46 | |43-LayerNorm (0.008 MB) |O |O 47 | |44-Linear (4.004 MB) |O |O 48 | |45-Linear (4.004 MB) |O |O 49 | |46-Linear (4.004 MB) |O |O 50 | |47-Linear (4.004 MB) |O |O 51 | |48-LayerNorm (0.008 MB) |O |O 52 | |49-Linear (16.016 MB) |O |O 53 | |50-Linear (16.004 MB) |O |O 54 | |51-LayerNorm (0.008 MB) |O |O 55 | |52-Linear (4.004 MB) |O |O 56 | |53-Linear (4.004 MB) |O |O 57 | |54-Linear (4.004 MB) |O |O 58 | |55-Linear (4.004 MB) |O |O 59 | |56-LayerNorm (0.008 MB) |O |O 60 | |57-Linear (16.016 MB) |O |O 61 | |58-Linear (16.004 MB) |O |O 62 | |59-LayerNorm (0.008 MB) |O |O 63 | |60-Linear (4.004 MB) |O |O 64 | |61-Linear (4.004 MB) |O |O 65 | |62-Linear (4.004 MB) |O |O 66 | |63-Linear (4.004 MB) |O |O 67 | |64-LayerNorm (0.008 MB) |O |O 68 | |65-Linear (16.016 MB) |O |O 69 | |66-Linear (16.004 MB) |O |O 70 | |67-LayerNorm (0.008 MB) |O |O 71 | |68-Linear (4.004 MB) |O |O 72 | |69-Linear (4.004 MB) |O |O 73 | |70-Linear (4.004 MB) |O |O 74 | |71-Linear (4.004 MB) |O |O 75 | |72-LayerNorm (0.008 MB) |O |O 76 | |73-Linear (16.016 MB) |O |O 77 | |74-Linear (16.004 MB) |O |O 78 | |75-LayerNorm (0.008 MB) |O |O 79 | |76-Linear (4.004 MB) |O |O 80 | |77-Linear (4.004 MB) |O |O 81 | |78-Linear (4.004 MB) |O |O 82 | |79-Linear (4.004 MB) |O |O 83 | |80-LayerNorm (0.008 MB) |O |O 84 | |81-Linear (16.016 MB) |O |O 85 | |82-Linear (16.004 MB) |O |O 86 | |83-LayerNorm (0.008 MB) |O |O 87 | |84-Linear (4.004 MB) |O |O 88 | |85-Linear (4.004 MB) |O |O 89 | |86-Linear (4.004 MB) |O |O 90 | |87-Linear (4.004 MB) |O |O 91 | |88-LayerNorm (0.008 MB) |O |O 92 | |89-Linear (16.016 MB) |O |O 93 | |90-Linear (16.004 MB) |O |O 94 | |91-LayerNorm (0.008 MB) |O |O 95 | |92-Linear (4.004 MB) |O |O 96 | |93-Linear (4.004 MB) |O |O 97 | |94-Linear (4.004 MB) |O |O 98 | |95-Linear (4.004 MB) |O |O 99 | |96-LayerNorm (0.008 MB) |O |O 100 | |97-Linear (16.016 MB) |O |O 101 | |98-Linear (16.004 MB) |O |O 102 | |99-LayerNorm (0.008 MB) |O |O 103 | |100-Linear (4.004 MB) |O |O 104 | |101-Linear (4.004 MB) |O |O 105 | |102-Linear (4.004 MB) |O |O 106 | |103-Linear (4.004 MB) |O |O 107 | |104-LayerNorm (0.008 MB) |O |O 108 | |105-Linear (16.016 MB) |O |O 109 | |106-Linear (16.004 MB) |O |O 110 | |107-LayerNorm (0.008 MB) |O |O 111 | |108-Linear (4.004 MB) |O |O 112 | |109-Linear (4.004 MB) |O |O 113 | |110-Linear (4.004 MB) |O |O 114 | |111-Linear (4.004 MB) |O |O 115 | |112-LayerNorm (0.008 MB) |O |O 116 | |113-Linear (16.016 MB) |O |O 117 | |114-Linear (16.004 MB) |O |O 118 | |115-LayerNorm (0.008 MB) |O |O 119 | |116-Linear (4.004 MB) |O |O 120 | |117-Linear (4.004 MB) |O |O 121 | |118-Linear (4.004 MB) |O |O 122 | |119-Linear (4.004 MB) |O |O 123 | |120-LayerNorm (0.008 MB) |O |O 124 | |121-Linear (16.016 MB) |O |O 125 | 
|122-Linear (16.004 MB) |O |O 126 | |123-LayerNorm (0.008 MB) |O |O 127 | |124-Linear (4.004 MB) |O |O 128 | |125-Linear (4.004 MB) |O |O 129 | |126-Linear (4.004 MB) |O |O 130 | |127-Linear (4.004 MB) |O |O 131 | |128-LayerNorm (0.008 MB) |O |O 132 | |129-Linear (16.016 MB) |O |O 133 | |130-Linear (16.004 MB) |O |O 134 | |131-LayerNorm (0.008 MB) |O |O 135 | |132-Linear (4.004 MB) |O |O 136 | |133-Linear (4.004 MB) |O |O 137 | |134-Linear (4.004 MB) |O |O 138 | |135-Linear (4.004 MB) |O |O 139 | |136-LayerNorm (0.008 MB) |O |O 140 | |137-Linear (16.016 MB) |O |O 141 | |138-Linear (16.004 MB) |O |O 142 | |139-LayerNorm (0.008 MB) |O |O 143 | |140-Linear (4.004 MB) |O |O 144 | |141-Linear (4.004 MB) |O |O 145 | |142-Linear (4.004 MB) |O |O 146 | |143-Linear (4.004 MB) |O |O 147 | |144-LayerNorm (0.008 MB) |O |O 148 | |145-Linear (16.016 MB) |O |O 149 | |146-Linear (16.004 MB) |O |O 150 | |147-LayerNorm (0.008 MB) |O |O 151 | |148-Linear (4.004 MB) |O |O 152 | |149-Linear (4.004 MB) |O |O 153 | |150-Linear (4.004 MB) |O |O 154 | |151-Linear (4.004 MB) |O |O 155 | |152-LayerNorm (0.008 MB) |O |O 156 | |153-Linear (16.016 MB) |O |O 157 | |154-Linear (16.004 MB) |O |O 158 | |155-LayerNorm (0.008 MB) |O |O 159 | |156-Linear (4.004 MB) |O |O 160 | |157-Linear (4.004 MB) |O |O 161 | |158-Linear (4.004 MB) |O |O 162 | |159-Linear (4.004 MB) |O |O 163 | |160-LayerNorm (0.008 MB) |O |O 164 | |161-Linear (16.016 MB) |O |O 165 | |162-Linear (16.004 MB) |O |O 166 | |163-LayerNorm (0.008 MB) |O |O 167 | |164-Linear (4.004 MB) |O |O 168 | |165-Linear (4.004 MB) |O |O 169 | |166-Linear (4.004 MB) |O |O 170 | |167-Linear (4.004 MB) |O |O 171 | |168-LayerNorm (0.008 MB) |O |O 172 | |169-Linear (16.016 MB) |O |O 173 | |170-Linear (16.004 MB) |O |O 174 | |171-LayerNorm (0.008 MB) |O |O 175 | |172-Linear (4.004 MB) |O |O 176 | |173-Linear (4.004 MB) |O |O 177 | |174-Linear (4.004 MB) |O |O 178 | |175-Linear (4.004 MB) |O |O 179 | |176-LayerNorm (0.008 MB) |O |O 180 | |177-Linear (16.016 MB) |O |O 181 | |178-Linear (16.004 MB) |O |O 182 | |179-LayerNorm (0.008 MB) |O |O 183 | |180-Linear (4.004 MB) |O |O 184 | |181-Linear (4.004 MB) |O |O 185 | |182-Linear (4.004 MB) |O |O 186 | |183-Linear (4.004 MB) |O |O 187 | |184-LayerNorm (0.008 MB) |O |O 188 | |185-Linear (16.016 MB) |O |O 189 | |186-Linear (16.004 MB) |O |O 190 | |187-LayerNorm (0.008 MB) |O |O 191 | |188-Linear (4.004 MB) |O |O 192 | |189-Linear (4.004 MB) |O |O 193 | |190-Linear (4.004 MB) |O |O 194 | |191-Linear (4.004 MB) |O |O 195 | |192-LayerNorm (0.008 MB) |O |O 196 | |193-Linear (16.016 MB) |O |O 197 | |194-Linear (16.004 MB) |O |O 198 | |195-LayerNorm (0.008 MB) |O |O 199 | |196-Linear (4.004 MB) |O |O 200 | |197-Tanh (0.000 MB) |X |X 201 | -------------------------------------------------------------------------------- /proto/deepcache.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | enum MsgType { 4 | REQ_INFERENCE = 1; 5 | REQ_UPLOAD_MODEL = 2; 6 | REQ_LS = 3; 7 | REQ_CLOSE = 4; 8 | 9 | RSP_INFERENCE = 101; 10 | RSP_UPLOAD_MODEL = 102; 11 | RSP_LS = 103; 12 | RSP_CLOSE = 104; 13 | } 14 | 15 | message InferenceReqProto { 16 | required uint32 req_id = 1; 17 | required uint32 model_id = 2; 18 | required uint32 batch_size = 3; 19 | } 20 | 21 | message InferenceRspProto { 22 | required uint32 req_id = 1; 23 | required bool is_cold = 2; 24 | } 25 | 26 | message UploadModelReqProto { 27 | required uint32 req_id = 1; 28 | repeated string model_names = 2; 29 | required uint32 
n_models = 3; 30 | required uint32 engine_type = 4; 31 | required uint32 mp_size = 5; 32 | } 33 | 34 | message UploadModelRspProto { 35 | required uint32 req_id = 1; 36 | } 37 | 38 | message CloseReqProto { 39 | required uint32 req_id = 1; 40 | } 41 | 42 | message CloseRspProto { 43 | required uint32 req_id = 1; 44 | } 45 | -------------------------------------------------------------------------------- /proto/deepplan.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | enum DataType { 4 | TYPE_FP32 = 1; 5 | TYPE_INT32 = 2; 6 | TYPE_INT64 = 3; 7 | } 8 | 9 | message ModelInput { 10 | required DataType data_type = 1; 11 | repeated uint32 shape = 2; 12 | optional uint32 max_number = 3; 13 | } 14 | 15 | message Plan { 16 | enum PlanType { 17 | STATIC = 1; 18 | DYNAMIC = 2; 19 | BENCH_DYNAMIC = 3; 20 | } 21 | 22 | required PlanType plan_type = 1; 23 | repeated uint32 load_layers = 2; 24 | } 25 | 26 | message ModelConfig { 27 | required string model_name = 1; 28 | repeated ModelInput inputs = 2; 29 | repeated Plan plans = 3; 30 | } 31 | -------------------------------------------------------------------------------- /proto/deepplan_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT! 3 | # source: deepplan.proto 4 | 5 | from google.protobuf.internal import enum_type_wrapper 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | # @@protoc_insertion_point(imports) 11 | 12 | _sym_db = _symbol_database.Default() 13 | 14 | 15 | 16 | 17 | DESCRIPTOR = _descriptor.FileDescriptor( 18 | name='deepplan.proto', 19 | package='', 20 | syntax='proto2', 21 | serialized_options=None, 22 | serialized_pb=b'\n\x0e\x64\x65\x65pplan.proto\"M\n\nModelInput\x12\x1c\n\tdata_type\x18\x01 \x02(\x0e\x32\t.DataType\x12\r\n\x05shape\x18\x02 \x03(\r\x12\x12\n\nmax_number\x18\x03 \x01(\r\"v\n\x04Plan\x12!\n\tplan_type\x18\x01 \x02(\x0e\x32\x0e.Plan.PlanType\x12\x13\n\x0bload_layers\x18\x02 \x03(\r\"6\n\x08PlanType\x12\n\n\x06STATIC\x10\x01\x12\x0b\n\x07\x44YNAMIC\x10\x02\x12\x11\n\rBENCH_DYNAMIC\x10\x03\"T\n\x0bModelConfig\x12\x12\n\nmodel_name\x18\x01 \x02(\t\x12\x1b\n\x06inputs\x18\x02 \x03(\x0b\x32\x0b.ModelInput\x12\x14\n\x05plans\x18\x03 \x03(\x0b\x32\x05.Plan*9\n\x08\x44\x61taType\x12\r\n\tTYPE_FP32\x10\x01\x12\x0e\n\nTYPE_INT32\x10\x02\x12\x0e\n\nTYPE_INT64\x10\x03' 23 | ) 24 | 25 | _DATATYPE = _descriptor.EnumDescriptor( 26 | name='DataType', 27 | full_name='DataType', 28 | filename=None, 29 | file=DESCRIPTOR, 30 | values=[ 31 | _descriptor.EnumValueDescriptor( 32 | name='TYPE_FP32', index=0, number=1, 33 | serialized_options=None, 34 | type=None), 35 | _descriptor.EnumValueDescriptor( 36 | name='TYPE_INT32', index=1, number=2, 37 | serialized_options=None, 38 | type=None), 39 | _descriptor.EnumValueDescriptor( 40 | name='TYPE_INT64', index=2, number=3, 41 | serialized_options=None, 42 | type=None), 43 | ], 44 | containing_type=None, 45 | serialized_options=None, 46 | serialized_start=303, 47 | serialized_end=360, 48 | ) 49 | _sym_db.RegisterEnumDescriptor(_DATATYPE) 50 | 51 | DataType = enum_type_wrapper.EnumTypeWrapper(_DATATYPE) 52 | TYPE_FP32 = 1 53 | TYPE_INT32 = 2 54 | TYPE_INT64 = 3 55 | 56 | 57 | _PLAN_PLANTYPE = 
_descriptor.EnumDescriptor( 58 | name='PlanType', 59 | full_name='Plan.PlanType', 60 | filename=None, 61 | file=DESCRIPTOR, 62 | values=[ 63 | _descriptor.EnumValueDescriptor( 64 | name='STATIC', index=0, number=1, 65 | serialized_options=None, 66 | type=None), 67 | _descriptor.EnumValueDescriptor( 68 | name='DYNAMIC', index=1, number=2, 69 | serialized_options=None, 70 | type=None), 71 | _descriptor.EnumValueDescriptor( 72 | name='BENCH_DYNAMIC', index=2, number=3, 73 | serialized_options=None, 74 | type=None), 75 | ], 76 | containing_type=None, 77 | serialized_options=None, 78 | serialized_start=161, 79 | serialized_end=215, 80 | ) 81 | _sym_db.RegisterEnumDescriptor(_PLAN_PLANTYPE) 82 | 83 | 84 | _MODELINPUT = _descriptor.Descriptor( 85 | name='ModelInput', 86 | full_name='ModelInput', 87 | filename=None, 88 | file=DESCRIPTOR, 89 | containing_type=None, 90 | fields=[ 91 | _descriptor.FieldDescriptor( 92 | name='data_type', full_name='ModelInput.data_type', index=0, 93 | number=1, type=14, cpp_type=8, label=2, 94 | has_default_value=False, default_value=1, 95 | message_type=None, enum_type=None, containing_type=None, 96 | is_extension=False, extension_scope=None, 97 | serialized_options=None, file=DESCRIPTOR), 98 | _descriptor.FieldDescriptor( 99 | name='shape', full_name='ModelInput.shape', index=1, 100 | number=2, type=13, cpp_type=3, label=3, 101 | has_default_value=False, default_value=[], 102 | message_type=None, enum_type=None, containing_type=None, 103 | is_extension=False, extension_scope=None, 104 | serialized_options=None, file=DESCRIPTOR), 105 | _descriptor.FieldDescriptor( 106 | name='max_number', full_name='ModelInput.max_number', index=2, 107 | number=3, type=13, cpp_type=3, label=1, 108 | has_default_value=False, default_value=0, 109 | message_type=None, enum_type=None, containing_type=None, 110 | is_extension=False, extension_scope=None, 111 | serialized_options=None, file=DESCRIPTOR), 112 | ], 113 | extensions=[ 114 | ], 115 | nested_types=[], 116 | enum_types=[ 117 | ], 118 | serialized_options=None, 119 | is_extendable=False, 120 | syntax='proto2', 121 | extension_ranges=[], 122 | oneofs=[ 123 | ], 124 | serialized_start=18, 125 | serialized_end=95, 126 | ) 127 | 128 | 129 | _PLAN = _descriptor.Descriptor( 130 | name='Plan', 131 | full_name='Plan', 132 | filename=None, 133 | file=DESCRIPTOR, 134 | containing_type=None, 135 | fields=[ 136 | _descriptor.FieldDescriptor( 137 | name='plan_type', full_name='Plan.plan_type', index=0, 138 | number=1, type=14, cpp_type=8, label=2, 139 | has_default_value=False, default_value=1, 140 | message_type=None, enum_type=None, containing_type=None, 141 | is_extension=False, extension_scope=None, 142 | serialized_options=None, file=DESCRIPTOR), 143 | _descriptor.FieldDescriptor( 144 | name='load_layers', full_name='Plan.load_layers', index=1, 145 | number=2, type=13, cpp_type=3, label=3, 146 | has_default_value=False, default_value=[], 147 | message_type=None, enum_type=None, containing_type=None, 148 | is_extension=False, extension_scope=None, 149 | serialized_options=None, file=DESCRIPTOR), 150 | ], 151 | extensions=[ 152 | ], 153 | nested_types=[], 154 | enum_types=[ 155 | _PLAN_PLANTYPE, 156 | ], 157 | serialized_options=None, 158 | is_extendable=False, 159 | syntax='proto2', 160 | extension_ranges=[], 161 | oneofs=[ 162 | ], 163 | serialized_start=97, 164 | serialized_end=215, 165 | ) 166 | 167 | 168 | _MODELCONFIG = _descriptor.Descriptor( 169 | name='ModelConfig', 170 | full_name='ModelConfig', 171 | filename=None, 172 | 
file=DESCRIPTOR, 173 | containing_type=None, 174 | fields=[ 175 | _descriptor.FieldDescriptor( 176 | name='model_name', full_name='ModelConfig.model_name', index=0, 177 | number=1, type=9, cpp_type=9, label=2, 178 | has_default_value=False, default_value=b"".decode('utf-8'), 179 | message_type=None, enum_type=None, containing_type=None, 180 | is_extension=False, extension_scope=None, 181 | serialized_options=None, file=DESCRIPTOR), 182 | _descriptor.FieldDescriptor( 183 | name='inputs', full_name='ModelConfig.inputs', index=1, 184 | number=2, type=11, cpp_type=10, label=3, 185 | has_default_value=False, default_value=[], 186 | message_type=None, enum_type=None, containing_type=None, 187 | is_extension=False, extension_scope=None, 188 | serialized_options=None, file=DESCRIPTOR), 189 | _descriptor.FieldDescriptor( 190 | name='plans', full_name='ModelConfig.plans', index=2, 191 | number=3, type=11, cpp_type=10, label=3, 192 | has_default_value=False, default_value=[], 193 | message_type=None, enum_type=None, containing_type=None, 194 | is_extension=False, extension_scope=None, 195 | serialized_options=None, file=DESCRIPTOR), 196 | ], 197 | extensions=[ 198 | ], 199 | nested_types=[], 200 | enum_types=[ 201 | ], 202 | serialized_options=None, 203 | is_extendable=False, 204 | syntax='proto2', 205 | extension_ranges=[], 206 | oneofs=[ 207 | ], 208 | serialized_start=217, 209 | serialized_end=301, 210 | ) 211 | 212 | _MODELINPUT.fields_by_name['data_type'].enum_type = _DATATYPE 213 | _PLAN.fields_by_name['plan_type'].enum_type = _PLAN_PLANTYPE 214 | _PLAN_PLANTYPE.containing_type = _PLAN 215 | _MODELCONFIG.fields_by_name['inputs'].message_type = _MODELINPUT 216 | _MODELCONFIG.fields_by_name['plans'].message_type = _PLAN 217 | DESCRIPTOR.message_types_by_name['ModelInput'] = _MODELINPUT 218 | DESCRIPTOR.message_types_by_name['Plan'] = _PLAN 219 | DESCRIPTOR.message_types_by_name['ModelConfig'] = _MODELCONFIG 220 | DESCRIPTOR.enum_types_by_name['DataType'] = _DATATYPE 221 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 222 | 223 | ModelInput = _reflection.GeneratedProtocolMessageType('ModelInput', (_message.Message,), { 224 | 'DESCRIPTOR' : _MODELINPUT, 225 | '__module__' : 'deepplan_pb2' 226 | # @@protoc_insertion_point(class_scope:ModelInput) 227 | }) 228 | _sym_db.RegisterMessage(ModelInput) 229 | 230 | Plan = _reflection.GeneratedProtocolMessageType('Plan', (_message.Message,), { 231 | 'DESCRIPTOR' : _PLAN, 232 | '__module__' : 'deepplan_pb2' 233 | # @@protoc_insertion_point(class_scope:Plan) 234 | }) 235 | _sym_db.RegisterMessage(Plan) 236 | 237 | ModelConfig = _reflection.GeneratedProtocolMessageType('ModelConfig', (_message.Message,), { 238 | 'DESCRIPTOR' : _MODELCONFIG, 239 | '__module__' : 'deepplan_pb2' 240 | # @@protoc_insertion_point(class_scope:ModelConfig) 241 | }) 242 | _sym_db.RegisterMessage(ModelConfig) 243 | 244 | 245 | # @@protoc_insertion_point(module_scope) 246 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.36.0 2 | protobuf==3.20.1 3 | matplotlib==3.3.4 4 | -------------------------------------------------------------------------------- /scripts/create_all_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PLAN_REPO=${PLAN_REPO} 4 | 5 | if [[ -z "$PLAN_REPO" ]]; then 6 | echo "PLAN_REPO environment variable not set, please set this variable" 7 | exit 1 8 | fi 9 |
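# For reference, the loop at the bottom of this script expands to one plan.py
# invocation per model; a single invocation looks like this (assuming
# PLAN_REPO is set and the model weights are available to plan.py):
#
#   python3 plan.py -m bert_base -p "$PLAN_REPO" --trace --profile
#
# The same --trace --profile flags are passed for every model in the list.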
10 | script_path=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 11 | exec_path="$script_path/../" 12 | 13 | TARGET="plan.py" 14 | 15 | models=("resnet50" "resnet101" "bert_base" "bert_large" "roberta_base" "roberta_large" "gpt2" "gpt2_384" "gpt2_medium") 16 | 17 | if [ ! -d "$PLAN_REPO" ]; then 18 | mkdir -p "$PLAN_REPO" 19 | echo "Create $PLAN_REPO directory" 20 | fi 21 | 22 | for model in ${models[@]}; do 23 | cmd="python3 $exec_path/$TARGET -m $model -p $PLAN_REPO --trace --profile" 24 | $cmd 25 | done 26 | -------------------------------------------------------------------------------- /scripts/download_azure_trace_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | script_path=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 3 | output_dir="$script_path/azure-functions" 4 | 5 | echo "Downloading Azure trace dataset" 6 | wget https://azurecloudpublicdataset2.blob.core.windows.net/azurepublicdatasetv2/azurefunctions_dataset2019/azurefunctions-dataset2019.tar.xz 7 | 8 | mkdir -p $output_dir 9 | 10 | echo "Extract azurefunctions-dataset2019.tar.xz" 11 | tar -xvf azurefunctions-dataset2019.tar.xz -C $output_dir 12 | 13 | echo "The Azure trace datasets are saved to '$output_dir'" 14 | echo "To run the Azure experiments, run the command below" 15 | echo "export AZURE_TRACE_DIR=\"$output_dir\"" 16 | 17 | -------------------------------------------------------------------------------- /scripts/fig10/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[3]: 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from mpl_toolkits.axes_grid1 import make_axes_locatable 9 | from matplotlib.ticker import MaxNLocator 10 | import sys 11 | import os 12 | import csv 13 | 14 | baseline = np.array([]) 15 | pipeswitch = np.array([]) 16 | deepplan_dha = np.array([]) 17 | deepplan_parallel = np.array([]) 18 | deepplan_all = np.array([]) 19 | 20 | target = sys.argv[1] 21 | target = target.strip() 22 | if target[0] != '/': 23 | target = os.path.join(os.getcwd(), target) 24 | 25 | def read_file(file): 26 | baseline = np.array([]) 27 | pipeswitch = np.array([]) 28 | deepplan_dha = np.array([]) 29 | deepplan_parallel = np.array([]) 30 | deepplan_all = np.array([]) 31 | 32 | with open(file, 'r', encoding='utf-8') as f: 33 | rdr = csv.reader(f) 34 | for i, line in enumerate(rdr): 35 | baseline = np.append(baseline, float(line[0])) 36 | pipeswitch = np.append(pipeswitch, float(line[1])) 37 | deepplan_dha = np.append(deepplan_dha, float(line[2])) 38 | deepplan_parallel = np.append(deepplan_parallel, float(line[3])) 39 | deepplan_all = np.append(deepplan_all, float(line[4])) 40 | 41 | return np.array([baseline, pipeswitch, deepplan_dha, deepplan_parallel, deepplan_all]) 42 | 43 | # Note: the module-level arrays above are still empty at this point; the 44 | # actual speedup normalization (baseline latency / engine latency) is applied 45 | # to the arrays returned by read_file() further below. 46 | 47 | 48 | 49 | label_list = ["Baseline", "PipeSwitch", "DeepPlan (DHA)", "DeepPlan (PT)", "DeepPlan (PT+DHA)"] 50 | 51 | color_list = ['#EAECEE', '#AEB6BF', '#85929E', '#5D6D7E', '#34495E', '#273746'] 52 | model_list = ["ResNet-50", "ResNet-101", "BERT-Base", "BERT-Large", "RoBERTa\nBase", "RoBERTa\nLarge", "GPT-2", "GPT-2 Medium"] 53 | 54 | x_label = "" 55 | y_label = "Inference speedup" 56 | 57 | FONTSIZE_LABEL = 14 58 | FONTSIZE_LEGEND = 14
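# A worked example of the grouped-bar layout computed below: each model group
# spans a stride of 8 x-units and each of the 5 engine bars inside a group is
# offset by 1.3 units, so create_x(8, 1.3, 1, 8) (helper defined below)
# evaluates to [1.3, 9.3, 17.3, ..., 57.3], the x-positions of the first bar
# in all 8 model groups.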
59 | WIDTH = 1.1 60 | SIZE_FIGURE = (12, 3) 61 | 62 | 63 | def create_x(t, w, n, d): 64 | return [t*x + w*n for x in range(d)] 65 | 66 | value_base = create_x(8, 1.3, 1, 8) 67 | value_pipe = create_x(8, 1.3, 2, 8) 68 | value_deep_dha = create_x(8, 1.3, 3, 8) 69 | value_deep_parallel = create_x(8, 1.3, 4, 8) 70 | value_deep_all = create_x(8, 1.3, 5, 8) 71 | 72 | fig, ax = plt.subplots(1, 1, figsize=SIZE_FIGURE) 73 | 74 | avg_ret = read_file(sys.argv[1]) 75 | min_ret = read_file(sys.argv[2]) 76 | max_ret = read_file(sys.argv[3]) 77 | 78 | base = avg_ret[0] 79 | avg_ret = base / avg_ret 80 | min_ret = base / min_ret 81 | max_ret = base / max_ret 82 | 83 | lower_err = abs(avg_ret - min_ret) 84 | upper_err = abs(avg_ret - max_ret) 85 | 86 | ax.bar(value_base, avg_ret[0], color=color_list[0], edgecolor="black", zorder=3, width=WIDTH) 87 | ax.bar(value_pipe, avg_ret[1], color=color_list[1], edgecolor="black", zorder=3, width=WIDTH) 88 | ax.bar(value_deep_dha, avg_ret[2], color=color_list[2], edgecolor="black", zorder=3, width=WIDTH) 89 | ax.bar(value_deep_parallel, avg_ret[3], color=color_list[3], edgecolor="black", zorder=3, width=WIDTH) 90 | ax.bar(value_deep_all, avg_ret[4], color=color_list[4], edgecolor="black", zorder=3, width=WIDTH) 91 | 92 | ax.errorbar(value_base, avg_ret[0], yerr=[lower_err[0], upper_err[0]], fmt='o', capsize=3, color="black", zorder=4, ms=1) 93 | ax.errorbar(value_pipe, avg_ret[1], yerr=[lower_err[1], upper_err[1]], fmt='o', capsize=3, color="black", zorder=4, ms=1) 94 | ax.errorbar(value_deep_dha, avg_ret[2], yerr=[lower_err[2], upper_err[2]], fmt='o', capsize=3, color="black", zorder=4, ms=1) 95 | ax.errorbar(value_deep_parallel, avg_ret[3], yerr=[lower_err[3], upper_err[3]], fmt='o', capsize=3, color="black", zorder=4, ms=1) 96 | ax.errorbar(value_deep_all, avg_ret[4], yerr=[lower_err[4], upper_err[4]], fmt='o', capsize=3, color="black", zorder=4, ms=1) 97 | 98 | fig.legend(labels=label_list, bbox_to_anchor=(0.52, 1.00), ncol=5, loc='center', 99 | fontsize=FONTSIZE_LEGEND, frameon=False) 100 | 101 | plt.xticks([3.9 + i * 8 for i in range(0, 8)], model_list) 102 | plt.tick_params(axis="x", direction="out", labelsize=FONTSIZE_LABEL, rotation=0) 103 | plt.ylabel(y_label, fontsize=FONTSIZE_LABEL, labelpad=8) 104 | plt.yticks(fontsize=FONTSIZE_LABEL) 105 | plt.grid(linestyle='-', axis='y', zorder=-10) 106 | plt.rcParams["font.family"] = "Helvetica" 107 | plt.axhline(y=1.0, color='gray', linestyle='--') 108 | 109 | plt.tight_layout() 110 | #plt.show() 111 | plt.savefig(sys.argv[4], bbox_inches="tight", pad_inches=0.0) 112 | print("Saved graph to {}".format(sys.argv[4])) 113 | -------------------------------------------------------------------------------- /scripts/fig10/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PLAN_REPO=${PLAN_REPO} 4 | 5 | if [[ -z "$PLAN_REPO" ]]; then 6 | echo "PLAN_REPO environment variable not set, please set this variable" 7 | exit 1 8 | fi 9 | 10 | export PLAN_REPO=${PLAN_REPO} 11 | 12 | script_path=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 13 | build_path="$script_path/../../build" 14 | 15 | TARGET="benchmark" 16 | 17 | device_maps=("0" "0 2") 18 | models=("resnet50" "resnet101" "bert_base" "bert_large" "roberta_base" "roberta_large" "gpt2" "gpt2_medium") 19 | engines=("pipeline" "deepplan") 20 | batch_size=1 21 | 22 | tmp_avg_file="/tmp/deepplan_fig10_avg" 23 | tmp_min_file="/tmp/deepplan_fig10_min" 24 | tmp_max_file="/tmp/deepplan_fig10_max" 25 |
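# Note on the parsing below: the benchmark binary prints result lines of the
# form "Average Latency : 12.3 ms" (and likewise "Min"/"Max"), so an awk filter
# such as '{if ($1 == "Average") { print $(NF-1)}}' picks out the numeric field
# just before the trailing "ms".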
26 | printf "" > $tmp_avg_file 27 | printf "" > $tmp_min_file 28 | printf "" > $tmp_max_file 29 | 30 | for model in "${models[@]}"; do 31 | # Baseline 32 | baseline_cmd="$build_path/$TARGET -m $model -e demand -b $batch_size -d 0" 33 | echo "Run $baseline_cmd" 34 | 35 | output=`$baseline_cmd` 36 | echo "$output" 37 | echo "" 38 | 39 | avg_lat=$(echo "$output" | awk '{if ($1 == "Average") { print $(NF-1)}}') 40 | min_lat=$(echo "$output" | awk '{if ($1 == "Min") { print $(NF-1)}}') 41 | max_lat=$(echo "$output" | awk '{if ($1 == "Max") { print $(NF-1)}}') 42 | 43 | printf "$avg_lat, " >> $tmp_avg_file 44 | printf "$min_lat, " >> $tmp_min_file 45 | printf "$max_lat, " >> $tmp_max_file 46 | 47 | for device_map in "${device_maps[@]}"; do 48 | for engine in "${engines[@]}"; do 49 | cmd="$build_path/$TARGET -m $model -e $engine -b $batch_size -d $device_map" 50 | echo "Run $cmd" 51 | 52 | output=`$cmd` 53 | echo "$output" 54 | echo "" 55 | 56 | avg_lat=$(echo "$output" | awk '{if ($1 == "Average") { print $(NF-1)}}') 57 | min_lat=$(echo "$output" | awk '{if ($1 == "Min") { print $(NF-1)}}') 58 | max_lat=$(echo "$output" | awk '{if ($1 == "Max") { print $(NF-1)}}') 59 | 60 | printf "$avg_lat, " >> $tmp_avg_file 61 | printf "$min_lat, " >> $tmp_min_file 62 | printf "$max_lat, " >> $tmp_max_file 63 | done 64 | done 65 | 66 | echo "" >> $tmp_avg_file 67 | echo "" >> $tmp_min_file 68 | echo "" >> $tmp_max_file 69 | 70 | done 71 | 72 | log_path="$script_path/logs" 73 | 74 | # Check for log_path existence 75 | if [ ! -d "$log_path" ]; then 76 | echo "Created $log_path directory where log files will be stored" 77 | mkdir -p $log_path 78 | fi 79 | 80 | date=`date +%y-%m-%d` 81 | 82 | log_path="$log_path/$date" 83 | if [ ! -d "$log_path" ]; then 84 | mkdir -p "$log_path" 85 | fi 86 | 87 | output_path="$log_path/report" 88 | 89 | version=0 90 | while true; do 91 | _output="${output_path}$version" 92 | if [ -d "$_output" ]; then 93 | ((version++)) 94 | else 95 | break 96 | fi 97 | done 98 | 99 | output_path="$_output" 100 | 101 | mkdir -p $output_path 102 | 103 | avg_file="$output_path/result_avg.csv" 104 | min_file="$output_path/result_min.csv" 105 | max_file="$output_path/result_max.csv" 106 | 107 | cp $tmp_avg_file $avg_file 108 | cp $tmp_min_file $min_file 109 | cp $tmp_max_file $max_file 110 | 111 | echo "Created log files in '$output_path'" 112 | 113 | is_installed=$(pip list | grep -F matplotlib) 114 | 115 | if [ -z "$is_installed" ]; then 116 | echo "Matplotlib is not installed. So the graph can not be created." 
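# The CSV files above are kept even when plotting is skipped, so the figure can
# be regenerated later with the same command used in the else branch below:
#   python3 graph.py <result_avg.csv> <result_min.csv> <result_max.csv> fig10.pdf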
117 | else 118 | eval "python3 graph.py $avg_file $min_file $max_file fig10.pdf" 119 | fi 120 | -------------------------------------------------------------------------------- /scripts/fig12/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[3]: 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from mpl_toolkits.axes_grid1 import make_axes_locatable 9 | from matplotlib import gridspec 10 | import sys 11 | import os 12 | import csv 13 | 14 | def get_data(target): 15 | target = "{}/bert_base_{}".format(sys.argv[1], target) 16 | target = target.strip() 17 | if target[0] != '/': 18 | target = os.path.join(os.getcwd(), target) 19 | 20 | latency = np.array([]) 21 | goodput = np.array([]) 22 | cold = np.array([]) 23 | 24 | result = [] 25 | 26 | with open(target, 'r', encoding='utf-8') as f: 27 | rdr = csv.reader(f) 28 | for i, line in enumerate(rdr): 29 | latency = np.append(latency, float(line[0])) 30 | goodput = np.append(goodput, float(line[1])) 31 | cold = np.append(cold, float(line[2])) 32 | 33 | result.append(latency) 34 | result.append(goodput) 35 | result.append(cold) 36 | 37 | return result 38 | 39 | x_value = [20 * i for i in range(1, 11)] 40 | 41 | 42 | label_list = ["PipeSwitch", "DeepPlan (DHA)", "DeepPlan (PT+DHA)"] 43 | # color_list = ['#AEB6BF', '#5D6D7E', '#34495E', '#273746', '#273746'] 44 | color_list = ['#AEB6BF', '#5D6D7E', '#273746'] 45 | marker_list = ['o', '^', 'P'] 46 | line_list = ['-', 'dotted', 'dashed'] 47 | 48 | # Prepare these files 49 | engine_list = ["pipeline.csv", "deepplan.csv", "deepplan+.csv"] 50 | 51 | ylim_list = { 52 | "bert_base": [(0, 300), (30, 105), (0, 60)], 53 | "bert_large": [], 54 | "roberta_base": [], 55 | "roberta_large": [], 56 | "gpt2": [], 57 | "gpt2_medium": []} 58 | 59 | 60 | x_label = "# of model instances (concurrency)" 61 | y_label = ["99 % latency (ms)", "Goodput (%)", "Cold-start (%)"] 62 | 63 | FONTSIZE_LABEL = 16 64 | FONTSIZE_TICK = 13 65 | FONTSIZE_LEGEND = 14 66 | SIZE_FIGURE = (7, 7) 67 | LINE_WIDTH = 3 68 | ARKER_SIZE = 10 69 | MARKER_SIZE = 10 70 | 71 | 72 | plt.figure(figsize=SIZE_FIGURE) 73 | gs = gridspec.GridSpec(nrows=3, 74 | ncols=1, 75 | height_ratios=[1, 0.8, 0.8] 76 | ) 77 | 78 | li_ax = [] 79 | for i in range(0, 3): 80 | li_ax.append(plt.subplot(gs[i])) 81 | 82 | for i, engine in enumerate(engine_list): 83 | result = get_data(engine) 84 | 85 | for j, ax in enumerate(li_ax): 86 | 87 | ax.plot(x_value, result[j], linewidth = LINE_WIDTH, color=color_list[i], marker=marker_list[i], linestyle=line_list[i], markersize=MARKER_SIZE) 88 | 89 | ax.set_ylim(ylim_list["bert_base"][j]) 90 | ax.tick_params(axis='both', labelsize=FONTSIZE_TICK) 91 | 92 | ax.set_xticks(x_value) 93 | if j < 2: 94 | ax.axes.xaxis.set_ticklabels([]) 95 | ax.set_ylabel(y_label[j], fontsize=FONTSIZE_LABEL, labelpad=10) 96 | else: 97 | ax.set_ylabel(y_label[j], fontsize=FONTSIZE_LABEL, labelpad=18) 98 | 99 | 100 | if j == 0: 101 | ax.axhline(y=100, color='gray', linestyle='--') 102 | ax.text(20, 150, "Target SLO", fontsize=FONTSIZE_TICK) 103 | 104 | ax.grid(alpha=0.5, linestyle='--') 105 | 106 | plt.legend(labels=label_list, bbox_to_anchor=(0.43, 3.55), ncol=3, loc='center', columnspacing=0.6, 107 | fontsize=FONTSIZE_LEGEND, edgecolor="#FFFFFF") 108 | 109 | plt.xlabel(x_label, fontsize=FONTSIZE_LABEL, labelpad=10) 110 | 111 | plt.subplots_adjust(hspace=0.06) 112 | plt.rcParams["font.family"] = "Helvetica" 113 | plt.savefig(sys.argv[2], 
bbox_inches="tight", pad_inches=0.0) 114 | 115 | print("Saved graph to {}".format(sys.argv[2])) 116 | -------------------------------------------------------------------------------- /scripts/fig12/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PLAN_REPO=${PLAN_REPO} 4 | if [[ -z "$PLAN_REPO" ]]; then 5 | echo "PLAN_REPO environment variable not set, please set this variable" 6 | exit 1 7 | fi 8 | 9 | export PLAN_REPO=${PLAN_REPO} 10 | 11 | script_path=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 12 | build_path="$script_path/../../build" 13 | 14 | TARGET="client" 15 | 16 | model_name="bert_base" 17 | min_conc=20 18 | max_conc=200 19 | step_conc=20 20 | rate=100 21 | 22 | engines=("deepplan+" "deepplan" "pipeline") 23 | 24 | server_cmd="$build_path/server" 25 | 26 | echo "Run Server" 27 | $server_cmd 1> /dev/null & 28 | 29 | echo "Wait 30 seconds for the server to be ready." 30 | sleep 30 31 | 32 | for engine in "${engines[@]}"; do 33 | p_option=1 34 | 35 | _engine=$engine 36 | if [ "$engine" = "deepplan+" ]; then 37 | _engine="deepplan" 38 | p_option=2 39 | fi 40 | 41 | tmp_file="/tmp/deepplan_${engine}_fig12" 42 | printf "" > $tmp_file 43 | 44 | echo "Model Setup" 45 | client_cmd="$build_path/client -m $model_name -e $_engine -r $rate -c $max_conc -w simple -p $p_option" 46 | $client_cmd 1> /dev/null 47 | 48 | echo "Start Experiment ($engine)" 49 | for ((c=$min_conc; c<=$max_conc; c+=$step_conc)); do 50 | echo "== Concurrency $c ==" 51 | client_cmd="$build_path/client -m $model_name -e $_engine -r $rate -c $c -w simple -p $p_option" 52 | output=`$client_cmd` 53 | 54 | latency=$(echo "$output" | awk '{if ($2 == "Latency:") { print $(NF-1)}}') 55 | goodput_rate=$(echo "$output" | awk '{if ($1 == "Goodput") { print $(NF-1)}}') 56 | cold_rate=$(echo "$output" | awk '{if ($1 == "Cold") { print $(NF-1)}}') 57 | echo "$output" 58 | printf "$latency, $goodput_rate, $cold_rate" >> $tmp_file 59 | echo "" >> $tmp_file 60 | done 61 | 62 | done 63 | 64 | log_path="$script_path/logs" 65 | 66 | # Check for log_path existence 67 | if [ ! -d "$log_path" ]; then 68 | mkdir -p $log_path 69 | echo "Created $log_path directory where log files will be stored" 70 | fi 71 | 72 | date=`date +%y-%m-%d` 73 | 74 | log_path="$log_path/$date/report" 75 | 76 | version=0 77 | while true; do 78 | _log_path="${log_path}$version" 79 | if [ -d "$_log_path" ]; then 80 | ((version++)) 81 | else 82 | break 83 | fi 84 | done 85 | 86 | log_path=$_log_path 87 | mkdir -p "$log_path" 88 | 89 | for engine in "${engines[@]}"; do 90 | tmp_file="/tmp/deepplan_${engine}_fig12" 91 | 92 | output_file="$log_path/${model_name}_${engine}.csv" 93 | 94 | cp $tmp_file $output_file 95 | done 96 | 97 | server_pid=$(ps -ef | grep -v grep | grep "$server_cmd" | awk '{print $2}') 98 | kill -s SIGINT $server_pid 99 | 100 | echo "Closing Server" 101 | 102 | wait 103 | 104 | is_installed=$(pip list | grep -F matplotlib) 105 | 106 | if [ -z "$is_installed" ]; then 107 | echo "Matplotlib is not installed. So the graph can not be created."
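# Each ${model_name}_${engine}.csv saved above holds one "latency, goodput,
# cold" row per concurrency level (99% latency in ms, goodput %, cold-start %),
# which is exactly the column order fig12/graph.py's get_data() reads back.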
108 | else 109 | eval "python3 graph.py $log_path fig12.pdf" 110 | fi 111 | -------------------------------------------------------------------------------- /scripts/fig13/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[3]: 5 | 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from mpl_toolkits.axes_grid1 import make_axes_locatable 10 | import sys 11 | import os 12 | import csv 13 | 14 | def get_data(model, engine): 15 | target = "{}/{}_{}.csv".format(sys.argv[1], model, engine) 16 | target = target.strip() 17 | if target[0] != '/': 18 | target = os.path.join(os.getcwd(), target) 19 | 20 | lat = np.array([]) 21 | 22 | with open(target, 'r', encoding='utf-8') as f: 23 | rdr = csv.reader(f) 24 | for i, line in enumerate(rdr): 25 | lat = np.append(lat, float(line[0])) 26 | 27 | return lat 28 | 29 | 30 | label_list = ["PipeSwitch", "DeepPlan (DHA)", "DeepPlan (PT+DHA)"] 31 | #color_list = ['#AEB6BF', '#5D6D7E', '#34495E', '#273746', '#273746'] 32 | color_list = ['#AEB6BF', '#5D6D7E', '#273746'] 33 | model_list = ["bert_large", "gpt2"] 34 | engine_list = ["pipeline", "deepplan", "deepplan+"] 35 | marker_list = ['o', '^', 'P'] 36 | line_list = ['-', 'dotted', 'dashed'] 37 | 38 | x_value_list = { 39 | "bert_large": [5 * i for i in range(1, 12)], 40 | "gpt2": [20 * i for i in range(1, 11)], 41 | } 42 | 43 | ylim_list = { # adjust these limits after checking how the graphs look. 44 | "bert_base": [], 45 | "bert_large": [0, 850], 46 | "roberta_base": [], 47 | "roberta_large": [], 48 | "gpt2": [0, 900], 49 | "gpt2_medium": [] 50 | } 51 | 52 | 53 | x_label = "# of model instances (concurrency)" 54 | y_label = "99 % latency (ms)" 55 | 56 | FONTSIZE_LABEL = 16 57 | FONTSIZE_TICK = 15 58 | FONTSIZE_LEGEND = 14 59 | SIZE_FIGURE = (7, 7) 60 | LINE_WIDTH = 3 61 | MARKER_SIZE = 10 62 | 63 | plt.figure(figsize=SIZE_FIGURE) 64 | 65 | li_ax = [] 66 | for i in range(1, len(model_list) + 1): 67 | li_ax.append(plt.subplot(len(model_list), 1, i)) 68 | 69 | for i, model in enumerate(model_list): 70 | 71 | graph_title = "" 72 | if model_list[i] == "bert_large": 73 | graph_title = "BERT-Large" 74 | 75 | elif model_list[i] == "gpt2": 76 | graph_title = "GPT-2" 77 | 78 | for j, engine in enumerate(engine_list): 79 | result = get_data(model, engine) 80 | 81 | li_ax[i].plot(x_value_list[model_list[i]], result, linewidth = LINE_WIDTH, color=color_list[j], marker=marker_list[j], linestyle=line_list[j], markersize=MARKER_SIZE) 82 | 83 | li_ax[i].set_title(graph_title, fontsize=FONTSIZE_LABEL+2) 84 | li_ax[i].set_xticks(x_value_list[model_list[i]]) #, fontsize=FONTSIZE_TICK) 85 | li_ax[i].set_ylim(ylim_list[model_list[i]]) 86 | 87 | li_ax[i].set_ylabel(y_label, fontsize=FONTSIZE_LABEL, labelpad=10) 88 | li_ax[i].tick_params(which="major", labelsize=FONTSIZE_TICK) 89 | li_ax[i].grid(alpha=0.5, linestyle='--') 90 | 91 | 92 | plt.legend(labels=label_list, bbox_to_anchor=(0.45, 2.60), ncol=3, loc='center', columnspacing=0.5, 93 | fontsize=FONTSIZE_LEGEND, edgecolor="#FFFFFF") 94 | 95 | plt.xlabel(x_label, fontsize=FONTSIZE_LABEL, labelpad=10) 96 | 97 | plt.subplots_adjust(hspace=0.35) 98 | plt.rcParams["font.family"] = "Helvetica" 99 | plt.savefig(sys.argv[2], bbox_inches="tight", pad_inches=0.0) 100 | print("Saved graph to {}".format(sys.argv[2])) 101 | -------------------------------------------------------------------------------- /scripts/fig13/run.sh: -------------------------------------------------------------------------------- 1 |
#!/bin/bash 2 | 3 | PLAN_REPO=${PLAN_REPO} 4 | 5 | if [[ -z "$PLAN_REPO" ]]; then 6 | echo "PLAN_REPO environment variable not set, please set this variable" 7 | exit 1 8 | fi 9 | 10 | export PLAN_REPO=${PLAN_REPO} 11 | 12 | script_path=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 13 | build_path="$script_path/../../build" 14 | 15 | TARGET="client" 16 | 17 | engines=("deepplan+" "deepplan" "pipeline") 18 | 19 | server_cmd="$build_path/server" 20 | 21 | echo "Run Server" 22 | $server_cmd 1> /dev/null & 23 | 24 | echo "Wait 30 seconds for the server to be ready." 25 | sleep 30 26 | 27 | model_name="bert_large" min_conc=5 max_conc=55 step_conc=5 rate=30 28 | 29 | for engine in "${engines[@]}"; do p_option=1 30 | 31 | _engine=$engine 32 | if [ "$engine" = "deepplan+" ]; then 33 | _engine="deepplan" 34 | p_option=2 35 | fi 36 | 37 | tmp_file="/tmp/deepplan_${model_name}_${engine}_fig13" 38 | printf "" > $tmp_file 39 | 40 | echo "Model Setup" 41 | client_cmd="$build_path/client -m $model_name -e $_engine -r $rate -c 60 -w simple -p $p_option" 42 | $client_cmd 1> /dev/null 43 | 44 | echo "Start Experiment ($engine)" 45 | for ((c=$min_conc; c<=$max_conc; c+=$step_conc)); do 46 | echo "== Concurrency $c ==" 47 | client_cmd="$build_path/client -m $model_name -e $_engine -r $rate -c $c -w simple -p $p_option" 48 | output=`$client_cmd` 49 | 50 | latency=$(echo "$output" | awk '{if ($2 == "Latency:") { print $(NF-1)}}') 51 | goodput_rate=$(echo "$output" | awk '{if ($1 == "Goodput") { print $(NF-1)}}') 52 | cold_rate=$(echo "$output" | awk '{if ($1 == "Cold") { print $(NF-1)}}') 53 | echo "$output" 54 | printf "$latency, $goodput_rate, $cold_rate" >> $tmp_file 55 | echo "" >> $tmp_file 56 | 57 | done 58 | done 59 | 60 | server_pid=$(ps -ef | grep -v grep | grep "$server_cmd" | awk '{print $2}') 61 | kill -s SIGINT $server_pid 62 | 63 | echo "Closing Server" 64 | 65 | wait 66 | 67 | echo "Run Server" 68 | $server_cmd 1> /dev/null & 69 | 70 | echo "Wait 30 seconds for the server to be ready." 71 | sleep 30 72 | 73 | model_name="gpt2" 74 | min_conc=20 75 | max_conc=200 76 | step_conc=20 77 | rate=90 78 | 79 | for engine in "${engines[@]}"; do 80 | p_option=1 81 | 82 | _engine=$engine 83 | if [ "$engine" = "deepplan+" ]; then 84 | _engine="deepplan" 85 | p_option=2 86 | fi 87 | 88 | tmp_file="/tmp/deepplan_${model_name}_${engine}_fig13" 89 | printf "" > $tmp_file 90 | 91 | echo "Model Setup" 92 | client_cmd="$build_path/client -m $model_name -e $_engine -r $rate -c $max_conc -w simple -p $p_option" 93 | $client_cmd 1> /dev/null 94 | 95 | echo "Start Experiment ($engine)" 96 | for ((c=$min_conc; c<=$max_conc; c+=$step_conc)); do 97 | echo "== Concurrency $c ==" 98 | client_cmd="$build_path/client -m $model_name -e $_engine -r $rate -c $c -w simple -p $p_option -s 200" 99 | output=`$client_cmd` 100 | 101 | latency=$(echo "$output" | awk '{if ($2 == "Latency:") { print $(NF-1)}}') 102 | goodput_rate=$(echo "$output" | awk '{if ($1 == "Goodput") { print $(NF-1)}}') 103 | cold_rate=$(echo "$output" | awk '{if ($1 == "Cold") { print $(NF-1)}}') 104 | echo "$output" 105 | printf "$latency, $goodput_rate, $cold_rate" >> $tmp_file 106 | echo "" >> $tmp_file 107 | done 108 | 109 | 110 | done 111 | 112 | server_pid=$(ps -ef | grep -v grep | grep "$server_cmd" | awk '{print $2}') 113 | kill -s SIGINT $server_pid 114 | 115 | echo "Closing Server" 116 | 117 | wait 118 | 119 | log_path="$script_path/logs" 120 | 121 | # Check for log_path existence 122 | if [ !
-d "$log_path" ]; then 123 | mkdir -p $log_path 124 | echo "Created $log_path directory where log files will be stored" 125 | fi 126 | 127 | date=`date +%y-%m-%d` 128 | 129 | log_path="$log_path/$date/report" 130 | 131 | version=0 132 | while true; do 133 | _log_path="${log_path}$version" 134 | if [ -d "$_log_path" ]; then 135 | ((version++)) 136 | else 137 | break 138 | fi 139 | done 140 | 141 | log_path=$_log_path 142 | mkdir -p "$log_path" 143 | 144 | model_names=("bert_large" "gpt2") 145 | 146 | for model in "${model_names[@]}"; do 147 | for engine in "${engines[@]}"; do 148 | tmp_file="/tmp/deepplan_${model}_${engine}_fig13" 149 | 150 | output_file="$log_path/${model}_${engine}.csv" 151 | 152 | cp $tmp_file $output_file 153 | 154 | echo "Created '$output_file' log file" 155 | done 156 | done 157 | 158 | is_installed=$(pip list | grep -F matplotlib) 159 | 160 | if [ -z "$is_installed" ]; then 161 | echo "Matplotlib is not installed. So the graph can not be created." 162 | else 163 | eval "python3 graph.py $log_path fig13.pdf" 164 | fi 165 | -------------------------------------------------------------------------------- /scripts/fig14/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[3]: 5 | 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from mpl_toolkits.axes_grid1 import make_axes_locatable 10 | from matplotlib import gridspec 11 | import sys 12 | import os 13 | import csv 14 | 15 | def get_data(target): 16 | target = "{}/{}".format(sys.argv[1], target) 17 | target = target.strip() 18 | if target[0] != '/': 19 | target = os.path.join(os.getcwd(), target) 20 | 21 | if "offered" in target: 22 | offered_load = np.array([]) 23 | else : 24 | result = [] 25 | 26 | latency = np.array([]) 27 | goodput = np.array([]) 28 | cold = np.array([]) 29 | 30 | with open(target, 'r', encoding='utf-8') as f: 31 | rdr = csv.reader(f) 32 | if "offered" in target: 33 | for i, line in enumerate(rdr): 34 | offered_load = np.append(offered_load, int(line[0])) 35 | return offered_load 36 | 37 | else: 38 | for i, line in enumerate(rdr): 39 | latency = np.append(latency, float(line[0])) 40 | cold = np.append(cold, float(line[1])) 41 | goodput = np.append(goodput, float(line[2])) 42 | 43 | result.append(latency) 44 | result.append(goodput) 45 | result.append(cold) 46 | 47 | return result 48 | 49 | 50 | 51 | x_value = [i for i in range(1, 181)] 52 | x_ticks = [30 * i for i in range(0, 7)] 53 | 54 | 55 | label_list = ["PipeSwitch", "DeepPlan (DHA)", "DeepPlan (PT+DHA)"] 56 | #color_list = ['#EAECEE', '#AEB6BF', '#85929E', '#5D6D7E', '#34495E', '#273746'] 57 | 58 | # Prepare these files 59 | engine_list = ["pipeline.csv", "deepplan.csv", "deepplan+.csv"] 60 | 61 | color_list = ['#AEB6BF', '#5D6D7E', '#273746'] 62 | line_list = ['solid', 'dotted', 'dashdot'] 63 | 64 | ylim_list = [(5500, 10001), (0, 550), (50, 103), (0, 22.5)] 65 | 66 | x_label = "Time (minutes)" 67 | y_label = ["Offered load\n (req./min.)", "99 % latency\n (ms)", "Goodput\n (%)", "Cold-start\n (%)"] 68 | 69 | FONTSIZE_XLABEL = 16 70 | FONTSIZE_YLABEL = 14 71 | FONTSIZE_TICK = 13 72 | FONTSIZE_LEGEND = 14 73 | SIZE_FIGURE = (7, 7) 74 | LINE_WIDTH = 1.5 75 | ARKER_SIZE = 10 76 | MARKER_SIZE = 10 77 | 78 | 79 | plt.figure(figsize=SIZE_FIGURE) 80 | gs = gridspec.GridSpec(nrows=4, # row 몇 개 81 | ncols=1, # col 몇 개 82 | height_ratios=[0.8, 1, 0.8, 0.8] 83 | ) 84 | 85 | li_ax = [] 86 | for i in range(0, 4): 87 | 
li_ax.append(plt.subplot(gs[i])) 88 | 89 | if i == 0: # Offered Load graph 90 | offered_load = get_data("offered_load.csv") 91 | 92 | li_ax[i].plot(x_value, offered_load, linewidth = LINE_WIDTH, color='#000000', linestyle="solid") 93 | li_ax[i].set_ylim(ylim_list[i]) 94 | li_ax[i].tick_params(axis="both", labelsize=FONTSIZE_TICK) 95 | li_ax[i].set_xticks(x_ticks) 96 | 97 | li_ax[i].axes.xaxis.set_ticklabels([]) 98 | 99 | li_ax[i].set_ylabel(y_label[i], fontsize=FONTSIZE_YLABEL) 100 | li_ax[i].get_yaxis().set_label_coords(-0.13, 0.5) 101 | 102 | li_ax[i].set_xlim(0, 180) 103 | 104 | li_ax[i].grid(alpha=1, linestyle='--') 105 | 106 | 107 | for i, engine in enumerate(engine_list): 108 | result = get_data(engine) # Read data 109 | for j, ax in enumerate(li_ax): 110 | if j > 0: 111 | ax.plot(x_value, result[j-1], linewidth = LINE_WIDTH, color=color_list[i], linestyle=line_list[i], markersize=MARKER_SIZE) 112 | 113 | ax.set_ylim(ylim_list[j]) 114 | ax.tick_params(axis='both', labelsize=FONTSIZE_TICK) 115 | 116 | ax.set_xticks(x_ticks) 117 | if j < 3: 118 | ax.axes.xaxis.set_ticklabels([]) 119 | 120 | ax.set_ylabel(y_label[j], fontsize=FONTSIZE_YLABEL) 121 | ax.get_yaxis().set_label_coords(-0.13, 0.5) 122 | 123 | ax.set_xlim(0, 180) 124 | 125 | ax.grid(alpha=1, linestyle='--') 126 | 127 | plt.legend(labels=label_list, bbox_to_anchor=(0.43, 4.7), ncol=3, loc='center', columnspacing=0.6, 128 | fontsize=FONTSIZE_LEGEND, edgecolor="#FFFFFF") 129 | 130 | plt.xlabel(x_label, fontsize=FONTSIZE_XLABEL, labelpad=10) 131 | 132 | plt.subplots_adjust(hspace=0.06) 133 | plt.rcParams["font.family"] = "Helvetica" 134 | plt.savefig(sys.argv[2], bbox_inches="tight", pad_inches=0.0) 135 | 136 | print("Saved graph to {}".format(sys.argv[2])) 137 | -------------------------------------------------------------------------------- /scripts/fig14/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PLAN_REPO=${PLAN_REPO} 4 | if [[ -z "$PLAN_REPO" ]]; then 5 | echo "PLAN_REPO environment variable not set, please set this variable" 6 | exit 1 7 | fi 8 | 9 | AZURE_TRACE_DIR=${AZURE_TRACE_DIR} 10 | if [[ -z "$AZURE_TRACE_DIR" ]]; then 11 | echo "AZURE_TRACE_DIR environment variable not set, please set this variable" 12 | exit 1 13 | fi 14 | 15 | export PLAN_REPO=${PLAN_REPO} 16 | 17 | script_path=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 18 | build_path="$script_path/../../build" 19 | 20 | model_list="gpt2_384 bert_base bert_base bert_base bert_base roberta_base roberta_base roberta_base roberta_base" 21 | conc=252 22 | rate=150 23 | 24 | engines=("deepplan+" "deepplan" "pipeline") 25 | 26 | for engine in "${engines[@]}"; do 27 | server_cmd="$build_path/server" 28 | 29 | echo "Run Server" 30 | $server_cmd 1> /dev/null & 31 | 32 | echo "Wait 30 seconds for the server to be ready."
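# The server binary gives these scripts no explicit readiness signal, so a
# fixed 30-second sleep is used before the client connects (the same pattern
# as in the fig12 and fig13 scripts).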
33 | sleep 30 34 | 35 | p_option=1 36 | 37 | _engine=$engine 38 | if [ "$engine" = "deepplan+" ]; then 39 | _engine="deepplan" 40 | p_option=2 41 | fi 42 | 43 | tmp_file="/tmp/deepplan_${engine}_fig14" 44 | printf "" > $tmp_file 45 | 46 | echo "Start Experiment ($engine)" 47 | client_cmd="$build_path/client -m $model_list -e $_engine -r $rate -c $conc -w azure -p $p_option" 48 | stdbuf --output=L $client_cmd | tee -a $tmp_file 49 | 50 | server_pid=$(ps -ef | grep -v grep | grep "$server_cmd" | awk '{print $2}') 51 | kill -s SIGINT $server_pid 52 | 53 | echo "Closing Server" 54 | 55 | wait 56 | 57 | done 58 | 59 | log_path="$script_path/logs" 60 | 61 | # Check for log_path existence 62 | if [ ! -d "$log_path" ]; then 63 | mkdir -p $log_path 64 | echo "Created $log_path directory where log files will be stored" 65 | fi 66 | 67 | date=`date +%y-%m-%d` 68 | 69 | log_path="$log_path/$date/report" 70 | 71 | version=0 72 | while true; do 73 | _log_path="${log_path}$version" 74 | if [ -d "$_log_path" ]; then 75 | ((version++)) 76 | else 77 | break 78 | fi 79 | done 80 | 81 | log_path=$_log_path 82 | mkdir -p "$log_path" 83 | 84 | for engine in "${engines[@]}"; do 85 | tmp_file="/tmp/deepplan_${engine}_fig14" 86 | 87 | output_file="$log_path/${engine}.csv" 88 | 89 | awk '$1 ~ /^[0-9]*,/ { print $3 $4 $5 }' $tmp_file > $output_file 90 | 91 | echo "Created '$output_file' log file" 92 | done 93 | 94 | output_file="$log_path/offered_load.csv" 95 | awk '$1 ~ /^[0-9]*,/ { print $2 }' $tmp_file > "$log_path/offered_load.csv" 96 | echo "Created '$output_file' log file" 97 | 98 | is_installed=$(pip list | grep -F matplotlib) 99 | 100 | if [ -z "$is_installed" ]; then 101 | echo "Matplotlib is not installed. So the graph can not be created." 102 | else 103 | eval "python3 graph.py $log_path fig14.pdf" 104 | fi 105 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 2 | 3 | set(DEEPPLAN_SRC 4 | util.cpp 5 | deepplan/model.cpp 6 | deepplan/engine.cpp 7 | ) 8 | 9 | set(NETWORK_SRC 10 | network/network.cpp 11 | network/session.cpp 12 | network/server_api.cpp) 13 | 14 | 15 | set(SERVER_SRC 16 | server.cpp 17 | server/server.cpp 18 | server/controller.cpp 19 | server/model_manager.cpp 20 | server/worker.cpp 21 | ) 22 | 23 | set(CLIENT_SRC 24 | client.cpp 25 | client/client.cpp 26 | client/workload.cpp 27 | ) 28 | 29 | set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 30 | 31 | add_library(deepplan ${DEEPPLAN_SRC}) 32 | target_link_libraries(deepplan PRIVATE 33 | deepplan_proto 34 | tbb 35 | ${Boost_LIBRARIES} 36 | ${TORCH_LIBRARIES} 37 | ) 38 | 39 | add_library(network ${NETWORK_SRC}) 40 | target_link_libraries(network PRIVATE 41 | deepplan_proto 42 | deepcache_proto 43 | tbb 44 | ${Boost_LIBRARIES} 45 | ) 46 | 47 | add_executable(benchmark benchmark.cpp) 48 | target_link_libraries(benchmark 49 | deepplan 50 | ${Boost_LIBRARIES} 51 | ${TORCH_LIBRARIES} 52 | ) 53 | 54 | add_executable(server ${SERVER_SRC}) 55 | target_link_libraries(server 56 | deepplan 57 | network 58 | tbb 59 | ${Boost_LIBRARIES} 60 | ${TORCH_LIBRARIES} 61 | ) 62 | 63 | add_executable(client ${CLIENT_SRC}) 64 | target_link_libraries(client 65 | deepplan 66 | network 67 | tbb 68 | ${Boost_LIBRARIES} 69 | ${TORCH_LIBRARIES} 70 | ) 71 | -------------------------------------------------------------------------------- /src/benchmark.cpp: 
-------------------------------------------------------------------------------- 1 | #include <getopt.h> // NOTE: angle-bracket tokens (header names and template arguments) were garbled in this listing; they are reconstructed here from usage 2 | #include <cstdio> 3 | #include <cstdlib> 4 | #include <cstring> 5 | #include <string> 6 | #include <vector> 7 | #include <iostream> 8 | #include <algorithm> 9 | #include <numeric> 10 | 11 | #include <torch/torch.h> 12 | #include <deepplan/model.h> 13 | #include <util.h> 14 | 15 | struct BenchmarkOptions { 16 | std::string model_name; 17 | EngineType engine_type; 18 | std::vector<int> devices; 19 | int batch_size; 20 | int num_warmup; 21 | int num_test; 22 | }; 23 | 24 | static struct option long_options[] = 25 | { 26 | {"help", no_argument, 0, 'h' }, 27 | {"model", required_argument, 0, 'm' }, 28 | {"engine", required_argument, 0, 'e' }, 29 | {"devices", required_argument, 0, 'd' }, 30 | {"batch", required_argument, 0, 'b' }, 31 | {0, 0, 0, 0 } 32 | }; 33 | 34 | static void print_usage(char* program_name) { 35 | fprintf(stderr, 36 | "Usage : %s [-h] --model/-m MODEL_NAME [--device/-d DEVICES [DEVICES ...]]\n" 37 | "\t\t[--engine/-e {in_memory,demand,pipeline,deepplan}]\n" 38 | "\t\t[--batch/-b BATCH_SIZE]\n", 39 | program_name); 40 | } 41 | 42 | void parseOptions(BenchmarkOptions** benchmark_options, int argc, char** argv) { 43 | *benchmark_options = new BenchmarkOptions(); 44 | auto options = *benchmark_options; 45 | int flag; 46 | 47 | char engine_types[][20] = { "in_memory", "demand", "pipeline", "deepplan"}; 48 | int n_types = sizeof(engine_types) / 20; 49 | bool found = false; 50 | bool pass_model = false; 51 | 52 | options->num_warmup = 20; 53 | options->num_test = 200; 54 | options->batch_size = 1; 55 | options->engine_type = EngineType::IN_MEMORY; 56 | options->devices = std::vector<int>(1, 0); // = [0] 57 | 58 | while ((flag = getopt_long(argc, argv, "b:d:e:hm:", long_options, NULL)) != -1) { 59 | switch (flag) { 60 | case 'h': 61 | print_usage(argv[0]); 62 | break; 63 | case 'm': 64 | options->model_name = std::string(optarg); 65 | pass_model = true; 66 | break; 67 | case 'e': 68 | found = false; 69 | for (int i = 0; i < n_types; i++) { 70 | if (!strcmp(engine_types[i], optarg)) { 71 | options->engine_type = EngineType(i); 72 | found = true; 73 | break; 74 | } 75 | } 76 | 77 | if (!found) { 78 | print_usage(argv[0]); 79 | fprintf(stderr, "[Error] argument --engine/-e: invalid choice: %s (choose from", 80 | optarg); 81 | for (int i = 0; i < n_types; i++) { 82 | fprintf(stderr, " \'%s\'", engine_types[i]); 83 | } 84 | fprintf(stderr, ")\n"); 85 | exit(EXIT_FAILURE); 86 | } 87 | break; 88 | case 'b': 89 | options->batch_size = (int)strtol(optarg, NULL, 10); 90 | break; 91 | case 'd': 92 | optind--; 93 | { 94 | std::vector<int> devices; 95 | for ( ; optind < argc && *argv[optind] != '-'; optind++) { 96 | devices.push_back((int)strtol(argv[optind], NULL, 10)); 97 | } 98 | options->devices = devices; 99 | } 100 | break; 101 | default: 102 | print_usage(argv[0]); 103 | exit(EXIT_FAILURE); 104 | break; 105 | 106 | } 107 | } 108 | 109 | if (!pass_model) { 110 | print_usage(argv[0]); 111 | fprintf(stderr, "[Error] the following arguments are required: --model_name/-m\n"); 112 | exit(EXIT_FAILURE); 113 | } 114 | } 115 | 116 | void benchmark(BenchmarkOptions* options) { 117 | double t1, t2, total_ms = 0; 118 | std::vector<double> latencies; 119 | 120 | int num_warmup = options->num_warmup; 121 | int num_test = options->num_test; 122 | int batch_size = options->batch_size; 123 | at::Device target_device(at::kCUDA, options->devices[0]); 124 | 125 | torch::NoGradGuard no_grad; 126 | 127 | deepplan::Model* model = new deepplan::Model( 128 | options->model_name, 129 | options->engine_type, 130 | options->devices); 131 | 132
| util::InputGenerator input_generator; 133 | 134 | ScriptModuleInput inputs; 135 | input_generator.generate_input(options->model_name, batch_size, &inputs); 136 | 137 | for (auto& input : inputs) { 138 | input = input.toTensor().to(model->target_device); 139 | } 140 | 141 | if (options->engine_type == IN_MEMORY) 142 | model->to(target_device); 143 | 144 | for (int step = 0; step < num_warmup+num_test; step++) { 145 | t1 = util::now(); 146 | 147 | if (options->engine_type == ON_DEMAND) { 148 | model->to(target_device, true); 149 | torch::cuda::synchronize(target_device.index()); 150 | } 151 | 152 | auto outputs = model->forward(inputs); 153 | 154 | torch::cuda::synchronize(target_device.index()); 155 | t2 = util::now(); 156 | 157 | if (options->engine_type != IN_MEMORY) { 158 | model->clear(); 159 | } 160 | 161 | if (step >= num_warmup) { 162 | latencies.push_back((t2-t1) / 1e6); 163 | } 164 | } 165 | 166 | std::sort(latencies.begin(), latencies.end()); 167 | 168 | total_ms = std::accumulate(latencies.begin(), latencies.end(), 0.f); 169 | double avg_latency = total_ms / num_test; 170 | 171 | std::cout << "Average Latency : " << avg_latency << " ms\n"; 172 | std::cout << "Min Latency : " << latencies.front() << " ms\n"; 173 | std::cout << "Max Latency : " << latencies.back() << " ms\n"; 174 | 175 | return; 176 | } 177 | 178 | int main(int argc, char** argv) { 179 | BenchmarkOptions* benchmark_options; 180 | parseOptions(&benchmark_options, argc, argv); 181 | 182 | std::cout << "Benchmarking Inference " << benchmark_options->model_name << "\n"; 183 | 184 | deepplan::Init(); 185 | 186 | benchmark(benchmark_options); 187 | 188 | deepplan::Deinit(); 189 | 190 | return 0; 191 | } 192 | -------------------------------------------------------------------------------- /src/client.cpp: -------------------------------------------------------------------------------- 1 | #include <getopt.h> // NOTE: angle-bracket tokens (header names and template arguments) were garbled in this listing; they are reconstructed here from usage 2 | #include <iostream> 3 | 4 | #include <client/client.h> 5 | #include <client/workload.h> 6 | #include <client/azure.h> 7 | 8 | typedef enum { 9 | SIMPLE = 0, 10 | BURSTY, 11 | AZURE, 12 | } WorkloadType; 13 | 14 | struct ClientOptions { 15 | WorkloadType workload_type; 16 | std::vector<std::string> model_names; 17 | int concurrency; 18 | int rate; 19 | int mp_size; 20 | EngineType engine_type; 21 | int slo; 22 | int n_warmup; 23 | int n_test; 24 | }; 25 | 26 | static struct option long_options[] = 27 | { 28 | {"help", no_argument, 0, 'h' }, 29 | {"workload", required_argument, 0, 'w' }, 30 | {"model", required_argument, 0, 'm' }, 31 | {"concurrency", required_argument, 0, 'c' }, 32 | {"rate", required_argument, 0, 'r' }, 33 | {"mp_size", required_argument, 0, 'p' }, 34 | {"engine", required_argument, 0, 'e' }, 35 | {"slo", required_argument, 0, 's' }, 36 | {0, 0, 0, 0} 37 | }; 38 | 39 | static void print_usage(char* program_name) { 40 | fprintf(stderr, 41 | "Usage : %s [-h] --workload/-w WORKLOAD --model/-m MODEL_NAME\n" 42 | "\t\t--concurrency/-c CONCURRENCY --rate/-r RATE [--mp_size/-p MP_SIZE]\n" 43 | "\t\t[--engine/-e {in_memory,demand,pipeline,deepplan}]\n" 44 | "\t\t[--slo/-s SLO]\n", 45 | program_name); 46 | } 47 | 48 | void parseOptions(ClientOptions** benchmark_options, int argc, char** argv) { 49 | *benchmark_options = new ClientOptions(); 50 | auto options = *benchmark_options; 51 | int flag; 52 | 53 | char engine_types[][20] = { "in_memory", "demand", "pipeline", "deepplan" }; 54 | char workload_types[][20] = { "simple", "bursty", "azure" }; 55 | int n_engine_types = sizeof(engine_types) / 20; 56 | int n_workload_types = sizeof(workload_types) / 20; 57 | bool found = false; 58 | bool
pass_model = false; 59 | bool pass_concurrency = false; 60 | bool pass_rate = false; 61 | 62 | options->mp_size = 1; 63 | options->n_warmup = 1000; 64 | options->n_test = 10000; 65 | options->engine_type = EngineType::DEEPPLAN; 66 | options->slo = 100; 67 | 68 | while ((flag = getopt_long(argc, argv, "c:e:hm:r:s:w:p:", long_options, NULL)) != -1) { 69 | switch (flag) { 70 | case 'h': 71 | print_usage(argv[0]); 72 | break; 73 | case 'm': 74 | optind--; 75 | { 76 | std::vector<std::string> model_names; 77 | for ( ; optind < argc && *argv[optind] != '-'; optind++) { 78 | model_names.push_back(std::string(argv[optind])); 79 | } 80 | options->model_names = model_names; 81 | } 82 | pass_model = true; 83 | break; 84 | case 'c': 85 | options->concurrency = (int)strtol(optarg, NULL, 10); 86 | pass_concurrency = true; 87 | break; 88 | case 'r': 89 | options->rate = (int)strtol(optarg, NULL, 10); 90 | pass_rate = true; 91 | break; 92 | case 'p': 93 | options->mp_size = (int)strtol(optarg, NULL, 10); 94 | break; 95 | case 's': 96 | options->slo = (int)strtol(optarg, NULL, 10); 97 | break; 98 | case 'e': 99 | found = false; 100 | for (int i = 0; i < n_engine_types; i++) { 101 | if (!strcmp(engine_types[i], optarg)) { 102 | options->engine_type = EngineType(i); 103 | found = true; 104 | break; 105 | } 106 | } 107 | 108 | if (!found) { 109 | print_usage(argv[0]); 110 | fprintf(stderr, "[Error] argument --engine/-e: invalid choice: %s (choose from", 111 | optarg); 112 | for (int i = 0; i < n_engine_types; i++) { 113 | fprintf(stderr, " \'%s\'", engine_types[i]); 114 | } 115 | fprintf(stderr, ")\n"); 116 | exit(EXIT_FAILURE); 117 | } 118 | break; 119 | case 'w': 120 | found = false; 121 | for (int i = 0; i < n_workload_types; i++) { 122 | if (!strcmp(workload_types[i], optarg)) { 123 | options->workload_type = WorkloadType(i); 124 | found = true; 125 | break; 126 | } 127 | } 128 | 129 | if (!found) { 130 | print_usage(argv[0]); 131 | fprintf(stderr, "[Error] argument --workload/-w: invalid choice: %s (choose from", 132 | optarg); 133 | for (int i = 0; i < n_workload_types; i++) { 134 | fprintf(stderr, " \'%s\'", workload_types[i]); 135 | } 136 | fprintf(stderr, ")\n"); 137 | exit(EXIT_FAILURE); 138 | } 139 | break; 140 | default: 141 | print_usage(argv[0]); 142 | exit(EXIT_FAILURE); 143 | break; 144 | 145 | } 146 | } 147 | 148 | if (!(pass_model && pass_concurrency && pass_rate)) { 149 | print_usage(argv[0]); 150 | fprintf(stderr, "[Error] the following arguments are required:"); 151 | if (!pass_model) 152 | fprintf(stderr, " --model_name/-m"); 153 | if (!pass_concurrency) 154 | fprintf(stderr, " --concurrency/-c"); 155 | if (!pass_rate) 156 | fprintf(stderr, " --rate/-r"); 157 | 158 | exit(EXIT_FAILURE); 159 | } 160 | 161 | } 162 | 163 | void simple_experiment(ClientOptions* options) { 164 | std::vector<std::string> model_names = options->model_names; 165 | int concurrency = options->concurrency; 166 | int rate = options->rate; 167 | int mp_size = options->mp_size; 168 | EngineType engine_type = options->engine_type; 169 | int slo = options->slo; 170 | 171 | int n_warmup = options->n_warmup; 172 | int n_test = rate * 100; 173 | 174 | auto model_loader = new ModelLoader(model_names, concurrency, engine_type, mp_size, 175 | "127.0.0.1", "4321"); 176 | 177 | std::cout << "Upload Model...\n"; 178 | model_loader->run(); 179 | 180 | auto warmup = new Workload(concurrency, rate, n_warmup, "127.0.0.1", "4321"); 181 | auto workload = new Workload(concurrency, rate, n_test, "127.0.0.1", "4321"); 182 | 183 | std::cout <<
"Warmup...\n"; 184 | warmup->run(model_loader->inputs); 185 | 186 | std::cout << "Test...\n"; 187 | workload->run(model_loader->inputs); 188 | 189 | auto result = workload->result(slo); 190 | 191 | std::cout << "99% Latency: " << result.latency_99 << " ms\n"; 192 | std::cout << "Cold Start Rate: " << result.cold_rate << " %\n"; 193 | std::cout << "Goodput Rate: " << result.goodput_rate << " %\n"; 194 | } 195 | 196 | void bursty_experiment(ClientOptions* options) { 197 | std::vector model_names = options->model_names; 198 | int concurrency = options->concurrency; 199 | int rate = options->rate; 200 | int mp_size = options->mp_size; 201 | int slo = options->slo; 202 | EngineType engine_type = options->engine_type; 203 | 204 | auto model_loader = new ModelLoader(model_names, concurrency, engine_type, mp_size, 205 | "127.0.0.1", "4321"); 206 | 207 | std::cout << "Upload Model...\n"; 208 | model_loader->run(); 209 | 210 | std::vector warmups; 211 | std::vector workloads; 212 | for (int i = 1; i <= concurrency; i++) { 213 | warmups.push_back(new Workload(i, rate, rate, "127.0.0.1", "4321")); 214 | 215 | workloads.push_back(new Workload(i, rate, rate, "127.0.0.1", "4321")); 216 | } 217 | 218 | std::cout << "Bursty Experiment\n"; 219 | std::cout << "Concurrency, 99% Latecny(ms), Cold Start Rate(%), Goodput Rate(%)\n"; 220 | for (int i = 0; i < concurrency; i++) { 221 | warmups[i]->run(model_loader->inputs); 222 | workloads[i]->run(model_loader->inputs); 223 | auto result = workloads[i]->result(slo); 224 | 225 | std::cout << i+1 << ", "; 226 | std::cout << result.latency_99 << ", "; 227 | std::cout << result.cold_rate << ", "; 228 | std::cout << result.goodput_rate << "\n"; 229 | } 230 | } 231 | 232 | void azure_experiment(ClientOptions* options) { 233 | std::vector model_names = options->model_names; 234 | int concurrency = options->concurrency; 235 | int rate = options->rate; 236 | int mp_size = options->mp_size; 237 | EngineType engine_type = options->engine_type; 238 | int slo = options->slo; 239 | 240 | auto model_loader = new ModelLoader(model_names, concurrency, engine_type, mp_size, 241 | "127.0.0.1", "4321"); 242 | 243 | std::cout << "Upload Model...\n"; 244 | model_loader->run(); 245 | 246 | auto scaled_traces = azure::load_scaled_trace(rate, concurrency, 2); 247 | 248 | azure::transpose(scaled_traces); 249 | 250 | int period = 180; 251 | std::vector workloads; 252 | for (int p = 0; p < period; p++) { 253 | workloads.push_back(new Workload(scaled_traces[p], "127.0.0.1", "4321")); 254 | } 255 | 256 | std::cout << "Azure Experiment\n"; 257 | std::cout << "Minutes, Offered Load, 99% Latecny(ms), Cold Start Rate(%), Goodput Rate(%)\n"; 258 | for (int p = 0; p < period; p++) { 259 | workloads[p]->run(model_loader->inputs); 260 | auto result = workloads[p]->result(slo); 261 | 262 | std::cout << p << ", "; 263 | std::cout << workloads[p]->n_requests << ", "; 264 | std::cout << result.latency_99 << ", "; 265 | std::cout << result.cold_rate << ", "; 266 | std::cout << result.goodput_rate << "\n"; 267 | } 268 | 269 | } 270 | 271 | 272 | int main(int argc, char** argv) { 273 | ClientOptions* client_options; 274 | parseOptions(&client_options, argc, argv); 275 | 276 | try { 277 | switch (client_options->workload_type) { 278 | case WorkloadType::SIMPLE: 279 | simple_experiment(client_options); 280 | break; 281 | case WorkloadType::BURSTY: 282 | bursty_experiment(client_options); 283 | break; 284 | case WorkloadType::AZURE: 285 | azure_experiment(client_options); 286 | break; 287 | } 288 | } 289 | 
catch (std::exception& e) { 290 | std::cerr << e.what() << "\n"; 291 | } 292 | 293 | return 0; 294 | } 295 | -------------------------------------------------------------------------------- /src/client/azure.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace azure { 4 | 5 | std::string get_trace_dir() { 6 | auto trace_dir = std::getenv("AZURE_TRACE_DIR"); 7 | 8 | if (trace_dir == nullptr) { return ""; } 9 | return std::string(trace_dir); 10 | } 11 | 12 | std::string get_trace_file(std::string trace_dir, int id) { 13 | std::stringstream ss; 14 | 15 | ss << trace_dir << "/invocations_per_function_md.anon.d"; 16 | if (id < 10) 17 | ss << "0"; 18 | ss << id << ".csv"; 19 | 20 | return ss.str(); 21 | } 22 | 23 | std::string get_trace(int id) { 24 | std::string trace_dir = get_trace_dir(); 25 | 26 | if (trace_dir == "") { 27 | std::cerr << "AZURE_TRACE_DIR variable not set, exiting\n"; 28 | exit(EXIT_FAILURE); 29 | } 30 | 31 | if (1 > id || id > 14) { 32 | std::cerr << "Azure workload_id must be between 1 and 14 inclusive. Got " 33 | << id << "\n"; 34 | exit(EXIT_FAILURE); 35 | } 36 | 37 | std::string trace_file = get_trace_file(trace_dir, id); 38 | 39 | return trace_file; 40 | } 41 | 42 | std::vector<std::string> split(std::string line) { 43 | std::vector<std::string> result; 44 | std::stringstream s(line); 45 | while (s.good()) { 46 | std::string substr; 47 | std::getline(s, substr, ','); 48 | result.push_back(substr); 49 | } 50 | return result; 51 | } 52 | 53 | std::vector<unsigned> process_trace_line(std::string line, unsigned start_index) { 54 | std::vector<std::string> splits = split(line); 55 | std::vector<unsigned> result; 56 | for (unsigned i = start_index; i < splits.size(); i++) { 57 | result.push_back(std::stoul(splits[i], NULL, 10)); 58 | } 59 | return result; 60 | } 61 | 62 | std::vector<std::vector<unsigned>> read_trace_data(std::string filename) { 63 | std::ifstream f(filename); 64 | 65 | std::vector<std::vector<unsigned>> results; 66 | std::vector<std::pair<int, size_t>> sizes; 67 | 68 | std::string line; 69 | std::getline(f, line); // Skip headers 70 | while (std::getline(f, line)) { 71 | auto traceline = process_trace_line(line, 4); 72 | int size = std::accumulate(traceline.begin(), traceline.end(), 0); 73 | sizes.push_back(std::make_pair(size, results.size())); 74 | results.push_back(traceline); 75 | } 76 | 77 | std::sort(sizes.begin(), sizes.end()); 78 | 79 | std::vector<std::vector<unsigned>> ordered; 80 | for (int i = sizes.size()-1; i >= 0; i--) { 81 | ordered.push_back(results[sizes[i].second]); 82 | } 83 | 84 | return ordered; 85 | } 86 | 87 | std::vector<std::vector<unsigned>> load_trace(int workload_id = 1) { 88 | return read_trace_data(get_trace(workload_id)); 89 | } 90 | 91 | std::vector<std::vector<unsigned>> scale_trace_rate(std::vector<std::vector<unsigned>>& traces, int rate) { 92 | std::vector<std::vector<unsigned>> scaled_traces(traces.size()); 93 | 94 | unsigned total_size = 0; 95 | double total_rate = 0; 96 | double scale_ratio = 0; 97 | 98 | for (auto& trace : traces) 99 | total_size += std::accumulate(trace.begin(), trace.end(), 0); 100 | 101 | total_rate = total_size / 24.0 / 60.0 / 60.0; // calculate the aggregate rate (requests/s) 102 | scale_ratio = rate / total_rate; 103 | 104 | std::transform(traces.begin(), traces.end(), scaled_traces.begin(), 105 | [scale_ratio](auto vec) { 106 | for (auto& v : vec) v *= scale_ratio; 107 | return vec; 108 | }); 109 | 110 | return scaled_traces; 111 | } 112 | 113 | std::vector<std::vector<unsigned>> scale_trace_size(std::vector<std::vector<unsigned>>& traces, int size) { 114 | std::vector<std::vector<unsigned>> scaled_traces(size); 115 | 116 | for (int i = 0; i < size; i++) { 117 | scaled_traces[i] = traces[i]; 118 | } 119 | 120 | 
// Fold the remaining trace rows into the first `size` rows 121 | for (int i = size; i < traces.size(); i++) { 122 | for (int j = 0; j < traces[i].size(); j++) { 123 | scaled_traces[i % size][j] += traces[i][j]; 124 | } 125 | } 126 | 127 | return scaled_traces; 128 | } 129 | 130 | std::vector<std::vector<unsigned>> load_scaled_trace(int rate, int size, int workload_id = 1) { 131 | auto traces = load_trace(workload_id); 132 | 133 | auto scaled_traces = scale_trace_rate(traces, rate); 134 | scaled_traces = scale_trace_size(scaled_traces, size); 135 | 136 | return scaled_traces; 137 | } 138 | 139 | template <typename T> 140 | void transpose(std::vector<std::vector<T>> &m) { 141 | if (m.size() == 0) 142 | return; 143 | 144 | std::vector<std::vector<T>> trans_vec(m[0].size(), std::vector<T>()); 145 | 146 | for (int i = 0; i < m.size(); i++) { 147 | for (int j = 0; j < m[i].size(); j++) { 148 | trans_vec[j].push_back(m[i][j]); 149 | } 150 | } 151 | 152 | m = trans_vec; 153 | } 154 | 155 | } 156 | -------------------------------------------------------------------------------- /src/client/client.cpp: -------------------------------------------------------------------------------- 1 | #include <client/client.h> 2 | 3 | #include <iostream> 4 | #include <network/session.h> 5 | #include <server_api.h> 6 | 7 | Client::Client() 8 | : alive(true), 9 | network_thr(std::bind(&Client::run, this)) {}; 10 | 11 | void Client::infer_async(std::vector<char>& input, int model_id, 12 | std::function<void(serverapi::Response*)> onSuccess) { 13 | serverapi::InferenceRequest request; 14 | 15 | request.model_id = model_id; 16 | request.batch_size = 1; 17 | request.input_size = input.size(); 18 | request.input = input.data(); 19 | 20 | session->send_request_async(request, onSuccess); 21 | } 22 | 23 | Client::~Client() { 24 | if (alive) 25 | shutdown(); 26 | } 27 | 28 | serverapi::UploadModelResponse* Client::upload_model(std::vector<std::string> model_names, int n_models, EngineType engine_type, int mp_size) { 29 | serverapi::UploadModelRequest request; 30 | 31 | request.model_names = model_names; 32 | request.n_models = n_models; 33 | request.engine_type = engine_type; 34 | request.mp_size = mp_size; 35 | 36 | auto onSuccess = [this](serverapi::Response* rsp) { 37 | std::cout << "Success Upload\n"; 38 | }; 39 | 40 | auto response = dynamic_cast<serverapi::UploadModelResponse*> 41 | (session->send_request(request, onSuccess)); 42 | 43 | return response; 44 | } 45 | 46 | void Client::close() { 47 | serverapi::CloseRequest request; 48 | 49 | auto onSuccess = [this](serverapi::Response* rsp) { 50 | }; 51 | 52 | auto response = session->send_request(request, onSuccess); 53 | } 54 | 55 | void Client::connect(const std::string& srv_ip, const std::string& port) { 56 | try { 57 | session = new network::ClientSession(io_service_); 58 | 59 | session->connect(srv_ip, port); 60 | } 61 | catch (std::exception& e) { 62 | io_service_.stop(); std::cerr << e.what() << "\n"; 63 | } 64 | 65 | return; 66 | } 67 | 68 | void Client::run() { 69 | while (alive) { 70 | try { 71 | boost::asio::io_service::work work(io_service_); 72 | io_service_.run(); 73 | } catch (std::exception& e) { 74 | alive.store(false); 75 | std::cerr << "Exception in network thread: " << e.what(); 76 | } catch (const char* m) { 77 | alive.store(false); 78 | std::cerr << "Exception in network thread: " << m; 79 | } 80 | } 81 | } 82 | 83 | void Client::shutdown() { 84 | session->await_completion(); 85 | 86 | this->close(); 87 | 88 | alive.store(false); 89 | io_service_.stop(); 90 | if (network_thr.joinable()) 91 | network_thr.join(); 92 | } 93 | -------------------------------------------------------------------------------- /src/client/client.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <atomic> 4 | #include <thread> 5 | #include <network/session.h> 6 | 7 | class Client { 8 | public: 9 | Client(); 10 | 11 | ~Client(); 12 | 13 | void connect(const std::string& srv_ip, const std::string& port); 14 | 15 | void run(); 16 | 17 | void infer_async(std::vector<char>& input, int model_id, 18 | std::function<void(serverapi::Response*)> onSuccess); 19 | 20 | serverapi::UploadModelResponse* upload_model(std::vector<std::string> model_names, int n_models, EngineType engine_type, int mp_size); 21 | 22 | void close(); 23 | 24 | void sync_close(); 25 | 26 | void shutdown(); 27 | 28 | private: 29 | std::atomic_bool alive; 30 | std::thread network_thr; 31 | boost::asio::io_service io_service_; 32 | network::ClientSession* session; 33 | }; 34 | -------------------------------------------------------------------------------- /src/client/workload.cpp: -------------------------------------------------------------------------------- 1 | #include <client/workload.h> 2 | 3 | Workload::Workload(int concurrency, int rate, 4 | int n_requests, std::string addr, std::string port) 5 | : concurrency(concurrency), 6 | rate(rate), 7 | n_requests(n_requests), 8 | _traces(n_requests), 9 | addr(addr), 10 | port(port) { 11 | std::minstd_rand gen(0); 12 | std::uniform_int_distribution<> udist(0, concurrency-1); 13 | std::exponential_distribution<double> edist(rate); 14 | 15 | for (auto& trace : _traces) { 16 | trace.first = edist(gen); 17 | trace.second = udist(gen); 18 | } 19 | }; 20 | 21 | Workload::Workload(std::vector<unsigned>& rates, 22 | std::string addr, std::string port) 23 | : _traces(0), 24 | addr(addr), 25 | port(port) { 26 | std::minstd_rand gen(0); 27 | 28 | int cnt = 0; 29 | for (int i = 0; i < rates.size(); i++) { 30 | double itv = 0; 31 | std::exponential_distribution<double> edist(rates[i]/60.0); 32 | cnt += rates[i]; 33 | 34 | itv = edist(gen); 35 | while (itv < 60) { 36 | _traces.push_back({itv, i}); 37 | itv += edist(gen); 38 | } 39 | } 40 | 41 | sort(_traces.begin(), _traces.end(), 42 | [](auto& a, auto& b) { return a.first < b.first;}); 43 | // convert absolute arrival times into inter-arrival gaps 44 | for (int i = _traces.size()-1; i > 0; i--) { 45 | _traces[i].first -= _traces[i-1].first; 46 | } 47 | 48 | n_requests = _traces.size(); 49 | }; 50 | 51 | void Workload::run(std::vector<std::vector<char>>& inputs) { 52 | client.connect(addr, port); 53 | 54 | for (auto& trace : _traces) { 55 | double interval = trace.first; 56 | int model_id = trace.second; 57 | 58 | usleep(interval*1e6); 59 | 60 | uint64_t t_send = util::now(); 61 | auto onSuccess = [this, t_send](serverapi::Response* rsp) { 62 | auto response = dynamic_cast<serverapi::InferenceResponse*>(rsp); 63 | uint64_t t_receive = util::now(); 64 | double latency = (t_receive-t_send) / 1e6; // ns -> ms 65 | 66 | this->latencies.push_back(latency); 67 | if (response->is_cold) this->cold_start_cnt++; 68 | }; 69 | 70 | client.infer_async(inputs[model_id], model_id, onSuccess); 71 | } 72 | 73 | client.shutdown(); 74 | } 75 | 76 | WorkloadResult Workload::result(int slo) { 77 | WorkloadResult result; 78 | 79 | std::sort(latencies.begin(), latencies.end()); 80 | 81 | int index_99 = latencies.size() * 0.99 - 1; 82 | int goodput_cnt = 0; 83 | 84 | for (auto& latency : latencies) 85 | if (latency <= slo) goodput_cnt++; 86 | 87 | result.latency_99 = latencies[index_99]; 88 | result.cold_rate = (double)cold_start_cnt / n_requests * 100; 89 | result.goodput_rate = (double)goodput_cnt / n_requests * 100; 90 | 91 | return result; 92 | } 93 | 94 | ModelLoader::ModelLoader(std::vector<std::string> model_names, 95 | int n_models, EngineType engine_type, 96 | int mp_size, std::string addr, 
std::string port) 97 | : model_names(model_names), 98 | n_models(n_models), 99 | engine_type(engine_type), 100 | mp_size(mp_size), 101 | addr(addr), 102 | port(port) {}; 103 | 104 | void ModelLoader::run() { 105 | client.connect(addr, port); 106 | 107 | util::InputGenerator input_generator; 108 | 109 | inputs.resize(n_models); 110 | 111 | int n_models_per_type = n_models / model_names.size(); 112 | for (int i = 0; i < n_models; i++) { 113 | input_generator.generate_input(model_names[i/n_models_per_type], 1, &inputs[i]); 114 | } 115 | 116 | client.upload_model(model_names, n_models, engine_type, mp_size); 117 | // issue one inference per model so every model is materialized once 118 | for (int i = 0; i < n_models; i++) { 119 | auto onSuccess = [this](serverapi::Response* rsp) {}; 120 | 121 | client.infer_async(inputs[i], i, onSuccess); 122 | } 123 | 124 | client.shutdown(); 125 | } 126 | 127 | -------------------------------------------------------------------------------- /src/client/workload.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <string> 3 | #include <vector> 4 | 5 | #include <client/client.h> 6 | 7 | struct WorkloadResult { 8 | double latency_99; 9 | double cold_rate; 10 | double goodput_rate; 11 | }; 12 | 13 | class Workload { 14 | public: 15 | Workload(int concurrency, int rate, 16 | int n_requests, std::string addr, std::string port); 17 | 18 | Workload(std::vector<unsigned>& rates, 19 | std::string addr, std::string port); 20 | 21 | void run(std::vector<std::vector<char>>& inputs); 22 | 23 | WorkloadResult result(int slo); 24 | 25 | Client client; 26 | 27 | std::vector<std::string> model_names; 28 | int concurrency; 29 | int rate; 30 | int n_requests; 31 | std::string addr; 32 | std::string port; 33 | 34 | private: 35 | std::vector<std::pair<double, int>> _traces; 36 | std::vector<double> latencies; 37 | int cold_start_cnt = 0; 38 | }; 39 | 40 | class ModelLoader { 41 | public: 42 | ModelLoader(std::vector<std::string> model_names, 43 | int n_models, EngineType engine_type, 44 | int mp_size, std::string addr, std::string port); 45 | 46 | void run(); 47 | 48 | Client client; 49 | 50 | std::vector<std::vector<char>> inputs; 51 | std::vector<std::string> model_names; 52 | int n_models; 53 | EngineType engine_type; 54 | int mp_size; 55 | std::string addr; 56 | std::string port; 57 | }; 58 | -------------------------------------------------------------------------------- /src/deepplan/engine.cpp: -------------------------------------------------------------------------------- 1 | #include <deepplan/engine.h> 2 | #include <deepplan/model.h> 3 | #include <util.h> 4 | 5 | #include <torch/script.h> 6 | #include <c10/cuda/CUDAStream.h> 7 | #include <c10/cuda/CUDAGuard.h> 8 | #include <ATen/cuda/CUDAContext.h> 9 | #include "tbb/concurrent_queue.h" 10 | #include <thread> 11 | #include <memory> 12 | 13 | namespace deepplan { 14 | 15 | class PCIeThread; 16 | class NVLinkThread; 17 | 18 | static std::vector<c10::cuda::CUDAStream> g_exec_streams; 19 | static std::vector<PCIeThread*> g_pcie_thrs; 20 | static std::vector<NVLinkThread*> g_nvlink_thrs; 21 | static int n_device; 22 | 23 | class LoadThread { 24 | public: 25 | LoadThread(int device) 26 | : device_(device), 27 | is_finished(false), 28 | stream(c10::cuda::getStreamFromPool(false, device)) {}; 29 | 30 | struct Task { 31 | public: 32 | Task(std::vector<ScriptModule> modules, int device) 33 | : type(Type::request), 34 | modules(modules), 35 | device(device) {}; 36 | 37 | Task() 38 | : type(Type::end) {}; 39 | 40 | enum class Type { 41 | request = 0, 42 | end 43 | } type; 44 | 45 | std::vector<ScriptModule> modules; 46 | int device; 47 | }; 48 | 49 | void transfer_modules(std::vector<ScriptModule>& modules, int target_device) { 50 | if (!modules.empty()) 51 | queue.push(std::make_shared<Task>(modules, target_device)); 52 | } 53 | 54 | virtual void init() = 0; 55 | 56 | virtual void Loop() = 0; 57 | 58 | void stop() { 59 | is_finished = 
true; 60 | queue.push(std::make_shared<Task>()); // enqueue the end-of-stream sentinel 61 | if (thr.joinable()) 62 | thr.join(); 63 | } 64 | 65 | protected: 66 | tbb::concurrent_bounded_queue<std::shared_ptr<Task>> queue; 67 | c10::cuda::CUDAStream stream; 68 | std::thread thr; 69 | std::atomic<bool> is_finished; 70 | int device_; 71 | }; 72 | 73 | class NVLinkThread : public LoadThread { 74 | public: 75 | NVLinkThread(int device) 76 | : LoadThread(device) { init(); }; 77 | 78 | void init() { 79 | thr = std::thread(std::bind(&NVLinkThread::Loop, this)); 80 | } 81 | 82 | void Loop() { 83 | at::Device device(at::kCUDA, device_); 84 | at::cuda::CUDAStreamGuard guard(stream); 85 | c10::cuda::CUDAGuard device_guard(device); 86 | 87 | std::shared_ptr<Task> task; 88 | 89 | while (!is_finished) { 90 | queue.pop(task); 91 | if (task->type == Task::Type::end) { 92 | break; 93 | } 94 | at::Device target_device(at::kCUDA, task->device); 95 | 96 | for (auto& module : task->modules) { 97 | module.synchronize(device); 98 | module.to_and_record(target_device, true); 99 | } 100 | } 101 | } 102 | 103 | void transfer_modules(ScriptModule module, int target_device) { 104 | std::vector<ScriptModule> modules; 105 | modules.push_back(std::move(module)); 106 | queue.push(std::make_shared<Task>(modules, target_device)); 107 | } 108 | }; 109 | 110 | class PCIeThread : public LoadThread { 111 | public: 112 | PCIeThread(int device) 113 | : LoadThread(device) { init(); }; 114 | 115 | void init() { 116 | thr = std::thread(std::bind(&PCIeThread::Loop, this)); 117 | } 118 | 119 | void Loop() { 120 | at::Device device(at::kCUDA, device_); 121 | at::cuda::CUDAStreamGuard guard(stream); 122 | c10::cuda::CUDAGuard device_guard(device); 123 | 124 | std::shared_ptr<Task> task; 125 | 126 | while (!is_finished) { 127 | queue.pop(task); 128 | if (task->type == Task::Type::end) { 129 | break; 130 | } 131 | 132 | int target_device = task->device; 133 | 134 | for (auto& module : task->modules) { 135 | module.to_and_record(device, true); 136 | // forward modules bound for another GPU over NVLink 137 | if (target_device != device_) { 138 | g_nvlink_thrs[device_]->transfer_modules(module, target_device); 139 | } 140 | } 141 | } 142 | } 143 | }; 144 | 145 | void Init(void) { 146 | n_device = torch::cuda::device_count(); 147 | torch::jit::getBailoutDepth() = 0; 148 | 149 | g_pcie_thrs.resize(n_device); 150 | g_nvlink_thrs.resize(n_device); 151 | 152 | for (int i = 0; i < n_device; i++) { 153 | g_pcie_thrs[i] = new PCIeThread(i); 154 | g_nvlink_thrs[i] = new NVLinkThread(i); 155 | g_exec_streams.push_back(c10::cuda::getStreamFromPool(false, i)); 156 | } 157 | } 158 | 159 | void Deinit(void) { 160 | for (int i = 0; i < n_device; i++) { 161 | g_pcie_thrs[i]->stop(); 162 | g_nvlink_thrs[i]->stop(); 163 | } 164 | } 165 | 166 | class PipelineEngine : public Engine { 167 | public: 168 | PipelineEngine() 169 | : Engine() {}; 170 | 171 | torch::jit::IValue run(Model* model, ScriptModuleInput& x) { 172 | int target_device = model->target_device.index(); 173 | torch::jit::IValue outputs; 174 | 175 | assert(n_device > target_device); 176 | 177 | if (!model->is_cuda) { 178 | // kick off layer transfers so the forward pass overlaps with loading 179 | for (int device : model->devices) { 180 | std::vector<ScriptModule> modules; 181 | for (auto idx : model->device_map[device]) { 182 | modules.push_back(model->layers[idx]); 183 | } 184 | g_pcie_thrs[device]->transfer_modules(modules, target_device); 185 | } 186 | } 187 | 188 | { 189 | at::cuda::CUDAStreamGuard stream_guard(g_exec_streams[target_device]); 190 | outputs = model->model.forward(x); 191 | } 192 | model->is_cuda = true; 193 | 194 | return outputs; 195 | } 196 | }; 197 | 198 | static PipelineEngine 
engine; 199 | 200 | torch::jit::IValue RunEngine(Model* model, ScriptModuleInput& x) { 201 | c10::cuda::CUDAGuard device_guard(model->target_device); 202 | auto outputs = engine.run(model, x); 203 | 204 | return outputs; 205 | } 206 | 207 | } 208 | -------------------------------------------------------------------------------- /src/deepplan/engine.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <torch/script.h> 4 | #include <deepplan/model.h> 5 | #include <util.h> 6 | 7 | namespace deepplan { 8 | 9 | class Engine { 10 | public: 11 | virtual torch::jit::IValue run(Model* model, ScriptModuleInput& x) = 0; 12 | }; 13 | 14 | torch::jit::IValue RunEngine(Model* model, ScriptModuleInput& x); 15 | 16 | void Init(void); 17 | 18 | void Deinit(void); 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/deepplan/model.cpp: -------------------------------------------------------------------------------- 1 | #include <deepplan/model.h> 2 | #include <deepplan/engine.h> 3 | #include <util.h> 4 | #include <torch/script.h> 5 | #include <c10/cuda/CUDAGuard.h> 6 | 7 | #include <sstream> 8 | 9 | namespace deepplan { 10 | 11 | static std::vector<ScriptModule> travel_layers(ScriptModule module, std::string name="") { 12 | std::vector<ScriptModule> traveled_layers; 13 | 14 | if (module.children().size() == 0) { 15 | traveled_layers.push_back(module); 16 | return traveled_layers; 17 | } 18 | else { 19 | for (auto name_child : module.named_children()) { 20 | if (name_child.name.find("drop") != std::string::npos) continue; 21 | auto layers = travel_layers(name_child.value, name_child.name); 22 | traveled_layers.insert(traveled_layers.end(), layers.begin(), layers.end()); 23 | } 24 | return traveled_layers; 25 | } 26 | } 27 | 28 | Model::Model(const std::string name, const EngineType type, const std::vector<int> devices) 29 | : model_name(name), 30 | engine_type(type) { 31 | if (!devices.empty()) { 32 | this->devices = devices; 33 | } 34 | this->target_device = at::Device(at::kCUDA, this->devices[0]); 35 | init(); 36 | } 37 | 38 | void Model::init() { 39 | auto model_repo = std::getenv("PLAN_REPO"); 40 | 41 | if (model_repo == nullptr) { 42 | std::cerr << "PLAN_REPO variable not set, exiting\n"; 43 | exit(EXIT_FAILURE); 44 | } 45 | 46 | std::string model_prefix; 47 | std::string script_name; 48 | std::string script_path; 49 | std::string config_path; 50 | 51 | model_prefix = std::string(model_repo) + "/" + model_name; 52 | { 53 | std::ostringstream ss; 54 | ss << "model" << int(target_device.index()) << ".pt"; 55 | script_name = ss.str(); 56 | } 57 | script_path = model_prefix + "/" + script_name; 58 | config_path = model_prefix + "/config.pbtxt"; 59 | 60 | try { 61 | this->model = torch::jit::load(script_path); 62 | if (!util::read_from_pbtxt(this->model_config, config_path)) { 63 | std::stringstream msg; 64 | msg << "Failed to read " << config_path; 65 | throw std::runtime_error(msg.str()); 66 | } 67 | for (auto io : model_config.inputs()) { 68 | this->input_configs.emplace_back(io); 69 | } 70 | } 71 | catch (const c10::Error& e) { 72 | std::cerr << "Error loading the model\n"; 73 | throw e; 74 | } 75 | catch (const std::exception& e) { 76 | std::cerr << e.what() << "\n"; 77 | throw e; 78 | } 79 | 80 | this->layers = travel_layers(this->model); 81 | this->n_layers = this->layers.size(); 82 | this->model.eval(); 83 | this->model.to(at::kCPU); 84 | { 85 | c10::cuda::CUDAGuard device_guard(this->target_device); 86 | this->model.cuda_host(); 87 | } 88 | 89 | switch (engine_type) { 90 | case EngineType::IN_MEMORY: 91 | case EngineType::ON_DEMAND: 92 | case 
EngineType::PIPESWITCH: 93 | for (int i = 0; i < this->n_layers; i++) { 94 | this->load_layer_idxs.push_back(i); 95 | } 96 | break; 97 | 98 | case EngineType::DEEPPLAN: 99 | for (auto plan : this->model_config.plans()) { 100 | if (Plan::DYNAMIC == plan.plan_type()) { 101 | auto ll = plan.load_layers(); 102 | this->load_layer_idxs = std::vector<int>(ll.begin(), ll.end()); 103 | break; 104 | } 105 | } 106 | break; 107 | default: 108 | std::cerr << "Found incorrect EngineType\n"; 109 | break; 110 | } 111 | 112 | for (auto& i : this->load_layer_idxs) { 113 | this->layers[i].to(at::kCPU); 114 | this->layers[i].pin_memory(); 115 | } 116 | 117 | // Set device_map 118 | this->model_size = util::getModuleSize(this->model, true); 119 | { 120 | int n_device = devices.size(); 121 | size_t block_size = model_size / n_device; 122 | auto iter = load_layer_idxs.begin(); 123 | 124 | for (int i = 0; i < n_device; i++) { 125 | int device = devices[i]; 126 | size_t cumm_size = 0; 127 | size_t layer_size = 0; 128 | std::vector<int> layer_list; 129 | 130 | for ( ; iter != load_layer_idxs.end(); iter++) { 131 | layer_size = util::getModuleSize(layers[*iter]); 132 | cumm_size += layer_size; 133 | if (cumm_size > block_size) { 134 | break; 135 | } 136 | 137 | layer_list.push_back(*iter); 138 | } 139 | 140 | // Assign the remaining layers to the last device 141 | if (i == n_device-1) { 142 | for ( ; iter != load_layer_idxs.end(); iter++) { 143 | layer_list.push_back(*iter); 144 | } 145 | } 146 | 147 | device_map[device] = layer_list; 148 | } 149 | } 150 | 151 | // TODO: when using parallel transfer, devices other than the target device 152 | // should convert cuda_host memory to pinned memory 153 | 154 | 155 | model.cuda_backup(); 156 | this->is_cuda = false; 157 | } 158 | 159 | torch::jit::IValue Model::forward(ScriptModuleInput& x) { 160 | auto outputs = RunEngine(this, x); 161 | return outputs; 162 | } 163 | 164 | void Model::to(at::Device device, bool non_blocking) { 165 | model.to(device, non_blocking); 166 | if (device.is_cuda()) 167 | is_cuda = true; 168 | else 169 | is_cuda = false; 170 | } 171 | 172 | void Model::clear() 173 | { 174 | if (this->is_cuda) { 175 | model.clear(); 176 | is_cuda = false; 177 | } 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /src/deepplan/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <torch/script.h> 4 | #include <unordered_map> 5 | #include <atomic> 6 | #include <util.h> 7 | 8 | namespace deepplan { 9 | 10 | class Model { 11 | public: 12 | Model(const std::string name, const EngineType type, const std::vector<int> devices); 13 | 14 | Model() {}; 15 | 16 | void init(); 17 | 18 | torch::jit::IValue forward(ScriptModuleInput& x); 19 | 20 | void to(at::Device device, bool non_blocking = false); 21 | 22 | void clear(); 23 | 24 | std::string model_name; 25 | 26 | EngineType engine_type; 27 | 28 | std::vector<int> devices = {0}; 29 | 30 | at::Device target_device = at::kCUDA; 31 | 32 | ScriptModule model; 33 | 34 | size_t model_size; 35 | 36 | std::vector<ScriptModule> layers; 37 | 38 | std::unordered_map<int, std::vector<int>> device_map; 39 | 40 | int n_layers; 41 | 42 | std::vector<int> load_layer_idxs; 43 | 44 | std::atomic<bool> is_cuda; 45 | 46 | ModelConfig model_config; 47 | 48 | std::vector<util::InputConfig> input_configs; // element type assumed from util.h 49 | }; 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/network/message.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <cstdint> 3 | #include <iostream> 4 | 5 | namespace 
network { 6 | class message_tx { 7 | public: 8 | virtual uint64_t get_tx_hdr_len() const = 0; 9 | virtual uint64_t get_tx_body_len() const = 0; 10 | virtual uint64_t get_tx_req_id() const = 0; 11 | virtual uint64_t get_tx_msg_type() const = 0; 12 | virtual const void* tx_body_buf() = 0; 13 | 14 | virtual void serialize_header(void* dest) = 0; 15 | }; 16 | 17 | class message_rx { 18 | public: 19 | virtual void header_received(const void* hdr, size_t hdr_len) = 0; 20 | virtual uint64_t get_rx_body_len() const = 0; 21 | virtual uint64_t get_rx_req_id() const = 0; 22 | virtual uint64_t get_rx_msg_type() const = 0; 23 | virtual void* rx_body_buf() = 0; 24 | 25 | virtual void body_buf_received(size_t len) = 0; 26 | }; 27 | 28 | template <uint64_t TMsgType, class TMsg, class TReq> 29 | class msg_protobuf_tx : public message_tx { 30 | protected: 31 | uint64_t req_id_; 32 | 33 | public: 34 | TMsg msg; 35 | static const uint64_t MsgType = TMsgType; 36 | 37 | void set_req_id(uint64_t req_id) { req_id_ = req_id; }; 38 | 39 | virtual uint64_t get_tx_hdr_len() const { return msg.ByteSizeLong(); }; 40 | virtual uint64_t get_tx_body_len() const { return 0; }; 41 | virtual uint64_t get_tx_req_id() const { return req_id_; }; 42 | virtual uint64_t get_tx_msg_type() const { return MsgType; }; 43 | 44 | virtual void serialize_header(void* dest) { 45 | msg.SerializeToArray(dest, get_tx_hdr_len()); 46 | } 47 | 48 | virtual const void* tx_body_buf() { 49 | throw "Should not be called"; 50 | } 51 | 52 | virtual void set(TReq &request) = 0; 53 | }; 54 | 55 | template <uint64_t TMsgType, class TMsg, class TRsp> 56 | class msg_protobuf_rx : public message_rx { 57 | protected: 58 | uint64_t req_id_; 59 | 60 | public: 61 | TMsg msg; 62 | static const uint64_t MsgType = TMsgType; 63 | 64 | virtual void header_received(const void* hdr, size_t hdr_len) { 65 | if (!msg.ParseFromArray(hdr, hdr_len)) 66 | std::cerr << "parsing failed\n"; 67 | } 68 | 69 | void set_req_id(uint64_t req_id) { req_id_ = req_id; }; 70 | 71 | virtual uint64_t get_rx_req_id() const { return req_id_; }; 72 | virtual uint64_t get_rx_body_len() const { return 0; }; 73 | virtual uint64_t get_rx_msg_type() const { return MsgType; }; 74 | 75 | virtual void* rx_body_buf() { 76 | throw "Should not be called"; 77 | } 78 | 79 | virtual void body_buf_received(size_t len) { 80 | throw "Should not be called"; 81 | } 82 | 83 | virtual void get(TRsp& response) = 0; 84 | 85 | }; 86 | 87 | template <uint64_t TMsgType, class TMsg, class TReq> 88 | class msg_protobuf_tx_with_body : public msg_protobuf_tx<TMsgType, TMsg, TReq> { 89 | protected: 90 | size_t body_len_ = 0; 91 | void* body_ = nullptr; 92 | 93 | public: 94 | virtual void set_body_len(size_t body_len) { body_len_ = body_len; } 95 | 96 | virtual uint64_t get_tx_body_len() const { return body_len_; } 97 | 98 | virtual const void* tx_body_buf() { 99 | return body_; 100 | } 101 | 102 | }; 103 | 104 | template <uint64_t TMsgType, class TMsg, class TRsp> 105 | class msg_protobuf_rx_with_body : public msg_protobuf_rx<TMsgType, TMsg, TRsp> { 106 | protected: 107 | size_t body_len_ = 0; 108 | void* body_ = nullptr; 109 | 110 | public: 111 | virtual void set_body_len(size_t body_len) { 112 | body_len_ = body_len; 113 | body_ = new uint8_t[body_len]; 114 | } 115 | 116 | virtual uint64_t get_rx_body_len() const { return body_len_; } 117 | 118 | virtual void* rx_body_buf() { 119 | return body_; 120 | } 121 | 122 | virtual void body_buf_received(size_t len) { 123 | } 124 | }; 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/network/network.cpp: -------------------------------------------------------------------------------- 1 | #include <network/network.h> 2 | #include <iostream> 3 | 4 | namespace network { 5 | 
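// Wire framing (see the pre_header reads/writes below): every message begins with a 32-byte pre-header of four uint64_t fields { header_len, body_len, req_id, msg_type }, followed by header_len bytes of protobuf-serialized header and body_len bytes of raw payload (only messages with a body, such as inference inputs, set body_len > 0).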
6 | message_receiver::message_receiver(message_connection* conn, message_handler& handler) 7 | : socket_(conn->get_socket()), 8 | conn_(conn), 9 | handler_(handler) {}; 10 | 11 | void message_receiver::start() { 12 | read_new_message(); 13 | } 14 | 15 | void message_receiver::read_new_message() { 16 | boost::asio::async_read(socket_, boost::asio::buffer(pre_header, 32), 17 | boost::bind(&message_receiver::handle_pre_hdr_read, this, 18 | boost::asio::placeholders::error, 19 | boost::asio::placeholders::bytes_transferred)); 20 | } 21 | 22 | void message_receiver::handle_pre_hdr_read(const boost::system::error_code& error, 23 | size_t bytes_transferred) { 24 | if (error) { 25 | std::cerr << "[Error:handle_pre_hdr_read] " << error.message().data() << "\n"; 26 | return; 27 | } 28 | 29 | boost::asio::async_read(socket_, boost::asio::buffer(header_buf, pre_header[0]), 30 | boost::bind(&message_receiver::handle_hdr_read, this, 31 | boost::asio::placeholders::error, 32 | boost::asio::placeholders::bytes_transferred)); 33 | } 34 | 35 | void message_receiver::handle_hdr_read(const boost::system::error_code& error, 36 | size_t bytes_transferred) { 37 | if (error) { 38 | std::cerr << "[Error:handle_hdr_read] " << error.message().data() << "\n"; 39 | return; 40 | } 41 | 42 | res_ = handler_.new_rx_message(pre_header[0], pre_header[1], pre_header[2], pre_header[3]); 43 | res_->header_received(header_buf, pre_header[0]); 44 | 45 | int64_t body_len = res_->get_rx_body_len(); 46 | 47 | if (body_len > 0) { 48 | boost::asio::async_read(socket_, boost::asio::buffer(res_->rx_body_buf(), body_len), 49 | boost::bind(&message_receiver::handle_body_read, this, 50 | boost::asio::placeholders::error, 51 | boost::asio::placeholders::bytes_transferred)); 52 | 53 | } 54 | else { 55 | handle_read_end(); 56 | } 57 | 58 | } 59 | 60 | void message_receiver::handle_read_end() { 61 | bool is_continue; 62 | is_continue = handler_.completed_receive(conn_, res_); 63 | if (!is_continue) 64 | return; 65 | 66 | res_ = 0; 67 | read_new_message(); 68 | } 69 | 70 | void message_receiver::handle_body_read(const boost::system::error_code& error, 71 | size_t bytes_transferred) { 72 | if (error) { 73 | std::cerr << "[Error:handle_body_read] " << error.message().data() << "\n"; 74 | return; 75 | } 76 | 77 | res_->body_buf_received(bytes_transferred); 78 | 79 | handle_read_end(); 80 | } 81 | 82 | message_sender::message_sender(message_connection* conn, message_handler& handler) 83 | : socket_(conn->get_socket()), 84 | conn_(conn), 85 | handler_(handler), 86 | req_(0) {}; 87 | 88 | void message_sender::send_message(message_tx& req) { 89 | tx_queue_.push(&req); 90 | conn_->io_service_.post(boost::bind(&message_sender::try_send, this)); 91 | } 92 | 93 | void message_sender::try_send() { 94 | std::lock_guard<std::mutex> lock(queue_mutex); 95 | 96 | if (!req_) send_next_message(); 97 | } 98 | 99 | void message_sender::send_next_message() { 100 | message_tx *req; 101 | if (!tx_queue_.try_pop(req)) { 102 | return; 103 | } 104 | start_send(*req); 105 | } 106 | 107 | void message_sender::start_send(message_tx& req) { 108 | pre_header[0] = req.get_tx_hdr_len(); 109 | pre_header[1] = req.get_tx_body_len(); 110 | pre_header[2] = req.get_tx_req_id(); 111 | pre_header[3] = req.get_tx_msg_type(); 112 | 113 | req.serialize_header(header_buf); 114 | 115 | req_ = &req; 116 | 117 | boost::asio::async_write(socket_, boost::asio::buffer(pre_header, 32), 118 | boost::bind(&message_sender::handle_pre_hdr_write, this, 119 | boost::asio::placeholders::error, 120 | 
boost::asio::placeholders::bytes_transferred)); 121 | } 122 | 123 | void message_sender::handle_pre_hdr_write(const boost::system::error_code& error, 124 | size_t bytes_transferred) { 125 | if (error) { 126 | std::cerr << "[Error:handle_pre_hdr_write] " << error.message().data() << "\n"; 127 | return; 128 | } 129 | 130 | boost::asio::async_write(socket_, boost::asio::buffer(header_buf, pre_header[0]), 131 | boost::bind(&message_sender::handle_hdr_write, this, 132 | boost::asio::placeholders::error, 133 | boost::asio::placeholders::bytes_transferred)); 134 | } 135 | 136 | void message_sender::handle_hdr_write(const boost::system::error_code& error, 137 | size_t bytes_transferred) { 138 | if (error) { 139 | std::cerr << "[Error:handle_hdr_write] " << error.message().data() << "\n"; 140 | return; 141 | } 142 | 143 | uint64_t body_len = req_->get_tx_body_len(); 144 | 145 | if (body_len > 0) { 146 | boost::asio::async_write(socket_, boost::asio::buffer(req_->tx_body_buf(), body_len), 147 | boost::bind(&message_sender::handle_body_write, this, 148 | boost::asio::placeholders::error, 149 | boost::asio::placeholders::bytes_transferred)); 150 | } 151 | else { 152 | handle_write_end(); 153 | } 154 | } 155 | 156 | void message_sender::handle_write_end() { 157 | handler_.completed_transmit(conn_, req_); 158 | 159 | std::lock_guard<std::mutex> lock(queue_mutex); 160 | req_ = 0; 161 | send_next_message(); 162 | } 163 | 164 | void message_sender::handle_body_write(const boost::system::error_code& error, 165 | size_t bytes_transferred) { 166 | if (error) { 167 | std::cerr << "[Error:handle_body_write] " << error.message().data() << "\n"; 168 | return; 169 | } 170 | 171 | handle_write_end(); 172 | } 173 | 174 | message_connection::message_connection(boost::asio::io_service& io_service, message_handler& handler) 175 | : socket_(io_service), 176 | resolver_(io_service), 177 | io_service_(io_service), 178 | msg_rx_(message_receiver(this, handler)), 179 | handler_(handler), 180 | is_connected(false) {}; 181 | 182 | boost::asio::ip::tcp::socket& message_connection::get_socket() { 183 | return socket_; 184 | } 185 | 186 | void message_connection::connect(const std::string& server, const std::string& port) { 187 | boost::asio::ip::tcp::resolver::query query(server, port); 188 | resolver_.async_resolve(query, 189 | boost::bind(&message_connection::handle_resolved, this, 190 | boost::asio::placeholders::error, 191 | boost::asio::placeholders::iterator)); 192 | 193 | while (!is_connected.load()); // spin until the async connect completes 194 | } 195 | 196 | void message_connection::established() { 197 | boost::asio::ip::tcp::no_delay option(true); 198 | socket_.set_option(option); 199 | 200 | msg_rx_.start(); 201 | ready(); 202 | } 203 | 204 | void message_connection::ready() { 205 | is_connected.store(true); 206 | } 207 | 208 | void message_connection::handle_resolved(const boost::system::error_code& error, 209 | boost::asio::ip::tcp::resolver::iterator endpoint_iterator) { 210 | if (error) { 211 | std::cerr << "[Error:handle_resolved] " << error.message().data() << "\n"; 212 | return; 213 | } 214 | 215 | boost::asio::ip::tcp::endpoint endpoint = *endpoint_iterator; 216 | socket_.async_connect(endpoint, 217 | boost::bind(&message_connection::handle_established, this, 218 | boost::asio::placeholders::error)); 219 | } 220 | 221 | void message_connection::handle_established(const boost::system::error_code& error) { 222 | if (error) { 223 | std::cerr << "[Error:handle_established] " << error.message().data() << "\n"; 224 | return; 225 | } 226 | 227 | established(); 228 | 
} 229 | } 230 | 231 | -------------------------------------------------------------------------------- /src/network/network.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <boost/asio.hpp> 4 | #include <boost/bind.hpp> 5 | #include <mutex> 6 | #include <atomic> 7 | #include "tbb/concurrent_queue.h" 8 | 9 | #include <network/message.h> 10 | 11 | namespace network { 12 | 13 | class message_connection; 14 | 15 | class message_handler { 16 | public: 17 | virtual message_rx* new_rx_message( 18 | uint64_t hdr_len, 19 | uint64_t body_len, 20 | uint64_t req_id, 21 | uint64_t msg_type) = 0; 22 | 23 | virtual bool completed_receive(message_connection *conn, message_rx *req) = 0; 24 | virtual void completed_transmit(message_connection *conn, message_tx *req) = 0; 25 | }; 26 | 27 | class message_receiver { 28 | public: 29 | message_receiver(message_connection* conn, message_handler& handler); 30 | 31 | void start(); 32 | 33 | void read_new_message(); 34 | 35 | private: 36 | void handle_pre_hdr_read(const boost::system::error_code& error, 37 | size_t bytes_transferred); 38 | 39 | void handle_hdr_read(const boost::system::error_code& error, 40 | size_t bytes_transferred); 41 | 42 | void handle_body_read(const boost::system::error_code& error, 43 | size_t bytes_transferred); 44 | 45 | void handle_read_end(); 46 | 47 | void abort_connection(const char* msg); 48 | 49 | boost::asio::ip::tcp::socket& socket_; 50 | 51 | message_connection* conn_; 52 | message_handler& handler_; 53 | message_rx* res_; 54 | 55 | size_t body_left; 56 | /* header_len, body_len, req_id, message_type */ 57 | uint64_t pre_header[4]; 58 | char header_buf[1024]; 59 | }; 60 | 61 | class message_sender { 62 | public: 63 | message_sender(message_connection* conn, message_handler& handler); 64 | 65 | void send_message(message_tx& req); 66 | 67 | private: 68 | void try_send(); 69 | 70 | void send_next_message(); 71 | 72 | void start_send(message_tx& req); 73 | 74 | void handle_pre_hdr_write(const boost::system::error_code& error, 75 | size_t bytes_transferred); 76 | 77 | void handle_hdr_write(const boost::system::error_code& error, 78 | size_t bytes_transferred); 79 | 80 | void handle_body_write(const boost::system::error_code& error, 81 | size_t bytes_transferred); 82 | 83 | void handle_write_end(); 84 | 85 | void abort_connection(const char* msg); 86 | 87 | boost::asio::ip::tcp::socket& socket_; 88 | message_connection* conn_; 89 | message_handler& handler_; 90 | message_tx* req_; 91 | uint64_t pre_header[4]; 92 | char header_buf[1024]; 93 | 94 | std::mutex queue_mutex; 95 | tbb::concurrent_queue<message_tx*> tx_queue_; 96 | }; 97 | 98 | class message_connection { 99 | public: 100 | message_connection(boost::asio::io_service& io_service, message_handler& handler); 101 | 102 | boost::asio::ip::tcp::socket& get_socket(); 103 | 104 | void connect(const std::string& host, const std::string& port); 105 | 106 | void established(); 107 | 108 | void abort_connection(const char* msg); 109 | 110 | void close(const char* reason); 111 | 112 | virtual void ready(); 113 | 114 | private: 115 | void handle_resolved(const boost::system::error_code& error, 116 | boost::asio::ip::tcp::resolver::iterator endpoint_iterator); 117 | 118 | void handle_established(const boost::system::error_code& error); 119 | 120 | boost::asio::ip::tcp::socket socket_; 121 | boost::asio::ip::tcp::resolver resolver_; 122 | message_receiver msg_rx_; 123 | message_handler& handler_; 124 | 125 | protected: 126 | std::atomic_bool is_connected; 127 | 128 | public: 129 | boost::asio::io_service& 
io_service_; 130 | }; 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/network/server_api.cpp: -------------------------------------------------------------------------------- 1 | #include <network/server_api.h> 2 | 3 | namespace network { 4 | 5 | void msg_inference_req_tx::set(serverapi::InferenceRequest& request) { 6 | set_req_id(request.req_id); 7 | msg.set_req_id(request.req_id); 8 | msg.set_model_id(request.model_id); 9 | msg.set_batch_size(request.batch_size); 10 | body_len_ = request.input_size; 11 | body_ = request.input; 12 | } 13 | 14 | void msg_inference_req_rx::get(serverapi::InferenceRequest& request) { 15 | request.req_id = get_rx_req_id(); 16 | request.model_id = msg.model_id(); 17 | request.batch_size = msg.batch_size(); 18 | request.input_size = body_len_; 19 | request.input = body_; 20 | } 21 | 22 | void msg_inference_rsp_tx::set(serverapi::InferenceResponse& response) { 23 | set_req_id(response.req_id); 24 | msg.set_req_id(response.req_id); 25 | msg.set_is_cold(response.is_cold); 26 | } 27 | 28 | void msg_inference_rsp_rx::get(serverapi::InferenceResponse& response) { 29 | response.req_id = get_rx_req_id(); 30 | response.is_cold = msg.is_cold(); 31 | } 32 | 33 | void msg_upload_model_req_tx::set(serverapi::UploadModelRequest& request) { 34 | set_req_id(request.req_id); 35 | msg.set_req_id(request.req_id); 36 | *msg.mutable_model_names() = {request.model_names.begin(), request.model_names.end()}; 37 | msg.set_n_models(request.n_models); 38 | msg.set_engine_type(request.engine_type); 39 | msg.set_mp_size(request.mp_size); 40 | } 41 | 42 | void msg_upload_model_req_rx::get(serverapi::UploadModelRequest& request) { 43 | request.req_id = get_rx_req_id(); 44 | request.model_names = std::vector<std::string>(msg.model_names().begin(), msg.model_names().end()); 45 | request.n_models = msg.n_models(); 46 | request.engine_type = msg.engine_type(); 47 | request.mp_size = msg.mp_size(); 48 | } 49 | 50 | void msg_upload_model_rsp_tx::set(serverapi::UploadModelResponse& response) { 51 | set_req_id(response.req_id); 52 | msg.set_req_id(response.req_id); 53 | } 54 | 55 | void msg_upload_model_rsp_rx::get(serverapi::UploadModelResponse& response) { 56 | response.req_id = get_rx_req_id(); 57 | } 58 | 59 | void msg_close_req_tx::set(serverapi::CloseRequest& request) { 60 | set_req_id(request.req_id); 61 | msg.set_req_id(request.req_id); 62 | } 63 | 64 | void msg_close_req_rx::get(serverapi::CloseRequest& request) { 65 | request.req_id = get_rx_req_id(); 66 | } 67 | 68 | void msg_close_rsp_tx::set(serverapi::CloseResponse& response) { 69 | set_req_id(response.req_id); 70 | msg.set_req_id(response.req_id); 71 | } 72 | 73 | void msg_close_rsp_rx::get(serverapi::CloseResponse& response) { 74 | response.req_id = get_rx_req_id(); 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /src/network/server_api.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <network/message.h> 4 | #include <server_api.h> 5 | #include <deepplan.pb.h> 6 | 7 | namespace network { 8 | // NOTE: template arguments are <message-type id, generated protobuf class, server_api struct>; the *Proto class names are assumed reconstructions of the types generated from proto/deepplan.proto. 9 | class msg_inference_req_tx : public msg_protobuf_tx_with_body<REQ_INFERENCE, InferenceRequestProto, serverapi::InferenceRequest> { 10 | public: 11 | virtual void set(serverapi::InferenceRequest& request); 12 | }; 13 | 14 | class msg_inference_req_rx : public msg_protobuf_rx_with_body<REQ_INFERENCE, InferenceRequestProto, serverapi::InferenceRequest> { 15 | public: 16 | virtual void get(serverapi::InferenceRequest& request); 17 | }; 18 | 19 | class msg_inference_rsp_tx : public msg_protobuf_tx<RSP_INFERENCE, InferenceResponseProto, serverapi::InferenceResponse> { 20 | public: 21 | virtual void set(serverapi::InferenceResponse& response); 22 | }; 
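// (Pattern for the wrappers in this header: each server API message pairs a _tx class, which fills the protobuf from its serverapi struct in set(), with an _rx class, which extracts it back in get().)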
23 | 24 | class msg_inference_rsp_rx : public msg_protobuf_rx<RSP_INFERENCE, InferenceResponseProto, serverapi::InferenceResponse> { 25 | public: 26 | virtual void get(serverapi::InferenceResponse& response); 27 | }; 28 | 29 | class msg_upload_model_req_tx : public msg_protobuf_tx<REQ_UPLOAD_MODEL, UploadModelRequestProto, serverapi::UploadModelRequest> { 30 | public: 31 | virtual void set(serverapi::UploadModelRequest& request); 32 | }; 33 | 34 | class msg_upload_model_req_rx : public msg_protobuf_rx<REQ_UPLOAD_MODEL, UploadModelRequestProto, serverapi::UploadModelRequest> { 35 | public: 36 | virtual void get(serverapi::UploadModelRequest& request); 37 | }; 38 | 39 | class msg_upload_model_rsp_tx : public msg_protobuf_tx<RSP_UPLOAD_MODEL, UploadModelResponseProto, serverapi::UploadModelResponse> { 40 | public: 41 | virtual void set(serverapi::UploadModelResponse& response); 42 | }; 43 | 44 | class msg_upload_model_rsp_rx : public msg_protobuf_rx<RSP_UPLOAD_MODEL, UploadModelResponseProto, serverapi::UploadModelResponse> { 45 | public: 46 | virtual void get(serverapi::UploadModelResponse& response); 47 | }; 48 | 49 | class msg_close_req_tx : public msg_protobuf_tx<REQ_CLOSE, CloseRequestProto, serverapi::CloseRequest> { 50 | public: 51 | virtual void set(serverapi::CloseRequest& request); 52 | }; 53 | 54 | class msg_close_req_rx : public msg_protobuf_rx<REQ_CLOSE, CloseRequestProto, serverapi::CloseRequest> { 55 | public: 56 | virtual void get(serverapi::CloseRequest& request); 57 | }; 58 | 59 | class msg_close_rsp_tx : public msg_protobuf_tx<RSP_CLOSE, CloseResponseProto, serverapi::CloseResponse> { 60 | public: 61 | virtual void set(serverapi::CloseResponse& response); 62 | }; 63 | 64 | class msg_close_rsp_rx : public msg_protobuf_rx<RSP_CLOSE, CloseResponseProto, serverapi::CloseResponse> { 65 | public: 66 | virtual void get(serverapi::CloseResponse& response); 67 | }; 68 | 69 | } 70 | 71 | -------------------------------------------------------------------------------- /src/network/session.cpp: -------------------------------------------------------------------------------- 1 | #include <network/session.h> 2 | #include <network/server_api.h> 3 | 4 | #include <iostream> 5 | 6 | namespace network { 7 | 8 | SrvSession::SrvSession(boost::asio::io_service& io_service, MessageQueue& messages) 9 | : Session(io_service), 10 | messages_(messages) {}; 11 | 12 | message_rx* SrvSession::new_rx_message(uint64_t hdr_len, uint64_t body_len, 13 | uint64_t req_id, uint64_t msg_type) { 14 | message_rx* msg_rx = nullptr; 15 | 16 | if (msg_type == REQ_INFERENCE) { 17 | auto msg = new msg_inference_req_rx(); 18 | msg->set_req_id(req_id); 19 | msg->set_body_len(body_len); 20 | 21 | msg_rx = msg; 22 | } 23 | else if (msg_type == REQ_UPLOAD_MODEL) { 24 | auto msg = new msg_upload_model_req_rx(); 25 | msg->set_req_id(req_id); 26 | 27 | msg_rx = msg; 28 | } 29 | else if(msg_type == REQ_CLOSE) { 30 | auto msg = new msg_close_req_rx(); 31 | msg->set_req_id(req_id); 32 | 33 | msg_rx = msg; 34 | } 35 | 36 | return msg_rx; 37 | } 38 | 39 | bool SrvSession::completed_receive(message_connection* conn, message_rx* req) { 40 | bool is_continue = true; 41 | 42 | if (auto infer = dynamic_cast<msg_inference_req_rx*>(req)) { 43 | auto request = new serverapi::InferenceRequest(); 44 | infer->get(*request); 45 | 46 | messages_.push({this, request}); 47 | } 48 | else if (auto upload_model = dynamic_cast<msg_upload_model_req_rx*>(req)) { 49 | auto request = new serverapi::UploadModelRequest(); 50 | upload_model->get(*request); 51 | 52 | messages_.push({this, request}); 53 | } 54 | else if (auto close = dynamic_cast<msg_close_req_rx*>(req)) { 55 | auto response = new serverapi::CloseResponse(); 56 | 57 | response->req_id = req->get_rx_req_id(); 58 | 59 | send_response(response); 60 | 61 | is_continue = false; 62 | } 63 | 64 | delete req; 65 | 66 | return is_continue; 67 | } 68 | 69 | void SrvSession::completed_transmit(message_connection* conn, message_tx* req) { 70 | } 71 | 72 | void SrvSession::send_response(serverapi::Response* response) { 73 | message_tx* msg_tx = nullptr; 74 | 75 | if (auto infer = dynamic_cast<serverapi::InferenceResponse*>(response)) { 76 | auto infer_rsp = new msg_inference_rsp_tx(); 77 | 78 | infer_rsp->set(*infer); 79 | msg_tx = infer_rsp; 80 | } 81 | 
else if (auto upload_model = dynamic_cast<serverapi::UploadModelResponse*>(response)) { 82 | auto upload_model_rsp = new msg_upload_model_rsp_tx(); 83 | 84 | upload_model_rsp->set(*upload_model); 85 | msg_tx = upload_model_rsp; 86 | } 87 | else if (auto close = dynamic_cast<serverapi::CloseResponse*>(response)) { 88 | auto close_rsp = new msg_close_rsp_tx(); 89 | 90 | close_rsp->set(*close); 91 | msg_tx = close_rsp; 92 | } 93 | 94 | msg_tx_.send_message(*msg_tx); 95 | } 96 | 97 | ClientSession::ClientSession(boost::asio::io_service& io_service) 98 | : Session(io_service), 99 | request_seed_id(0), 100 | received_rsp_cnt(0) {} 101 | 102 | std::future<serverapi::Response*> ClientSession::send_request_async(serverapi::Request& request, std::function<void(serverapi::Response*)> onSuccess) { 103 | auto promise = std::make_shared<std::promise<serverapi::Response*>>(); 104 | auto cb = [this, promise, onSuccess](serverapi::Response* response) { 105 | onSuccess(response); 106 | promise->set_value(response); 107 | }; 108 | 109 | message_tx* msg_tx = nullptr; 110 | 111 | uint64_t request_id = request_seed_id++; 112 | 113 | request.req_id = request_id; 114 | requests[request_id] = cb; 115 | 116 | if (auto infer = dynamic_cast<serverapi::InferenceRequest*>(&request)) { 117 | auto infer_req = new msg_inference_req_tx(); 118 | 119 | infer_req->set(*infer); 120 | msg_tx = infer_req; 121 | } 122 | else if (auto upload_model = dynamic_cast<serverapi::UploadModelRequest*>(&request)) { 123 | auto upload_model_req = new msg_upload_model_req_tx(); 124 | 125 | upload_model_req->set(*upload_model); 126 | msg_tx = upload_model_req; 127 | } 128 | else if (auto close = dynamic_cast<serverapi::CloseRequest*>(&request)) { 129 | auto close_req = new msg_close_req_tx(); 130 | 131 | close_req->set(*close); 132 | msg_tx = close_req; 133 | } 134 | 135 | msg_tx_.send_message(*msg_tx); 136 | 137 | return promise->get_future(); 138 | } 139 | 140 | serverapi::Response* ClientSession::send_request(serverapi::Request& request, std::function<void(serverapi::Response*)> onSuccess) { 141 | return send_request_async(request, onSuccess).get(); 142 | } 143 | 144 | void ClientSession::await_completion() { 145 | while (request_seed_id > received_rsp_cnt) { 146 | usleep(100000); 147 | } 148 | 149 | return; 150 | } 151 | 152 | message_rx* ClientSession::new_rx_message(uint64_t hdr_len, uint64_t body_len, 153 | uint64_t req_id, uint64_t msg_type) { 154 | message_rx* msg_rx = nullptr; 155 | 156 | if (msg_type == RSP_INFERENCE) { 157 | auto msg = new msg_inference_rsp_rx(); 158 | msg->set_req_id(req_id); 159 | 160 | msg_rx = msg; 161 | } 162 | else if (msg_type == RSP_UPLOAD_MODEL) { 163 | auto msg = new msg_upload_model_rsp_rx(); 164 | msg->set_req_id(req_id); 165 | 166 | msg_rx = msg; 167 | } 168 | else if (msg_type == RSP_CLOSE) { 169 | auto msg = new msg_close_rsp_rx(); 170 | msg->set_req_id(req_id); 171 | 172 | msg_rx = msg; 173 | } 174 | 175 | return msg_rx; 176 | } 177 | 178 | bool ClientSession::completed_receive(message_connection* conn, message_rx* req) { 179 | uint64_t req_id = req->get_rx_req_id(); 180 | serverapi::Response* response = nullptr; 181 | bool is_continue = true; 182 | 183 | if (auto infer = dynamic_cast<msg_inference_rsp_rx*>(req)) { 184 | auto response_ = new serverapi::InferenceResponse(); 185 | infer->get(*response_); 186 | response = response_; 187 | } 188 | else if (auto upload_model = dynamic_cast<msg_upload_model_rsp_rx*>(req)) { 189 | auto response_ = new serverapi::UploadModelResponse(); 190 | upload_model->get(*response_); 191 | response = response_; 192 | } 193 | else if (auto close = dynamic_cast<msg_close_rsp_rx*>(req)) { 194 | auto response_ = new serverapi::CloseResponse(); 195 | close->get(*response_); 196 | 197 | is_continue = false; 198 | response = response_; 199 | } 200 | 201 | requests[req_id](response); 202 | received_rsp_cnt++; 203 | 204 | delete 
req; 205 | 206 | return is_continue; 207 | } 208 | 209 | void ClientSession::completed_transmit(message_connection* conn, message_tx* req) { 210 | } 211 | 212 | } 213 | -------------------------------------------------------------------------------- /src/network/session.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <future> 3 | #include <map> 4 | 5 | #include <network/network.h> 6 | #include <network/server_api.h> 7 | #include <server_api.h> 8 | 9 | #include "deepcache.pb.h" 10 | #include "tbb/concurrent_queue.h" 11 | 12 | namespace network { 13 | 14 | class Session : public message_connection, message_handler { 15 | public: 16 | Session(boost::asio::io_service& io_service) 17 | : message_connection(io_service, *this), 18 | msg_tx_(this, *this) {}; 19 | 20 | virtual message_rx* new_rx_message(uint64_t hdr_len, uint64_t body_len, 21 | uint64_t req_id, uint64_t msg_type) = 0; 22 | 23 | virtual bool completed_receive(message_connection* conn, message_rx* req) = 0; 24 | 25 | virtual void completed_transmit(message_connection* conn, message_tx* req) = 0; 26 | 27 | protected: 28 | message_sender msg_tx_; 29 | }; 30 | 31 | class SrvSession; 32 | 33 | struct Message { SrvSession* srv_session; serverapi::Request* req; }; 34 | 35 | typedef tbb::concurrent_bounded_queue<Message> MessageQueue; 36 | 37 | class SrvSession : public Session { 38 | public: 39 | SrvSession(boost::asio::io_service& io_service, 40 | MessageQueue& messages); 41 | 42 | void send_response(serverapi::Response* response); 43 | 44 | message_rx* new_rx_message(uint64_t hdr_len, uint64_t body_len, 45 | uint64_t req_id, uint64_t msg_type); 46 | 47 | bool completed_receive(message_connection* conn, message_rx* req); 48 | 49 | void completed_transmit(message_connection* conn, message_tx* req); 50 | 51 | private: 52 | MessageQueue& messages_; 53 | }; 54 | 55 | class ClientSession : public Session { 56 | public: 57 | ClientSession(boost::asio::io_service& io_service); 58 | 59 | std::future<serverapi::Response*> send_request_async(serverapi::Request& request, std::function<void(serverapi::Response*)> onSuccess); 60 | 61 | serverapi::Response* send_request(serverapi::Request& request, std::function<void(serverapi::Response*)> onSuccess); 62 | 63 | void await_completion(); 64 | 65 | message_rx* new_rx_message(uint64_t hdr_len, uint64_t body_len, 66 | uint64_t req_id, uint64_t msg_type); 67 | 68 | bool completed_receive(message_connection* conn, message_rx* req); 69 | 70 | void completed_transmit(message_connection* conn, message_tx* req); 71 | 72 | private: 73 | std::atomic_int request_seed_id; 74 | std::atomic_int received_rsp_cnt; 75 | std::map<uint64_t, std::function<void(serverapi::Response*)>> requests; 76 | }; 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/server.cpp: -------------------------------------------------------------------------------- 1 | #include <server/server.h> 2 | #include <csignal> 3 | 4 | class InterruptException : public std::exception 5 | { 6 | public: 7 | InterruptException(int s) : S(s) {} 8 | int S; 9 | }; 10 | 11 | void sig_to_exception(int s) 12 | { 13 | throw InterruptException(s); 14 | } 15 | 16 | int main(int argc, char** argv) { 17 | { 18 | // set up SIGINT handling so Ctrl-C is turned into an exception 19 | struct sigaction sigIntHandler; 20 | sigIntHandler.sa_handler = sig_to_exception; 21 | sigemptyset(&sigIntHandler.sa_mask); 22 | sigIntHandler.sa_flags = 0; 23 | sigaction(SIGINT, &sigIntHandler, NULL); 24 | } 25 | 26 | Server* server; 27 | try { 28 | server = new Server(DEFAULT_PORT); 29 | server->run(); 30 | } 31 | catch(InterruptException& e) { 32 | server->shutdown(); 33 | } 34 | catch (std::exception& e) { 35 | std::cerr << "Exception: " << 
e.what() << "\n"; 36 | } 37 | 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /src/server/controller.cpp: -------------------------------------------------------------------------------- 1 | #include <server/controller.h> 2 | #include <server/worker.h> 3 | #include <deepplan/engine.h> 4 | #include <torch/cuda.h> 5 | #include <iostream> 6 | 7 | #include <functional> 8 | 9 | Controller::Controller(network::MessageQueue& messages) 10 | : messages_(messages), 11 | alive(false) {init();}; 12 | 13 | void Controller::init() { 14 | deepplan::Init(); 15 | 16 | alive = true; 17 | ctrl_thr = std::thread(std::bind(&Controller::run, this)); 18 | 19 | int n_devices = torch::cuda::device_count(); 20 | workers.resize(n_devices); 21 | for (int i = 0; i < workers.size(); i++) { 22 | workers[i] = new Worker(i); 23 | } 24 | } 25 | 26 | void Controller::run() { 27 | while (alive) { 28 | network::Message message; 29 | 30 | if (messages_.try_pop(message)) { 31 | if (auto infer = dynamic_cast<serverapi::InferenceRequest*>(message.req)) { 32 | int model_id = infer->model_id; 33 | int n_workers = workers.size(); 34 | int worker_id; 35 | 36 | worker_id = model_id % n_workers; 37 | infer->model_id = model_id / n_workers; 38 | 39 | auto cb = [message](serverapi::InferenceResponse* response) { 40 | message.srv_session->send_response(response); 41 | }; 42 | 43 | workers[worker_id]->infer(infer, cb); 44 | } 45 | else if (auto upload_model = dynamic_cast<serverapi::UploadModelRequest*>(message.req)) { 46 | std::vector<std::string> model_names = upload_model->model_names; 47 | int n_models = upload_model->n_models; 48 | EngineType engine_type = static_cast<EngineType>(upload_model->engine_type); 49 | int mp_size = upload_model->mp_size; 50 | 51 | auto response = new serverapi::UploadModelResponse(); 52 | 53 | setup_models(model_names, n_models, engine_type, mp_size); 54 | 55 | response->req_id = upload_model->req_id; 56 | message.srv_session->send_response(response); 57 | } 58 | } 59 | } 60 | } 61 | 62 | void Controller::setup_models(std::vector<std::string> model_names, int n_models, EngineType engine_type, int mp_size) { 63 | bool should_setup = false; 64 | 65 | // Re-initialize only if the requested configuration differs from the current one 66 | if ((model_names_ != model_names) || 67 | (n_models_ < n_models) || 68 | (engine_type_ != engine_type) || 69 | (mp_size_ != mp_size)) { 70 | should_setup = true; 71 | } 72 | 73 | if (should_setup) { 74 | int n_workers = workers.size(); 75 | int n_models_per_worker = n_models / n_workers; 76 | std::vector<std::vector<int>> partitions(n_workers); 77 | 78 | for (int i = 0; i < n_workers; i++) { 79 | std::vector<int> p; 80 | for (int d = 0; d < mp_size; d++) 81 | p.push_back((i + 2*d) % n_workers); 82 | 83 | partitions[i] = p; 84 | } 85 | 86 | std::cout << "Models setup...\n"; 87 | for (int i = 0; i < n_workers; i++) { 88 | workers[i]->reset_model(); 89 | workers[i]->init_model(model_names, n_models_per_worker, 90 | engine_type, partitions[i]); 91 | } 92 | 93 | model_names_ = model_names; 94 | n_models_ = n_models; 95 | engine_type_ = engine_type; 96 | mp_size_ = mp_size; 97 | 98 | std::cout << "Model setup complete\n"; 99 | } 100 | else return; 101 | 102 | } 103 | 104 | void Controller::shutdown() { 105 | alive = false; 106 | if (ctrl_thr.joinable()) 107 | ctrl_thr.join(); 108 | 109 | for (auto worker : workers) 110 | worker->stop(); 111 | } 112 | -------------------------------------------------------------------------------- /src/server/controller.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <atomic> 4 | #include <thread> 5 | #include <vector> 6 | #include <string> 7 | 8 | #include <network/session.h> 9 | #include <server/worker.h> 10 | 11 | class Controller { 12 | public: 
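// Dispatches client requests to the per-GPU workers: an inference for model_id goes to worker (model_id % n_workers), and an upload request (re)builds every worker's model set.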
13 | Controller(network::MessageQueue& messages); 14 | 15 | void init(); 16 | 17 | void run(); 18 | 19 | void shutdown(); 20 | 21 | void setup_models(std::vector<std::string> model_names, int n_models, EngineType engine_type, int mp_size); 22 | 23 | private: 24 | std::atomic_bool alive; 25 | 26 | std::vector<Worker*> workers; 27 | 28 | network::MessageQueue& messages_; 29 | 30 | std::thread ctrl_thr; 31 | 32 | std::vector<std::string> model_names_; 33 | int n_models_ = 0; 34 | int mp_size_ = 0; 35 | EngineType engine_type_ = EngineType::NONE; 36 | }; 37 | -------------------------------------------------------------------------------- /src/server/model_manager.cpp: -------------------------------------------------------------------------------- 1 | #include <server/model_manager.h> 2 | #include <deepplan/model.h> 3 | #include <c10/cuda/CUDACachingAllocator.h> 4 | 5 | void ModelManager::add_model(std::string model_name, std::vector<int> devices) { 6 | deepplan::Model* model = new deepplan::Model(model_name, engine_type, devices); 7 | 8 | models.push_back(model); 9 | } 10 | 11 | deepplan::Model* ModelManager::get_model(int model_id) { 12 | return models[model_id]; 13 | } 14 | 15 | void ModelManager::clear() { 16 | for (auto model : models) { 17 | model->clear(); 18 | delete model; 19 | } 20 | models.clear(); } 21 | 22 | size_t getDeviceActiveMemorySize(int device){ 23 | using c10::cuda::CUDACachingAllocator::StatArray; 24 | using c10::cuda::CUDACachingAllocator::DeviceStats; 25 | 26 | const DeviceStats stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device); 27 | 28 | return stats.active_bytes[0].current; 29 | } 30 | -------------------------------------------------------------------------------- /src/server/model_manager.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <vector> 4 | #include <deepplan/model.h> 5 | 6 | size_t getDeviceActiveMemorySize(int device); 7 | 8 | class ModelManager { 9 | public: 10 | ModelManager(EngineType engine_type) 11 | : engine_type(engine_type) {}; 12 | 13 | void add_model(std::string model_name, std::vector<int> devices); 14 | 15 | deepplan::Model* get_model(int model_id); 16 | 17 | void clear(); 18 | 19 | EngineType engine_type; 20 | 21 | private: 22 | std::vector<deepplan::Model*> models; 23 | }; 24 | -------------------------------------------------------------------------------- /src/server/server.cpp: -------------------------------------------------------------------------------- 1 | #include <server/server.h> 2 | #include <server/controller.h> 3 | 4 | #include <iostream> 5 | 6 | Server::Server(int port) 7 | : io_service_(), 8 | acceptor_(io_service_, boost::asio::ip::tcp::endpoint(boost::asio::ip::tcp::v4(), port)), 9 | alive(false) {}; 10 | 11 | Server::~Server() { 12 | shutdown(); 13 | } 14 | 15 | void Server::run() { 16 | controller = new Controller(messages); 17 | 18 | start_accept(); 19 | 20 | alive = true; 21 | 22 | std::cout << "Server Ready\n"; 23 | io_service_.run(); 24 | } 25 | 26 | void Server::shutdown() { 27 | if (alive) { 28 | std::cout << "Closing Server\n"; 29 | alive = false; 30 | controller->shutdown(); 31 | 32 | boost::system::error_code ec; 33 | acceptor_.close(ec); 34 | if (ec){ 35 | std::cerr << "Acceptor Error occurred\n"; 36 | } 37 | // TODO: close any still-connected session as well. 
38 | } 39 | } 40 | 41 | void Server::send_response(serverapi::Response* response) { 42 | current_session->send_response(response); 43 | } 44 | 45 | void Server::start_accept() { 46 | network::SrvSession* new_session = new network::SrvSession(io_service_, messages); 47 | acceptor_.async_accept(new_session->get_socket(), 48 | boost::bind(&Server::handle_accept, this, new_session, 49 | boost::asio::placeholders::error)); 50 | } 51 | 52 | void Server::handle_accept(network::SrvSession* new_session, 53 | const boost::system::error_code& error) { 54 | if (error) { 55 | std::cerr << "[Error] " << error.message() << std::endl; 56 | delete new_session; 57 | return; 58 | } 59 | 60 | // FIXME: Should be able to handle multiple clients 61 | new_session->established(); 62 | current_session = new_session; 63 | 64 | // Post another async accept for the next client connection. 65 | start_accept(); 66 | } 67 | -------------------------------------------------------------------------------- /src/server/server.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <server/controller.h> 4 | #include <network/session.h> 5 | #include <network/message.h> 6 | 7 | #include <server_api.h> 8 | #include <boost/asio.hpp> 9 | #include <atomic> 10 | 11 | #include "tbb/concurrent_queue.h" 12 | 13 | #define DEFAULT_PORT 4321 14 | 15 | class Server { 16 | public: 17 | Server(int port); 18 | 19 | ~Server(); 20 | 21 | void init(); 22 | 23 | void run(); 24 | 25 | void send_response(serverapi::Response* response); 26 | 27 | void shutdown(); 28 | 29 | private: 30 | void start_accept(); 31 | 32 | void handle_accept(network::SrvSession* new_session, const boost::system::error_code& error); 33 | 34 | boost::asio::io_service io_service_; 35 | boost::asio::ip::tcp::acceptor acceptor_; 36 | 37 | network::MessageQueue messages; 38 | 39 | network::SrvSession* current_session; 40 | 41 | Controller* controller; 42 | 43 | std::atomic_bool alive; 44 | }; 45 | -------------------------------------------------------------------------------- /src/server/worker.cpp: -------------------------------------------------------------------------------- 1 | #include <server/worker.h> 2 | #include <server/model_manager.h> 3 | #include <deepplan/model.h> 4 | #include <torch/torch.h> 5 | #include <cuda_runtime.h> 6 | 7 | Worker::Worker(int device) 8 | : device(at::kCUDA, device), 9 | alive(true) { 10 | worker_thr = std::thread(std::bind(&Worker::run, this)); 11 | } 12 | 13 | void Worker::run() { 14 | torch::NoGradGuard no_grad; 15 | 16 | InferTask task; 17 | 18 | while (alive) { 19 | while (queue_.try_pop(task)) { // busy-polls the task queue until stop() 20 | auto request = task.request; 21 | auto response = new serverapi::InferenceResponse(); 22 | bool is_cold = false; 23 | 24 | int model_id = request->model_id; 25 | deepplan::Model* model; 26 | 27 | if (running_models->exist(model_id)) { 28 | model = running_models->get(model_id); 29 | } 30 | else { 31 | auto new_model = model_manager->get_model(request->model_id); 32 | 33 | while (getDeviceActiveMemorySize(device.index()) >= capacity_) { // evict LRU models until under capacity 34 | auto evict_model = running_models->pop(); 35 | evict_model->clear(); 36 | } 37 | 38 | is_cold = true; 39 | running_models->put(model_id, new_model); 40 | model = new_model; 41 | } 42 | 43 | ScriptModuleInput inputs; 44 | 45 | for (auto input_config : model->input_configs) { 46 | inputs.push_back( 47 | input_config.get(request->input, request->batch_size).to(device)); 48 | } 49 | 50 | model->forward(inputs); 51 | 52 | torch::cuda::synchronize(device.index()); 53 | 54 | response->req_id = request->req_id; 55 | response->is_cold = is_cold; 56 | task.cb(response); 57 | } 58 | } 59 | } 60 | 61 | void Worker::init_model(std::vector<std::string> model_names,
int n_models, 62 | EngineType engine_type, std::vector<int> devices) { 63 | if (model_manager == nullptr) { 64 | size_t free; 65 | size_t total; 66 | size_t padding_size = (size_t)(5.5 * (1 << 30)); // reserve 5.5 GiB of headroom 67 | int n_models_per = n_models / model_names.size(); 68 | 69 | model_manager = new ModelManager(engine_type); 70 | for (auto model_name : model_names) 71 | for (int i = 0; i < n_models_per; i++) 72 | model_manager->add_model(model_name, devices); 73 | 74 | cudaError_t err = cudaMemGetInfo(&free, &total); 75 | if (err != cudaSuccess) { 76 | throw std::runtime_error("cudaMemGetInfo failed"); 77 | } 78 | 79 | capacity_ = total - padding_size; 80 | running_models = new LRUCache<int, deepplan::Model*>(); 81 | } 82 | } 83 | 84 | void Worker::reset_model() { 85 | if (model_manager) { 86 | model_manager->clear(); 87 | delete model_manager; 88 | model_manager = nullptr; 89 | delete running_models; running_models = nullptr; 90 | } 91 | } 92 | 93 | void Worker::stop() { 94 | alive = false; 95 | if (worker_thr.joinable()) 96 | worker_thr.join(); 97 | 98 | reset_model(); 99 | } 100 | 101 | void Worker::infer( 102 | serverapi::InferenceRequest* request, 103 | std::function<void(serverapi::InferenceResponse*)> cb) { 104 | InferTask task(request, cb); 105 | queue_.push(task); 106 | } 107 | -------------------------------------------------------------------------------- /src/server/worker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <server/model_manager.h> 3 | #include <server_api.h> 4 | #include <list> 5 | #include <unordered_map> 6 | #include "tbb/concurrent_queue.h" 7 | 8 | struct InferTask { 9 | InferTask() {}; 10 | InferTask( 11 | serverapi::InferenceRequest* request, 12 | std::function<void(serverapi::InferenceResponse*)> cb) 13 | : request(request), 14 | cb(cb) {}; 15 | 16 | serverapi::InferenceRequest* request; 17 | std::function<void(serverapi::InferenceResponse*)> cb; 18 | }; 19 | 20 | template <typename K, typename V> 21 | class LRUCache { 22 | public: 23 | bool put(const K& k, const V& v) { 24 | if(exist(k)) { 25 | return false; 26 | } 27 | 28 | items.emplace_front(k, v); 29 | 30 | index.emplace(k, items.begin()); return true; 31 | } 32 | 33 | bool exist(const K& k) { 34 | return (index.count(k)>0); 35 | } 36 | 37 | V get(const K& k) { 38 | assert(exist(k)); 39 | auto itr = index.find(k); 40 | 41 | items.splice(items.begin(), items, itr->second); // move to the front (most recently used) 42 | 43 | return itr->second->second; 44 | } 45 | 46 | V pop() { // evicts the least recently used entry; cache must not be empty 47 | auto v = items.back().second; 48 | index.erase(items.back().first); 49 | items.pop_back(); 50 | 51 | return v; 52 | } 53 | 54 | size_t size() { 55 | return index.size(); 56 | } 57 | private: 58 | std::list<std::pair<K, V>> items; 59 | 60 | std::unordered_map<K, typename std::list<std::pair<K, V>>::iterator> index; 61 | }; 62 | 63 | class Worker { 64 | public: 65 | Worker(int device); 66 | 67 | void run(); 68 | 69 | void infer( 70 | serverapi::InferenceRequest* request, 71 | std::function<void(serverapi::InferenceResponse*)> cb); 72 | 73 | void init_model(std::vector<std::string> model_names, int n_models, 74 | EngineType engine_type, std::vector<int> devices); 75 | 76 | void reset_model(); 77 | 78 | void stop(); 79 | 80 | at::Device device; 81 | 82 | private: 83 | size_t capacity_; 84 | std::atomic_bool alive; 85 | std::thread worker_thr; 86 | ModelManager* model_manager = nullptr; 87 | LRUCache<int, deepplan::Model*>* running_models; 88 | tbb::concurrent_queue<InferTask> queue_; 89 | }; 90 | -------------------------------------------------------------------------------- /src/server_api.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace serverapi { 4 | 5 | struct Request { 6 | public: 7 | virtual ~Request() {}; 8 | uint64_t req_id; 9 | }; 10 | 11 | struct Response { 12 | public: 13 | virtual ~Response() {}; 14 | uint64_t req_id; 15 | }; 16 | 17 | struct
InferenceRequest : public Request { 18 | public: 19 | uint32_t model_id; 20 | uint32_t batch_size; 21 | size_t input_size; 22 | void* input; 23 | }; 24 | 25 | struct InferenceResponse : public Response { 26 | public: 27 | bool is_cold; 28 | }; 29 | 30 | struct UploadModelRequest : public Request { 31 | public: 32 | std::vector<std::string> model_names; 33 | uint32_t n_models; 34 | uint32_t engine_type; 35 | uint32_t mp_size; 36 | }; 37 | 38 | struct UploadModelResponse : public Response { 39 | }; 40 | 41 | struct CloseRequest : public Request { 42 | }; 43 | 44 | struct CloseResponse : public Response { 45 | }; 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/util.cpp: -------------------------------------------------------------------------------- 1 | #include <util.h> 2 | 3 | namespace util { 4 | 5 | std::uint64_t now() { 6 | return nanos(hrt()); 7 | } 8 | 9 | time_point hrt() 10 | { 11 | return std::chrono::steady_clock::now(); 12 | } 13 | 14 | time_point epoch = hrt(); 15 | 16 | uint64_t epoch_time = std::chrono::duration_cast<std::chrono::nanoseconds>( 17 | std::chrono::system_clock::now().time_since_epoch()).count(); 18 | 19 | 20 | std::uint64_t nanos(time_point t) { 21 | return std::chrono::duration_cast<std::chrono::nanoseconds>(t - epoch).count() + epoch_time; 22 | } 23 | 24 | size_t getModuleSize(ScriptModule module, bool ignore_cuda) { 25 | size_t size = 0; 26 | for (auto param : module.parameters()) { 27 | if (ignore_cuda && param.is_cuda()) continue; 28 | size += param.nbytes(); 29 | } 30 | return size; 31 | } 32 | 33 | 34 | InputGenerator::InputGenerator(const char* model_repo) 35 | : model_repo_(model_repo) {assert(model_repo);} 36 | 37 | InputGenerator::InputGenerator() 38 | : model_repo_(getenv("PLAN_REPO")) {assert(model_repo_);} 39 | 40 | 41 | void InputGenerator::extend_rdata(DataType data_type, size_t size) { 42 | auto it = rdata_map.find(data_type); 43 | auto& data = it->second; 44 | std::vector<char> rdata(ALIGN(size)); // heap buffer (a VLA here is not standard C++) 45 | 46 | switch (data_type) { 47 | case TYPE_FP32: 48 | { 49 | for (size_t i = 0; i < size; i += sizeof(float)) { 50 | float value = (float)rand() / RAND_MAX; 51 | memcpy(rdata.data()+i, &value, sizeof(float)); 52 | } 53 | break; 54 | } 55 | case TYPE_INT64: 56 | { 57 | for (size_t i = 0; i < size; i += sizeof(int64_t)) { 58 | int64_t value = (int64_t)rand() % 30522; // BERT vocabulary size 59 | memcpy(rdata.data()+i, &value, sizeof(int64_t)); 60 | } 61 | break; 62 | } 63 | default: 64 | throw std::runtime_error("Incorrect DataType"); 65 | break; 66 | } 67 | 68 | data.insert(data.begin(), rdata.begin(), rdata.end()); 69 | } 70 | 71 | void InputGenerator::generate_rdata(size_t size, DataType data_type, char** buf_ptr) { 72 | *buf_ptr = new char[size]; 73 | generate_rdata(size, data_type, *buf_ptr); 74 | } 75 | 76 | void InputGenerator::generate_rdata(size_t size, DataType data_type, char* buf) { 77 | auto it = rdata_map.find(data_type); 78 | std::vector<char> rdata; 79 | 80 | if (it == rdata_map.end()) { 81 | it = rdata_map.insert({data_type, {}}).first; 82 | } 83 | 84 | if (it->second.size() < size) { 85 | size_t extend_size = std::max((size_t)STEP_SIZE, size); 86 | extend_rdata(data_type, extend_size); 87 | } 88 | 89 | rdata = it->second; 90 | 91 | // TODO select random range 92 | std::memcpy(buf, rdata.data(), size); 93 | } 94 | 95 | void InputGenerator::add_input_config(const std::string& model_name) { 96 | std::vector<InputConfig> input_configs; 97 | ModelConfig model_config; 98 | std::string model_prefix; 99 | std::string config_path; 100 | 101 | model_prefix = std::string(model_repo_) + "/" + model_name; 102 | config_path = model_prefix +
"/config.pbtxt"; 103 | 104 | try { 105 | if (!util::read_from_pbtxt(model_config, config_path)) { 106 | std::stringstream msg; 107 | msg << "Failed to read " << config_path; 108 | throw std::runtime_error(msg.str()); 109 | } 110 | } 111 | catch (const std::exception& e) { 112 | std::cerr << e.what() << "\n"; 113 | throw e; 114 | } 115 | 116 | for (auto io : model_config.inputs()) { 117 | input_configs.emplace_back(io); 118 | } 119 | 120 | input_config_map.insert(std::make_pair(model_name, input_configs)); 121 | } 122 | 123 | void InputGenerator::generate_input(std::string model_name, int batch_size, ScriptModuleInput* out) { 124 | ScriptModuleInput inputs; 125 | std::vector input_configs; 126 | 127 | auto it = input_config_map.find(model_name); 128 | if (it == input_config_map.end()) { 129 | add_input_config(model_name); 130 | it = input_config_map.find(model_name); 131 | } 132 | 133 | input_configs = it->second; 134 | 135 | for (auto input_config : input_configs) { 136 | auto shape = input_config.shape; 137 | shape.insert(shape.begin(), batch_size); 138 | 139 | size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); 140 | char* data; 141 | 142 | auto options = torch::TensorOptions(); 143 | switch (input_config.data_type) { 144 | case TYPE_FP32: 145 | size *= sizeof(float); 146 | options = options.dtype(torch::kFloat32); 147 | break; 148 | case TYPE_INT64: 149 | size *= sizeof(int64_t); 150 | options = options.dtype(torch::kInt64); 151 | break; 152 | } 153 | generate_rdata(size, input_config.data_type, &data); 154 | 155 | inputs.push_back(torch::from_blob(data, shape, options)); 156 | } 157 | 158 | 159 | *out = inputs; 160 | } 161 | 162 | // FIXME: Maybe convert double ptr 163 | void InputGenerator::generate_input(std::string model_name, int batch_size, std::vector* out) { 164 | std::vector inputs; 165 | std::vector input_configs; 166 | 167 | auto it = input_config_map.find(model_name); 168 | if (it == input_config_map.end()) { 169 | add_input_config(model_name); 170 | it = input_config_map.find(model_name); 171 | } 172 | 173 | input_configs = it->second; 174 | 175 | for (auto input_config : input_configs) { 176 | auto shape = input_config.shape; 177 | shape.insert(shape.begin(), batch_size); 178 | 179 | size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); 180 | char* data; 181 | 182 | switch (input_config.data_type) { 183 | case TYPE_FP32: 184 | size *= sizeof(float); 185 | break; 186 | case TYPE_INT64: 187 | size *= sizeof(int64_t); 188 | break; 189 | } 190 | 191 | generate_rdata(size, input_config.data_type, &data); 192 | inputs.insert(inputs.end(), data, data+size); 193 | } 194 | 195 | *out = inputs; 196 | } 197 | 198 | } 199 | -------------------------------------------------------------------------------- /src/util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | typedef std::vector ScriptModuleInput; 13 | typedef torch::jit::script::Module ScriptModule; 14 | 15 | #define STEP_SIZE (1024*1024) 16 | #define ALIGNMENT 8 17 | #define ALIGN(size) (((size) + (ALIGNMENT-1)) & ~(ALIGNMENT-1)) 18 | 19 | typedef enum 20 | { 21 | IN_MEMORY = 0, 22 | ON_DEMAND, 23 | PIPESWITCH, 24 | DEEPPLAN, 25 | DEEPCACHE, 26 | NONE, 27 | } EngineType; 28 | 29 | struct InputConfig { 30 | public: 31 | InputConfig(ModelInput io) 32 | : shape(io.shape().begin(), io.shape().end()), 33 | 
data_type(io.data_type()) {}; 34 | 35 | std::vector<int64_t> shape; 36 | DataType data_type; 37 | 38 | at::Tensor get(void* data, int batch_size) { 39 | auto shape_ = shape; 40 | shape_.insert(shape_.begin(), batch_size); 41 | 42 | auto options = torch::TensorOptions(); 43 | switch (data_type) { 44 | case TYPE_FP32: 45 | options = options.dtype(torch::kFloat32); 46 | break; 47 | case TYPE_INT64: 48 | options = options.dtype(torch::kInt64); 49 | break; 50 | } 51 | return torch::from_blob(data, shape_, options); 52 | } 53 | }; 54 | 55 | namespace util { 56 | 57 | typedef std::chrono::steady_clock::time_point time_point; 58 | 59 | time_point hrt(); 60 | 61 | std::uint64_t now(); 62 | 63 | std::uint64_t nanos(time_point t); 64 | 65 | template <typename T> 66 | bool read_from_pbtxt(T& config, const std::string path) { 67 | std::ifstream fin(path); 68 | if (!fin.is_open()) return false; 69 | std::stringstream ss; 70 | ss << fin.rdbuf(); 71 | return google::protobuf::TextFormat::ParseFromString(ss.str(), &config); 72 | } 73 | 74 | size_t getModuleSize(ScriptModule module, bool ignore_cuda=false); 75 | 76 | class InputGenerator { 77 | public: 78 | InputGenerator(); 79 | 80 | InputGenerator(const char* model_repo); 81 | 82 | void generate_input(std::string model_name, int batch_size, ScriptModuleInput* out); 83 | 84 | void generate_input(std::string model_name, int batch_size, std::vector<char>* out); 85 | 86 | private: 87 | void generate_rdata(size_t size, DataType data_type, char** buf_ptr); 88 | 89 | void generate_rdata(size_t size, DataType data_type, char* buf); 90 | 91 | void extend_rdata(DataType data_type, size_t size); 92 | 93 | void add_input_config(const std::string& model_name); 94 | 95 | std::map<std::string, std::vector<InputConfig>> input_config_map; 96 | std::map<DataType, std::vector<char>> rdata_map; 97 | 98 | const char* model_repo_; 99 | }; 100 | 101 | } // namespace util 102 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import numpy as np 4 | from google.protobuf import text_format 5 | 6 | def read_from_pbtxt(config, path): 7 | if os.path.isfile(path): 8 | with open(path, 'r') as f: 9 | text_format.Parse(f.read(), config) 10 | 11 | 12 | def write_to_pbtxt(config, path): 13 | with open(path, 'w') as f: 14 | f.write(text_format.MessageToString(config, use_short_repeated_primitives=True)) 15 | 16 | 17 | def travel_layers(mod, name_path=None): 18 | layers = [] 19 | if name_path is None: 20 | name_path = mod.__class__.__qualname__ 21 | 22 | if len(list(mod.children())) == 0: 23 | if isinstance(mod, torch.nn.Dropout): 24 | return [] 25 | 26 | _name_path = f"{name_path}.{mod.__class__.__qualname__}" 27 | setattr(mod, "__qualname__", _name_path) 28 | 29 | return [mod] 30 | else: 31 | for i, (name, child) in enumerate(mod.named_children()): 32 | _name_path = f"{name_path}.{child.__class__.__qualname__}{i}" 33 | 34 | layers += travel_layers(child, _name_path) 35 | 36 | return layers 37 | 38 | def get_module_size(module, ignore_cuda=False): 39 | size = 0 40 | for key, parm in module._parameters.items(): 41 | if parm is not None: 42 | if ignore_cuda is True and parm.is_cuda: 43 | continue 44 | size += np.prod(np.array(parm.size())) * 4  # 4 bytes per element (assumes fp32) 45 | for key, buf in module._buffers.items(): 46 | if buf is not None: 47 | if ignore_cuda is True and buf.is_cuda: 48 | continue 49 | size += np.prod(np.array(buf.size())) * 4  # 4 bytes per element (assumes fp32) 50 | 51 | for child in module.children(): 52 | size += get_module_size(child, ignore_cuda) 53
| 54 | return size 55 | --------------------------------------------------------------------------------
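
As a closing reference, here is a minimal usage sketch for the helpers in `util.py`. It is illustrative only: the `torchvision` ResNet-50 stands in for whatever model you actually trace, and it assumes `torchvision` is installed and the script runs from the repository root so that `util.py` is importable.

```python
# Illustrative sketch (assumes torchvision is installed and the working
# directory is the repository root, so that util.py is importable).
import torchvision.models as tvm

from util import travel_layers, get_module_size

model = tvm.resnet50()

# travel_layers() flattens the module tree into its leaf layers
# (skipping Dropout) and tags each one with a dotted "__qualname__" path.
layers = travel_layers(model)
print(len(layers), "leaf layers; first:", layers[0].__qualname__)

# get_module_size() sums parameter and buffer sizes in bytes,
# assuming 4 bytes (fp32) per element.
print(f"total size: {get_module_size(model) / (1 << 20):.1f} MiB")
```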