├── .gitignore ├── ARCHITECTURE.md ├── DEBUGGING.md ├── INSTALL.md ├── LICENSE ├── PROFILE.md ├── README.md ├── artifact_evaluation ├── README.md ├── example │ ├── README.md │ ├── config.json │ └── resnet50_4_fwd ├── fig10 │ ├── client_0.json │ ├── config_files │ │ ├── bert_mnet.json │ │ ├── bert_rnet.json │ │ ├── ideal │ │ │ ├── mnet_inf.json │ │ │ └── rnet_inf.json │ │ ├── mnet_mnet.json │ │ ├── mnet_rnet.json │ │ ├── mps │ │ │ ├── config.yaml │ │ │ └── run.py │ │ ├── rnet101_mnet.json │ │ ├── rnet101_rnet.json │ │ ├── rnet_mnet.json │ │ ├── rnet_rnet.json │ │ ├── trans_mnet.json │ │ └── trans_rnet.json │ ├── gather_results.py │ ├── inter_arrival_times.json │ ├── plot_latency.py │ ├── prep_dirs.sh │ ├── run_ideal.py │ ├── run_orion.py │ └── run_reef.py └── fig7 │ ├── config_files │ ├── bert_mnet.json │ ├── bert_rnet.json │ ├── ideal │ │ ├── bert_train.json │ │ ├── mnet_inf.json │ │ ├── mnet_train.json │ │ ├── rnet101_train.json │ │ ├── rnet_inf.json │ │ ├── rnet_train.json │ │ └── trans_train.json │ ├── mnet_mnet.json │ ├── mnet_rnet.json │ ├── mps │ │ ├── config.yaml │ │ ├── eval-resnet50train-resnet50-1.log │ │ ├── eval-resnet50train-resnet50-1.log.json │ │ ├── gen_conf_eval-resnet50train-resnet50.yaml │ │ └── run.py │ ├── rnet101_mnet.json │ ├── rnet101_rnet.json │ ├── rnet_mnet.json │ ├── rnet_rnet.json │ ├── trans_mnet.json │ └── trans_rnet.json │ ├── gather_latency.py │ ├── gather_throughput.py │ ├── kernel_files │ ├── mobilenetv2_4_fwd │ ├── mobilenetv2_64_fb0 │ ├── mobilenetv2_64_fb1 │ ├── resnet101_32_fb0 │ ├── resnet101_32_fb1 │ ├── resnet101_4_fwd │ ├── resnet50_32_fb0 │ ├── resnet50_32_fb1 │ └── resnet50_4_fwd │ ├── plot_latency.py │ ├── plot_throughput.py │ ├── prep_dirs.sh │ ├── run_ideal.py │ ├── run_orion.py │ └── run_reef.py ├── benchmarking ├── be.json ├── benchmark_suite │ ├── bert_trainer_mock.py │ ├── bert_trainer_mock_torch.py │ ├── compute_optimal.py │ ├── conv_trainer.py │ ├── examples │ │ ├── basic_config_bert.json │ │ ├── basic_config_transformer.json │ │ └── basic_config_vision.json │ ├── extract_meas.py │ ├── toy_models │ │ ├── bnorm_trainer.py │ │ └── conv_bn_trainer.py │ ├── train_imagenet.py │ ├── train_imagenet_torch.py │ ├── transformer_trainer.py │ ├── transformer_trainer_torch.py │ └── utility_scripts │ │ ├── check_unknown.py │ │ ├── compute_average.py │ │ ├── download_imagenet.sh │ │ └── get_avg.py ├── hp.json ├── launch_jobs.py ├── model_kernels │ ├── bert_2_fwd │ ├── bert_8_fb0 │ ├── bert_8_fb1 │ ├── mobilenetv2_32_fb0 │ ├── mobilenetv2_32_fb1 │ ├── mobilenetv2_4_fwd │ ├── mobilenetv2_64_fb0 │ ├── mobilenetv2_64_fb1 │ ├── mobilenetv2_96_fb0 │ ├── mobilenetv2_96_fb1 │ ├── resnet101_32_fb0 │ ├── resnet101_32_fb1 │ ├── resnet101_4_fwd │ ├── resnet50_32_fb0 │ ├── resnet50_32_fb1 │ ├── resnet50_4_fwd │ ├── transformer_xl_4_fwd │ ├── transformer_xl_8_fb0 │ └── transformer_xl_8_fb1 ├── multi_client_example.json └── scripts │ ├── run.sh │ ├── run_squad_test.py │ └── run_traces.py ├── compile.sh ├── orion_architecture.png ├── profiling ├── benchmarks │ ├── bert.py │ ├── bnorm.py │ ├── conv.py │ ├── conv_bnorm.py │ ├── gnmt.py │ ├── retinanet.py │ ├── transformer.py │ └── vision_models.py └── postprocessing │ ├── generate_file.py │ ├── get_num_blocks.py │ ├── process_ncu.py │ ├── process_nsys.py │ ├── profiles │ ├── bert_2_fwd_new │ ├── efficientnet_4_fwd_new │ ├── mobilenetv2_4_fwd_new │ ├── resnet101_4_fwd_new │ ├── resnet50_32_fb1_new │ ├── resnet50_4_fwd_new │ ├── retinanet_4_fwd_new │ └── transformer_4_fwd_new │ └── roofline_analysis.py ├── related ├── 
Tick-Tock │ └── test.json └── baselines │ ├── README.md │ ├── bert │ ├── __init__.py │ ├── modeling.py │ ├── optimization.py │ ├── schedulers.py │ ├── squad_example.py │ ├── tokenization.py │ └── train_bert_on_squad.py │ ├── config.yaml │ ├── dcgan │ ├── __init__.py │ ├── dcgan.py │ └── train_dcgan.py │ ├── gnmt │ ├── __init__.py │ ├── seq2seq │ │ ├── data │ │ │ ├── config.py │ │ │ ├── dataset.py │ │ │ ├── sampler.py │ │ │ └── tokenizer.py │ │ ├── gpu_affinity.py │ │ ├── inference │ │ │ ├── beam_search.py │ │ │ ├── tables.py │ │ │ └── translator.py │ │ ├── models │ │ │ ├── attention.py │ │ │ ├── decoder.py │ │ │ ├── encoder.py │ │ │ ├── gnmt.py │ │ │ └── seq2seq_base.py │ │ ├── train │ │ │ ├── fp_optimizers.py │ │ │ ├── lr_scheduler.py │ │ │ ├── smoothing.py │ │ │ ├── table.py │ │ │ └── trainer.py │ │ └── utils.py │ └── train_gnmt.py │ ├── inter_arrival_times.json │ ├── main.py │ ├── nasnet │ ├── __init__.py │ ├── nasnet.py │ ├── nasnet_mobile.py │ └── train_nasnet.py │ ├── requirements.txt │ ├── retinanet │ ├── __init__.py │ ├── coco_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── anchor_utils.py │ │ ├── backbone_utils.py │ │ ├── boxes.py │ │ ├── feature_pyramid_network.py │ │ ├── focal_loss.py │ │ ├── image_list.py │ │ ├── resnet.py │ │ ├── retinanet.py │ │ ├── roi_heads.py │ │ ├── transform.py │ │ └── utils.py │ ├── presets.py │ ├── train_retinanet.py │ └── transforms.py │ ├── run.py │ ├── run_wrapper.sh │ ├── start_MPS_control_daemon.sh │ ├── stop_MPS_control_daemon.sh │ ├── transformer │ ├── __init__.py │ ├── data_utils.py │ ├── lamb.py │ ├── mem_transformer.py │ ├── train_transformer.py │ ├── transformer_consts.yaml │ └── transformer_utils │ │ ├── __init__.py │ │ ├── log_uniform_sampler.py │ │ ├── proj_adaptive_softmax.py │ │ └── vocabulary.py │ ├── utils │ ├── __init__.py │ ├── data_manager.py │ ├── sync_control.py │ └── sync_info.py │ └── vision │ ├── __init__.py │ └── train_imagenet.py ├── setup.py ├── setup ├── Dockerfile ├── README.md ├── install.sh ├── nvidia_deeplearning_changes.patch └── orion-torch-changes.patch └── src ├── cuda_capture ├── Makefile ├── README.md ├── intercept_cublas.cpp ├── intercept_cudnn.cpp ├── intercept_temp.cpp ├── intercept_temp.h └── utils_interc.cpp ├── scheduler ├── Makefile ├── scheduler.h ├── scheduler_eval.cpp ├── utils_sched.cpp └── utils_sched.h ├── scheduler_frontend.py └── system_utils.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.sqlite 3 | *.qdrep 4 | *.pyc 5 | *.so 6 | *.o 7 | *.ncu-rep 8 | *.svg 9 | *.png 10 | .DS_Store 11 | .idea/ 12 | __pycache__/ 13 | orion.egg-info/ 14 | benchmarking/examples/* 15 | benchmarking/eval/* 16 | benchmarking/results/* 17 | results 18 | -------------------------------------------------------------------------------- /ARCHITECTURE.md: -------------------------------------------------------------------------------- 1 | The Orion system is depicted in the following image: 2 | 3 | ![Orion architecture](orion_architecture.png) 4 | 5 | CUDA/CUDNN/CUBLAS calls are intercepted and submitted into software queues managed by the scheduler. 6 | Each submitted workload is profiled before being run, and the resource profiles of each operator are given as inputs to the scheduler. As depicted in the image, Orion currently supports one high-priority client and multiple best-effort clients. 7 | 8 | ### Scheduling Policy 9 | 10 | The scheduler polls for new operations from the clients.
If an operator from a high-priority client is found, it is submitted directly to the GPU. 11 | If an operator from a best-effort client is found, Orion submits it based on its resource profile, the number of SMs it needs, and the duration of on-the-fly best-effort kernels; a simplified sketch of this decision logic is shown below.
-------------------------------------------------------------------------------- /DEBUGGING.md: -------------------------------------------------------------------------------- 1 | ### For CUDNN debugging: 2 | * export CUDNN_LOGDEST_DBG=stdout 3 | * export CUDNN_LOGINFO_DBG=1 4 | 5 | ### For CUBLAS debugging: 6 | * export CUBLAS_LOGDEST_DBG=stdout 7 | * export CUBLAS_LOGINFO_DBG=1 8 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ### Use Docker image 2 | 3 | We have set up a Docker image: [fotstrt/orion-ae](https://hub.docker.com/repository/docker/fotstrt/orion-ae/general) with all packages pre-installed. We assume NVIDIA drivers are installed on the host machine, and that Docker containers can use the host machine's GPUs. 4 | 5 | * Start a container with `docker run --gpus=1 -it fotstrt/orion-ae:v1 bash` 6 | * Download the Orion repo and install: 7 | * `git clone https://github.com/eth-easl/orion.git` 8 | * `cd orion` 9 | * `bash compile.sh` 10 | * `pip install -e .` 11 | 12 | 13 | ### Without Docker image 14 | 15 | To use Orion without our pre-built image, a user must install: 16 | * [NVIDIA CUDA](https://developer.nvidia.com/cuda-toolkit). We have tested Orion with CUDA 10.2 and CUDA 11.3. 17 | * (optionally) [NVIDIA CUDNN](https://developer.nvidia.com/cudnn) 18 | * PyTorch (from source) + TorchVision 19 | * Download the Orion repo and install: 20 | * `git clone https://github.com/eth-easl/orion.git` 21 | * `cd orion` 22 | * `bash compile.sh` 23 | * `pip install -e .` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 - present | ETH Zurich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PROFILE.md: -------------------------------------------------------------------------------- 1 | ## Instructions on kernel-level analysis with NVIDIA Nsight and PyTorch 2 | 3 | ### Notes: 4 | 1. The locations of `nsys` and `nsight-cu-cli` may vary from those shown in this guide. 5 | 2. This guide assumes the user has set up a `script.py` to profile. 6 | 7 | ### Profiling 8 | 1. Set up NVTX markers for NCU: Use `torch.cuda.nvtx.range_push("start")` and `torch.cuda.nvtx.range_pop()` around the region to profile. 9 | 2. Set up profiler start/stop hooks for NSYS: Use `torch.cuda.profiler.cudart().cudaProfilerStart()` and `torch.cuda.profiler.cudart().cudaProfilerStop()` around the region to profile (see the example skeleton after this list). 10 | 3. Enable NSYS profiling: `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'` 11 | 4. Profile with NCU: `sudo /opt/nvidia/nsight-compute/2021.2.0/nv-nsight-cu-cli -o output_ncu --set detailed --nvtx --nvtx-include "start/" python3 script.py` 12 | 5. Profile with NCU in CSV: `sudo /opt/nvidia/nsight-compute/2021.2.0/nv-nsight-cu-cli --csv --set detailed --nvtx --nvtx-include "start/" python3 script.py > output_ncu.csv` 13 | 6. Profile with NSYS: `nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas -s none -o output_nsys --cudabacktrace=true --capture-range=cudaProfilerApi --stop-on-range-end=true -f true -x true python3 script.py` 14 | 7. Convert NSYS output to CSV: `nsys stats --report gputrace --format csv,column --output .,- output_nsys.qdrep`
15 | 16 | At this point, 4 files should have been generated: 17 | * `output_ncu.ncu-rep` 18 | * `output_ncu.csv` 19 | * `output_nsys.qdrep` 20 | * `output_nsys_gputrace.csv` 21 | 22 | Using Nsight Compute, open the `output_ncu.ncu-rep` file, and export the raw CSV file as `raw_ncu.csv`. 23 | 24 | 25 | ### Extracting resource utilization info 26 | Extract the required information from the profiling files: 27 | * `python profiling/postprocessing/process_ncu.py --results_dir <results_dir>` 28 | 29 | If the `output_ncu.csv` file contains any program logs that do not conform to the `.csv` format, this command might throw errors. 30 | 31 | Make sure the file is in a correct `.csv` format: depending on the NVIDIA CUDA version and the type of profiling, the first line should look like this: 32 | 33 | `"ID","Process ID","Process Name","Host Name","thread Domain:Push/Pop_Range:PL_Type:PL_Value:CLR_Type:Color:Msg_Type:Msg","Id:Domain:Start/Stop_Range:PL_Type:PL_Value:CLR_Type:Color:Msg_Type:Msg","Kernel Name","Kernel Time","Context","Stream","Section Name","Metric Name","Metric Unit","Metric Value","Rule Name","Rule Type","Rule Description"` 34 | 35 | 36 | * `python profiling/postprocessing/get_num_blocks.py --results_dir <results_dir> --max_threads_sm <max_threads_sm> --max_blocks_sm <max_blocks_sm> --max_shmem_sm <max_shmem_sm> --max_regs_sm <max_regs_sm>` 37 | 38 | You can find the maximum number of threads, blocks, shared memory, and registers per SM in the GPU's architecture description. 39 | By default, `get_num_blocks.py` is configured for the [NVIDIA Tesla V100 GPU](https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf). 40 | 41 | * `python profiling/postprocessing/roofline_analysis.py --results_dir <results_dir> --ai_threshold <ai_threshold>` 42 | 43 | Note that `ai_threshold` stands for the 'knee' arithmetic intensity of the roofline plot taken from the Nsight Compute tool, and might differ for each GPU. 44 | 45 | After these steps, an `output_ncu_sms_roofline.csv` file should have been generated. 46 | 47 | ### (Optional) Plot traces 48 | You can use the `profiling/postprocessing/process_nsys.py` script to generate plots of resource utilization over time. 49 | * `python profiling/postprocessing/process_nsys.py --results_dir <results_dir> --max_sms <max_sms> --metric <metric>` 50 | 51 | ### Postprocessing to convert to a kernel info file for Orion to use 52 | This step reads the profiling file and keeps the information needed for each kernel (number of SMs, resource profile, duration). 53 | It also groups kernels into operators, e.g. if a CUDNN convolution operator consists of 2 kernels, it groups them into one operator. 54 | * `python profiling/postprocessing/generate_file.py --input_file_name <input_file> --output_file_name <output_file> --model_type <model_type>` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Orion 2 | 3 | Orion is a fine-grained scheduler for interference-free GPU sharing across ML workloads. It is based on our EuroSys'24 paper "Orion: Interference-aware, Fine-grained GPU Sharing for ML Applications". 4 | 5 | ## Table of Contents 6 | - [Introduction](#introduction) 7 | - [Example](#example) 8 | - [Project Structure](#project-structure) 9 | - [Hardware Requirements](#hardware-requirements) 10 | - [Hardware Configuration used in the paper](#hardware-configuration-used-in-the-paper) 11 | - [Installation](#installation) 12 | - [Debugging](#debugging) 13 | - [Paper](#paper) 14 | 15 | ## Introduction 16 | 17 | Orion is a fine-grained, interference-free scheduler for GPU sharing across ML workloads. We assume one of the clients is high-priority, while the rest of the clients are best-effort. 18 | 19 | Orion intercepts CUDA, CUDNN, and CUBLAS calls and submits them into software queues. 20 | The _Scheduler_ polls these queues and schedules operations based on their resource requirements and their priority. See [ARCHITECTURE](ARCHITECTURE.md) for more details on the system and the scheduling policy. 21 | 22 | Orion expects each submitted job to have a file listing all of its operations, along with their profiles and Streaming Multiprocessor (SM) requirements. See [PROFILE](PROFILE.md) for detailed instructions on how to profile a client application and how to generate the profile files. 23 | 24 | ## Example 25 | 26 | We have set up a Docker image: [fotstrt/orion-ae](https://hub.docker.com/repository/docker/fotstrt/orion-ae/general) with all packages pre-installed. 27 | Alternatively, follow the instructions in the 'setup' directory, and check [INSTALL](INSTALL.md), to install Orion and its dependencies. 28 | 29 | See [PROFILE](PROFILE.md) to generate profiling files for each workload. 30 | Create a JSON file containing all the info for the workloads that are about to share the GPU. See examples under 'artifact_evaluation/example'. 31 | 32 | The file 'launch_jobs.py' is responsible for spawning the scheduler and the application thread(s). 33 | 34 | ## Project Structure 35 | ``` 36 | > tree .
├── profiling # Scripts and instructions for profiling 38 | │ ├── benchmarks # Scripts of DNN models for profiling 39 | │ └── postprocessing # Scripts for processing of profile files 40 | ├── src # Source code 41 | │ ├── cuda_capture # Code to intercept CUDA/CUDNN/CUBLAS calls 42 | │ ├── scheduler # Implementation of the scheduling policy 43 | │ └── scheduler_frontend.py # Python interface for the Orion scheduler 44 | ├── benchmarking # Scripts and configuration files for benchmarking 45 | │ ├── benchmark_suite # Training and inference scripts 46 | │ └── model_kernels # Files containing profile information for the submitted models 47 | ├── related # Some of the related baselines: MPS, Streams, Tick-Tock 48 | ├── artifact_evaluation # Scripts and instructions for artifact evaluation 49 | │ ├── example # Basic example to test Orion functionality 50 | │ ├── fig7 # Scripts to reproduce Figure 7 of the paper 51 | │ └── fig10 # Scripts to reproduce Figure 10 of the paper 52 | └── setup # Instructions and scripts to install Orion's prerequisites 53 | ``` 54 | 55 | ## Hardware Requirements 56 | Orion currently supports NVIDIA GPUs. 57 | 58 | ## Hardware Configuration used in the paper 59 | For the experiments presented in the paper, we evaluated Orion on Google Cloud Platform VMs with the following configurations: 60 | * n1-standard-8 VM (8 vCPUs, 30GB of DRAM) with a V100-16GB GPU and CUDA 10.2 61 | * a2-highgpu-1g VM (12 vCPUs, 85GB of DRAM) with an A100-40GB GPU and CUDA 11.3 62 | 63 | In both cases, the machines run Ubuntu 18.04. 64 | 65 | ## Installation 66 | See [INSTALL](INSTALL.md). 67 | 68 | ## Debugging 69 | See [DEBUGGING](DEBUGGING.md). 70 | 71 | ## Paper 72 | If you use Orion, please cite our paper: 73 | ```bibtex 74 | @inproceedings{eurosys24orion, 75 | author = {Strati, Foteini and Ma, Xianzhe and Klimovic, Ana}, 76 | title = {Orion: Interference-aware, Fine-grained GPU Sharing for ML Applications}, 77 | year = {2024}, 78 | isbn = {9798400704376}, 79 | publisher = {Association for Computing Machinery}, 80 | address = {New York, NY, USA}, 81 | url = {https://doi.org/10.1145/3627703.3629578}, 82 | doi = {10.1145/3627703.3629578}, 83 | booktitle = {Proceedings of the Nineteenth European Conference on Computer Systems}, 84 | pages = {1075–1092}, 85 | numpages = {18}, 86 | keywords = {GPUs, Machine Learning}, 87 | location = {Athens, Greece}, 88 | series = {EuroSys '24} 89 | } 90 | ``` 91 | -------------------------------------------------------------------------------- /artifact_evaluation/example/README.md: -------------------------------------------------------------------------------- 1 | This is a simple example to check that Orion has been installed correctly and can run. 2 | 3 | Please follow the instructions in [INSTALL](INSTALL.md) to start a container with our image.
4 | Then start the Orion process (server and client) by running: 5 | * `cd /root/orion/benchmarking` 6 | * `LD_PRELOAD="/root/orion/src/cuda_capture/libinttemp.so" python launch_jobs.py /root/orion/artifact_evaluation/example/config.json 1 1 1` -------------------------------------------------------------------------------- /artifact_evaluation/example/config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 2000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 30, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | } 16 | ] 17 | -------------------------------------------------------------------------------- /artifact_evaluation/fig10/client_0.json: -------------------------------------------------------------------------------- 1 | {"p50_latency": 19.11342144012451, "p95_latency": 25.438904762268066, "p99_latency": 104.0643930435141, "throughput": 19.992602003050518} -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/bert_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_2_fwd", 5 | "num_kernels": 572, 6 | "num_iters": 2500000, 7 | "args": { 8 | "batchsize": 2, 9 | "rps": 8, 10 | "uniform": true, 11 | "dummy_data": true, 12 | "train": false 13 | } 14 | }, 15 | { 16 | "arch": "mobilenet_v2", 17 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 18 | "num_kernels": 152, 19 | "num_iters": 6240, 20 | "args": { 21 | "model_name": "mobilenet_v2", 22 | "batchsize": 4, 23 | "rps": 0, 24 | "uniform": true, 25 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | } 30 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/bert_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_2_fwd", 5 | "num_kernels": 572, 6 | "num_iters": 2500000, 7 | "args": { 8 | "batchsize": 2, 9 | "rps": 8, 10 | "uniform": true, 11 | "dummy_data": true, 12 | "train": false 13 | } 14 | }, 15 | { 16 | "arch": "resnet50", 17 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 18 | "num_kernels": 175, 19 | "num_iters": 6240, 20 | "args": { 21 | "model_name": "resnet50", 22 | "batchsize": 4, 23 | "rps": 0, 24 | "uniform": true, 25 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | } 30 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/ideal/mnet_inf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 5 | "num_kernels": 152, 6 | "num_iters": 6240, 7 | "args": { 8 | "model_name": "mobilenet_v2", 9 | "batchsize": 4, 10 | "rps": 0, 11 | "uniform": false, 12 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 13 | 
"dummy_data": true, 14 | "train": false 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/ideal/rnet_inf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 6240, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "dummy_data": true, 11 | "rps": 0, 12 | "uniform": false, 13 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 14 | "train": false 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/mnet_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 5 | "num_kernels": 152, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "mobilenet_v2", 9 | "batchsize": 4, 10 | "rps": 100, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "mobilenet_v2", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 19 | "num_kernels": 152, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "mobilenet_v2", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/mnet_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 5 | "num_kernels": 152, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "mobilenet_v2", 9 | "batchsize": 4, 10 | "rps": 100, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "resnet50", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 19 | "num_kernels": 175, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "resnet50", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/mps/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | policy: "MPS" # "MPS", "TickTock", "Streams", "Isolated", or "Sequential" 3 | models: 4 | model0: 5 | mode: eval # train or eval 6 | name: bert # these two names should strictly correspond to the model names below 7 | model1: 8 | mode: eval # train or eval 9 | name: mobilenet_v2 10 | shared_config: 11 | distribution: trace # poisson, uniform, or trace 12 | trace_path: '../../inter_arrival_times.json' # only used when distribution is trace 13 | pin_memory: true 14 | seed: 42 15 | 16 | # configuration for each model 17 | resnet50: 18 | arch: resnet50 19 | batch_size: 4 20 | num_iterations: 1000000 21 | request_rate: 80 # measured in 1/seconds. 
If 0 it means no sleep 22 | resnet101: 23 | arch: resnet101 24 | batch_size: 4 25 | num_iterations: 1000000 26 | request_rate: 40 # measured in 1/seconds. If 0 it means no sleep 27 | mobilenet_v2: 28 | arch: mobilenet_v2 29 | batch_size: 4 30 | num_iterations: 1000000 31 | request_rate: 100 # measured in 1/seconds. If 0 it means no sleep 32 | bert: 33 | batch_size: 2 34 | arch: large # either base or large 35 | num_iterations: 1000000 36 | request_rate: 8 # measured in 1/seconds. If 0 it means no sleep 37 | # large_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16' 38 | # base_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-12_H-768_A-12' 39 | transformer: 40 | arch: base # either base or large 41 | batch_size: 4 42 | num_iterations: 1000000 43 | request_rate: 20 # measured in 1/seconds. If 0 it means no sleep -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/mps/run.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import itertools 3 | import logging 4 | import os 5 | 6 | mnames = { 7 | 'resnet50': "ResNet50", 8 | 'mobilenet_v2': "MobileNetV2", 9 | 'resnet101': 'ResNet101', 10 | 'bert': 'BERT', 11 | 'transformer': 'Transformer' 12 | } 13 | 14 | def run(model0, model1, config, combination_name, times=1, start_id = 0): 15 | 16 | config_file_name = f'gen_conf_{combination_name}.yaml' 17 | 18 | logging.info(f'dump config to {config_file_name}') 19 | with open(f'./{config_file_name}', 'w') as file: 20 | yaml.dump(config, file) 21 | # run python main.py 22 | logging.info(f'training with this config {times} times') 23 | 24 | 25 | for i in range(start_id, start_id + times): 26 | log_file = f'log_{i}_{combination_name}.log' 27 | os.system(f"python3.8 {os.path.expanduser( '~' )}/orion/related/baselines/main.py --config ./{config_file_name}") 28 | print(f"{combination_name}.log.json") 29 | os.system(f"cp {combination_name}.log.json ../../results/mps/{mnames[model0]}_{mnames[model1]}_{i}.json") 30 | 31 | 32 | 33 | if __name__ == "__main__": 34 | logging.basicConfig( 35 | level=logging.INFO, 36 | format='%(asctime)s %(levelname)-8s: [%(filename)s:%(lineno)d] %(message)s', 37 | datefmt='%d/%m/%Y %H:%M:%S', 38 | handlers=[ 39 | # output to console 40 | logging.StreamHandler(), 41 | ] 42 | ) 43 | with open('./config.yaml', 'r') as file: 44 | default_full_config = yaml.load(file, Loader=yaml.FullLoader) 45 | 46 | # ----configuration region started---- 47 | model0_mode = 'eval' 48 | model1_mode = 'eval' 49 | 50 | policy = 'MPS' 51 | 52 | train_batch_sizes = { 53 | 'resnet50': 32, 54 | 'mobilenet_v2': 64, 55 | 'resnet101': 32, 56 | 'bert': 8, 57 | 'transformer': 8 58 | } 59 | 60 | eval_batch_sizes = { 61 | 'resnet50': 4, 62 | 'mobilenet_v2': 4, 63 | 'resnet101': 4, 64 | 'bert': 2, 65 | 'transformer': 4 66 | } 67 | 68 | 69 | models = ['resnet50', 'mobilenet_v2', 'resnet101', 'bert', 'transformer'] 70 | combinations = itertools.product(models[:2], models) 71 | times = 3 72 | start_id = 0 73 | distribution = 'trace' 74 | 75 | 76 | # ----configuration region ended---- 77 | 78 | default_full_config['shared_config']['distribution'] = distribution 79 | 80 | for model0, model1 in combinations: 81 | default_full_config['models']['model0']['name'] = model0 82 | default_full_config['models']['model0']['mode'] = model0_mode 83 | 
default_full_config['models']['model1']['name'] = model1 84 | default_full_config['models']['model1']['mode'] = model1_mode 85 | default_full_config['policy'] = policy 86 | 87 | combination_name = f'{model0_mode}-{model0}{model1_mode}-{model1}' 88 | run(model0, model1, default_full_config, combination_name, times=times, start_id=start_id) -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/rnet101_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_4_fwd", 5 | "num_kernels": 345, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "resnet101", 9 | "batchsize": 4, 10 | "rps": 40, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "mobilenet_v2", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 19 | "num_kernels": 152, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "mobilenet_v2", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/rnet101_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_4_fwd", 5 | "num_kernels": 345, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "resnet101", 9 | "batchsize": 4, 10 | "rps": 40, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "resnet50", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 19 | "num_kernels": 175, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "resnet50", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/rnet_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 80, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "mobilenet_v2", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 19 | "num_kernels": 152, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "mobilenet_v2", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/rnet_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": 
"/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 80, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "resnet50", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 19 | "num_kernels": 175, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "resnet50", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/trans_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_4_fwd", 5 | "num_kernels": 459, 6 | "num_iters": 2500000, 7 | "args": { 8 | "batchsize": 4, 9 | "rps": 20, 10 | "uniform": true, 11 | "dummy_data": true, 12 | "train": false 13 | } 14 | }, 15 | { 16 | "arch": "mobilenet_v2", 17 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 18 | "num_kernels": 152, 19 | "num_iters": 6240, 20 | "args": { 21 | "model_name": "mobilenet_v2", 22 | "batchsize": 4, 23 | "rps": 0, 24 | "uniform": true, 25 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | } 30 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/trans_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_4_fwd", 5 | "num_kernels": 459, 6 | "num_iters": 2500000, 7 | "args": { 8 | "batchsize": 4, 9 | "rps": 20, 10 | "uniform": true, 11 | "dummy_data": true, 12 | "train": false 13 | } 14 | }, 15 | { 16 | "arch": "resnet50", 17 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 18 | "num_kernels": 175, 19 | "num_iters": 6240, 20 | "args": { 21 | "model_name": "resnet50", 22 | "batchsize": 4, 23 | "rps": 0, 24 | "uniform": false, 25 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | } 30 | 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/gather_results.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import json 5 | import itertools 6 | 7 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 8 | baselines = ['reef', 'orion', 'mps', 'ideal'] 9 | 10 | hp_list = ['ResNet50', 'MobileNetV2'] 11 | be_list = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 12 | num_runs = 3 13 | 14 | df_ideal = pd.DataFrame("0", index=models, columns=models) 15 | for hp in hp_list: 16 | results = [] 17 | for run in range(num_runs): 18 | input_file = f"results/ideal/{hp}_{run}_hp.json" 19 | with open(input_file, 'r') as f: 20 | data = json.load(f) 21 | results.append(float(data['p95_latency'])) 22 | 23 | for be in be_list: 24 | df_ideal.at[be, hp] = 
f"{round(np.average(results),2)}/{round(np.std(results),2)}" 25 | df_ideal.to_csv(f'results/ideal_latency.csv') 26 | print("ideal") 27 | print(df_ideal) 28 | 29 | # mps 30 | df_mps = pd.DataFrame("0", index=models, columns=models) 31 | # gather MPS results for every (be, hp) pair 32 | for be, hp in itertools.product(be_list, hp_list): 33 | results = [] 34 | for run in range(num_runs): 35 | input_file = f"results/mps/{hp}_{be}_{run}.json" 36 | with open(input_file, 'r') as f: 37 | data = json.load(f) 38 | results.append(float(data['p95-latency-0'])) 39 | df_mps.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 40 | df_mps.to_csv(f'results/mps_latency.csv') 41 | print("mps") 42 | print(df_mps) 43 | 44 | for baseline in baselines[:-2]: 45 | df = pd.DataFrame("0", index=models, columns=models) 46 | for be,hp in itertools.product(be_list, hp_list): 47 | results = [] 48 | for run in range(num_runs): 49 | input_file = f"results/{baseline}/{be}_{hp}_{run}_hp.json" 50 | with open(input_file, 'r') as f: 51 | data = json.load(f) 52 | results.append(float(data['p95_latency'])) 53 | df.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 54 | df.to_csv(f'results/{baseline}_latency.csv') 55 | print(baseline) 56 | print(df) 57 | -------------------------------------------------------------------------------- /artifact_evaluation/fig10/plot_latency.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 6 | 7 | # %% 8 | 9 | def get_data(csv_file, error=False): 10 | df = pd.read_csv(csv_file) 11 | df = df.drop(df.columns[0], axis=1) 12 | df.index = models 13 | 14 | df = df.drop(df.columns[-3], axis=1) 15 | df = df.drop(df.columns[-2], axis=1) 16 | df = df.drop(df.columns[-1], axis=1) 17 | 18 | for model_row in models: 19 | for model_col in models[:2]: 20 | cell = df.at[model_row, model_col] 21 | df.at[model_row, model_col] = float(cell.split('/')[0]) #float(cell.split('/')[1]) if error else float(cell.split('/')[0]) 22 | if error: 23 | return df.std() 24 | else: 25 | return df.mean() 26 | 27 | # %% 28 | method2file = { 29 | 'MPS': 'results/mps_latency.csv', 30 | 'REEF policy': 'results/reef_latency.csv', 31 | 'Orion': 'results/orion_latency.csv', 32 | 'Ideal': 'results/ideal_latency.csv' 33 | } 34 | 35 | label_font_size = 22 36 | methods = list(method2file.keys()) 37 | 38 | method2data = {} 39 | method2err = {} 40 | 41 | for method, file in method2file.items(): 42 | method2data[method] = get_data(file) 43 | method2err[method] = get_data(file, error=True) 44 | 45 | width = 0.15 46 | fig, ax = plt.subplots(figsize=(14, 8)) 47 | x = np.arange(2) 48 | bars = [] 49 | for method_id, method in enumerate(methods): 50 | 51 | bar = ax.bar( 52 | x + width * method_id, method2data[method], width, 53 | label=method, yerr=method2err[method], 54 | align='edge' 55 | ) 56 | bars.append(bar) 57 | 58 | x_tick_positions = x + width * len(methods) / 2 59 | ax.set_xticks( 60 | ticks=x_tick_positions, 61 | labels=models[:2], fontsize=22 62 | ) 63 | plt.yticks(fontsize=22) 64 | ax.set_ylabel('Average p95 inference latency (ms)', fontsize=label_font_size) 65 | ax.set_xlabel('High-priority inference job', fontsize=label_font_size) 66 | 67 | plt.tight_layout() 68 | handles, labels = ax.get_legend_handles_labels() 69 | plt.legend(handles, labels, loc='upper left', ncol=1, fontsize=20) 70 | 71 | plt.savefig("fig10.png",
bbox_inches="tight") 72 | -------------------------------------------------------------------------------- /artifact_evaluation/fig10/prep_dirs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir results 4 | mkdir results/ideal 5 | mkdir results/reef 6 | mkdir results/orion 7 | mkdir results/mps -------------------------------------------------------------------------------- /artifact_evaluation/fig10/run_ideal.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files_hp = [ 6 | ("ResNet50", "rnet"), 7 | ("MobileNetV2", "mnet"), 8 | ] 9 | 10 | for (model, f) in trace_files_hp: 11 | for run in range(num_runs): 12 | print(model, run, flush=True) 13 | # run 14 | file_path = f"config_files/ideal/{f}_inf.json" 15 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path}") 16 | 17 | # copy results 18 | os.system(f"cp client_0.json results/ideal/{model}_{run}_hp.json") 19 | os.system("rm client_0.json") -------------------------------------------------------------------------------- /artifact_evaluation/fig10/run_orion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files = [ 6 | ("ResNet50", "ResNet50", "rnet_rnet", 160000), 7 | ("ResNet50", "MobileNetV2", "rnet_mnet", 100000), 8 | ("MobileNetV2", "ResNet50", "mnet_rnet", 160000), 9 | ("MobileNetV2", "MobileNetV2", "mnet_mnet", 100000), 10 | ("ResNet101", "ResNet50", "rnet101_rnet", 160000), 11 | ("ResNet101", "MobileNetV2", "rnet101_mnet", 100000), 12 | ("BERT", "ResNet50", "bert_rnet", 160000), 13 | ("BERT", "MobileNetV2", "bert_mnet", 100000), 14 | ("Transformer", "ResNet50", "trans_rnet", 160000), 15 | ("Transformer", "MobileNetV2", "trans_mnet", 100000), 16 | ] 17 | 18 | for (be, hp, f, max_be_duration) in trace_files: 19 | for run in range(num_runs): 20 | print(be, hp, run, flush=True) 21 | # run 22 | file_path = f"config_files/{f}.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path} --orion_max_be_duration {max_be_duration}") 24 | 25 | # copy results 26 | os.system(f"cp client_1.json results/orion/{be}_{hp}_{run}_hp.json") 27 | os.system("rm -rf client_1.json") 28 | -------------------------------------------------------------------------------- /artifact_evaluation/fig10/run_reef.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files = [ 6 | ("ResNet50", "ResNet50", "rnet_rnet"), 7 | ("ResNet50", "MobileNetV2", "rnet_mnet"), 8 | ("MobileNetV2", "ResNet50", "mnet_rnet"), 9 | ("MobileNetV2", "MobileNetV2", "mnet_mnet"), 10 | ("ResNet101", "ResNet50", "rnet101_rnet"), 11 | ("ResNet101", "MobileNetV2", "rnet101_mnet"), 12 | ("BERT", "ResNet50", "bert_rnet"), 13 | ("BERT", "MobileNetV2", "bert_mnet"), 14 | ("Transformer", "ResNet50", "trans_rnet"), 15 | ("Transformer", "MobileNetV2", "trans_mnet"), 16 | ] 17 | 18 | for (be, hp, f) in trace_files: 19 | for run in range(num_runs): 20 | print(be, hp, run, flush=True) 21 | # run 22 | file_path = f"config_files/{f}.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' 
)}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo reef --config_file {file_path} --reef_depth 12") 24 | 25 | # copy results 26 | os.system(f"cp client_1.json results/reef/{be}_{hp}_{run}_hp.json") 27 | os.system("rm client_1.json") 28 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/bert_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb1", 6 | "num_kernels": 4777, 7 | "additional_num_kernels": 4777, 8 | "num_iters": 1200000, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": false, 14 | "train": true 15 | } 16 | }, 17 | { 18 | "arch": "mobilenet_v2", 19 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 20 | "num_kernels": 152, 21 | "num_iters": 12000, 22 | "args": { 23 | "model_name": "mobilenet_v2", 24 | "batchsize": 4, 25 | "rps": 40, 26 | "uniform": false, 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/bert_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb1", 6 | "num_kernels": 4777, 7 | "additional_num_kernels": 4777, 8 | "num_iters": 550000, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": false, 14 | "train": true 15 | } 16 | }, 17 | { 18 | "arch": "resnet50", 19 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 20 | "num_kernels": 175, 21 | "num_iters": 9200, 22 | "args": { 23 | "model_name": "resnet50", 24 | "batchsize": 4, 25 | "rps": 15, 26 | "uniform": false, 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/bert_train.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb1", 6 | "num_kernels": 4777, 7 | "additional_num_kernels": 4777, 8 | "num_iters": 200, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": false, 14 | "train": true 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/mnet_inf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 5 | "num_kernels": 152, 6 | "num_iters": 12000, 7 | "args": { 8 | "model_name": "mobilenet_v2", 9 | "batchsize": 4, 10 | "rps": 40, 11 | "uniform": false, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | } 16 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/mnet_train.json: 
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb1", 6 | "num_kernels": 574, 7 | "additional_num_kernels": 890, 8 | "num_iters": 200, 9 | "args": { 10 | "model_name": "mobilenet_v2", 11 | "batchsize": 64, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/rnet101_train.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb1", 6 | "num_kernels": 1219, 7 | "additional_num_kernels": 1847, 8 | "num_iters": 200, 9 | "args": { 10 | "model_name": "resnet101", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/rnet_inf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 9200, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 15, 11 | "uniform": false, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | } 16 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/rnet_train.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb1", 6 | "num_kernels": 624, 7 | "additional_num_kernels": 946, 8 | "num_iters": 200, 9 | "args": { 10 | "model_name": "resnet50", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/trans_train.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb1", 6 | "num_kernels": 4396, 7 | "additional_num_kernels": 4354, 8 | "num_iters": 200, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": true, 14 | "train": true 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mnet_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb1", 6 | 
"num_kernels": 574, 7 | "additional_num_kernels": 890, 8 | "num_iters": 1200000, 9 | "args": { 10 | "model_name": "mobilenet_v2", 11 | "batchsize": 64, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "mobilenet_v2", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 21 | "num_kernels": 152, 22 | "num_iters": 12000, 23 | "args": { 24 | "model_name": "mobilenet_v2", 25 | "batchsize": 4, 26 | "rps": 40, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mnet_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb1", 6 | "num_kernels": 574, 7 | "additional_num_kernels": 890, 8 | "num_iters": 920000, 9 | "args": { 10 | "model_name": "mobilenet_v2", 11 | "batchsize": 64, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "resnet50", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 21 | "num_kernels": 175, 22 | "num_iters": 9200, 23 | "args": { 24 | "model_name": "resnet50", 25 | "batchsize": 4, 26 | "rps": 15, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mps/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | policy: "MPS" # "MPS", "TickTock", "Streams", "Isolated", or "Sequential" 3 | models: 4 | model0: 5 | mode: eval # train or eval 6 | name: bert # these two names should strictly correspond to the model names below 7 | model1: 8 | mode: eval # train or eval 9 | name: mobilenet_v2 10 | shared_config: 11 | distribution: poisson # poisson, uniform, or trace 12 | trace_path: './inter_arrival_times.json' # only used when distribution is trace 13 | pin_memory: true 14 | seed: 42 15 | 16 | # configuration for each model 17 | resnet50: 18 | arch: resnet50 19 | batch_size: 4 20 | num_iterations: 1000000 21 | request_rate: 15 # measured in 1/seconds. If 0 it means no sleep 22 | resnet101: 23 | arch: resnet101 24 | batch_size: 32 25 | num_iterations: 1000000 26 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 27 | mobilenet_v2: 28 | arch: mobilenet_v2 29 | batch_size: 4 30 | num_iterations: 1000000 31 | request_rate: 40 # measured in 1/seconds. If 0 it means no sleep 32 | bert: 33 | batch_size: 8 34 | arch: base # either base or large 35 | num_iterations: 1000000 36 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 37 | # large_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16' 38 | # base_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-12_H-768_A-12' 39 | transformer: 40 | arch: base # either base or large 41 | batch_size: 8 42 | num_iterations: 1000000 43 | request_rate: 0 # measured in 1/seconds. 
If 0 it means no sleep 44 | 45 | resnet50-1: 46 | arch: resnet50 47 | batch_size: 32 48 | num_iterations: 1000000 49 | request_rate: 80 # measured in 1/seconds. If 0 it means no sleep 50 | mobilenet_v2-1: 51 | arch: mobilenet_v2 52 | batch_size: 64 53 | num_iterations: 1000000 54 | request_rate: 100 # measured in 1/seconds. If 0 it means no sleep 55 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mps/eval-resnet50train-resnet50-1.log.json: -------------------------------------------------------------------------------- 1 | { 2 | "duration-1": 2.927621603012085, 3 | "iterations-1": 38, 4 | "throughput-1": 9.222503335889117, 5 | "duration": 2.9278297424316406, 6 | "p50-latency-0": 64.3700361251831, 7 | "throughput-0": 32.203886399001064, 8 | "p90-latency-0": 142.8278207778931, 9 | "p95-latency-0": 156.5918684005737, 10 | "p99-latency-0": 181.01235866546628 11 | } -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mps/gen_conf_eval-resnet50train-resnet50.yaml: -------------------------------------------------------------------------------- 1 | bert: 2 | arch: base 3 | batch_size: 8 4 | num_iterations: 10000 5 | request_rate: 0 6 | mobilenet_v2: 7 | arch: mobilenet_v2 8 | batch_size: 4 9 | num_iterations: 10000 10 | request_rate: 40 11 | mobilenet_v2-1: 12 | arch: mobilenet_v2 13 | batch_size: 64 14 | num_iterations: 10000 15 | request_rate: 100 16 | models: 17 | model0: 18 | mode: eval 19 | name: resnet50 20 | model1: 21 | mode: train 22 | name: resnet50-1 23 | policy: MPS 24 | resnet101: 25 | arch: resnet101 26 | batch_size: 32 27 | num_iterations: 10000 28 | request_rate: 0 29 | resnet50: 30 | arch: resnet50 31 | batch_size: 4 32 | num_iterations: 100 33 | request_rate: 30 34 | resnet50-1: 35 | arch: resnet50 36 | batch_size: 32 37 | num_iterations: 10000 38 | request_rate: 80 39 | shared_config: 40 | distribution: poisson 41 | pin_memory: true 42 | seed: 42 43 | trace_path: ./inter_arrival_times.json 44 | transformer: 45 | arch: base 46 | batch_size: 8 47 | num_iterations: 10000 48 | request_rate: 0 49 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mps/run.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import itertools 3 | import logging 4 | import os 5 | 6 | mnames = { 7 | 'resnet50': "ResNet50", 8 | 'mobilenet_v2': "MobileNetV2", 9 | 'resnet101': 'ResNet101', 10 | 'bert': 'BERT', 11 | 'transformer': 'Transformer' 12 | } 13 | 14 | def run(model0, model1, config, combination_name, times=1, start_id = 0): 15 | 16 | config_file_name = f'gen_conf_{combination_name}.yaml' 17 | 18 | logging.info(f'dump config to {config_file_name}') 19 | with open(f'./{config_file_name}', 'w') as file: 20 | yaml.dump(config, file) 21 | # run python main.py 22 | logging.info(f'training with this config {times} times') 23 | 24 | 25 | for i in range(start_id, start_id + times): 26 | log_file = f'log_{i}_{combination_name}.log' 27 | os.system(f"python3.8 {os.path.expanduser( '~' )}/orion/related/baselines/main.py --config ./{config_file_name}") 28 | print(f"{combination_name}.log.json") 29 | os.system(f"cp {combination_name}.log.json ../../results/mps/{mnames[model0]}_{mnames[model1]}_{i}.json") 30 | 31 | 32 | 33 | if __name__ == "__main__": 34 | logging.basicConfig( 35 | level=logging.INFO, 36 | format='%(asctime)s %(levelname)-8s: 
[%(filename)s:%(lineno)d] %(message)s', 37 | datefmt='%d/%m/%Y %H:%M:%S', 38 | handlers=[ 39 | # output to console 40 | logging.StreamHandler(), 41 | ] 42 | ) 43 | with open('./config.yaml', 'r') as file: 44 | default_full_config = yaml.load(file, Loader=yaml.FullLoader) 45 | 46 | # ----configuration region started---- 47 | model0_mode = 'eval' 48 | model1_mode = 'train' 49 | 50 | policy = 'MPS' 51 | 52 | train_batch_sizes = { 53 | 'resnet50': 32, 54 | 'mobilenet_v2': 64, 55 | 'resnet101': 32, 56 | 'bert': 8, 57 | 'transformer': 8 58 | } 59 | 60 | eval_batch_sizes = { 61 | 'resnet50': 4, 62 | 'mobilenet_v2': 4, 63 | 'resnet101': 4, 64 | 'bert': 2, 65 | 'transformer': 4 66 | } 67 | 68 | request_rates = { 69 | 'resnet50': 15, 70 | 'mobilenet_v2': 40, 71 | } 72 | 73 | num_reqs = { 74 | 'resnet50': 9200, 75 | 'mobilenet_v2': 12000, 76 | } 77 | 78 | models = ['resnet50', 'mobilenet_v2', 'resnet101', 'bert', 'transformer'] 79 | combinations = itertools.product(models[:2], models) 80 | times = 3 81 | start_id = 0 82 | distribution = 'poisson' 83 | 84 | 85 | # ----configuration region ended---- 86 | 87 | default_full_config['shared_config']['distribution'] = distribution 88 | 89 | for model0, model1 in combinations: 90 | default_full_config['models']['model0']['name'] = model0 91 | default_full_config['models']['model0']['mode'] = model0_mode 92 | default_full_config['models']['model1']['name'] = model1 if model0 != model1 else model1 + '-1' 93 | default_full_config['models']['model1']['mode'] = model1_mode 94 | default_full_config['policy'] = policy 95 | 96 | if model0 != model1: 97 | 98 | default_full_config[model0]['request_rate'] = request_rates[model0] 99 | default_full_config[model0]['num_iterations'] = num_reqs[model0] 100 | 101 | default_full_config[model0]['batch_size'] = eval_batch_sizes[model0] 102 | default_full_config[model1]['batch_size'] = train_batch_sizes[model1] 103 | 104 | combination_name = f'{model0_mode}-{model0}{model1_mode}-{model1}' 105 | run(model0, model1, default_full_config, combination_name, times=times, start_id=start_id) 106 | else: 107 | model1_with_suffix = model1 + '-1' 108 | 109 | default_full_config[model0]['request_rate'] = request_rates[model0] 110 | default_full_config[model0]['num_iterations'] = num_reqs[model0] 111 | 112 | default_full_config[model0]['batch_size'] = eval_batch_sizes[model0] 113 | default_full_config[model1_with_suffix]['batch_size'] = train_batch_sizes[model1] 114 | 115 | combination_name = f'{model0_mode}-{model0}{model1_mode}-{model1_with_suffix}' 116 | run(model0, model1, default_full_config, combination_name, times=times, start_id=start_id) -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/rnet101_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb1", 6 | "num_kernels": 1219, 7 | "additional_num_kernels": 1847, 8 | "num_iters": 1200000, 9 | "args": { 10 | "model_name": "resnet101", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "mobilenet_v2", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 21 | "num_kernels": 152, 22 | "num_iters": 12000, 23 | "args": { 24 | "model_name": "mobilenet_v2", 25 | "batchsize": 4, 
26 | "rps": 40, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/rnet101_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb1", 6 | "num_kernels": 1219, 7 | "additional_num_kernels": 1847, 8 | "num_iters": 920000, 9 | "args": { 10 | "model_name": "resnet101", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": true, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "resnet50", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 21 | "num_kernels": 175, 22 | "num_iters": 9200, 23 | "args": { 24 | "model_name": "resnet50", 25 | "batchsize": 4, 26 | "rps": 15, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/rnet_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb1", 6 | "num_kernels": 624, 7 | "additional_num_kernels": 946, 8 | "num_iters": 1200000, 9 | "args": { 10 | "model_name": "resnet50", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "mobilenet_v2", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 21 | "num_kernels": 152, 22 | "num_iters": 12000, 23 | "args": { 24 | "model_name": "mobilenet_v2", 25 | "batchsize": 4, 26 | "rps": 40, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/rnet_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb1", 6 | "num_kernels": 624, 7 | "additional_num_kernels": 946, 8 | "num_iters": 550000, 9 | "args": { 10 | "model_name": "resnet50", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "resnet50", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 21 | "num_kernels": 175, 22 | "num_iters": 9200, 23 | "args": { 24 | "model_name": "resnet50", 25 | "batchsize": 4, 26 | "rps": 15, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/trans_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb1", 6 | 
"num_kernels": 4396, 7 | "additional_num_kernels": 4354, 8 | "num_iters": 1200000, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": true, 14 | "train": true 15 | } 16 | }, 17 | { 18 | "arch": "mobilenet_v2", 19 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 20 | "num_kernels": 152, 21 | "num_iters": 12000, 22 | "args": { 23 | "model_name": "mobilenet_v2", 24 | "batchsize": 4, 25 | "rps": 40, 26 | "uniform": false, 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/trans_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb1", 6 | "num_kernels": 4396, 7 | "additional_num_kernels": 4354, 8 | "num_iters": 550000, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": true, 14 | "train": true 15 | } 16 | }, 17 | { 18 | "arch": "resnet50", 19 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 20 | "num_kernels": 175, 21 | "num_iters": 9200, 22 | "args": { 23 | "model_name": "resnet50", 24 | "batchsize": 4, 25 | "rps": 15, 26 | "uniform": false, 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/gather_latency.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import json 5 | import itertools 6 | 7 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 8 | baselines = ['reef', 'orion', 'mps', 'ideal'] 9 | 10 | hp_list = ['ResNet50', 'MobileNetV2'] 11 | be_list = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 12 | num_runs = 3 13 | 14 | # ideal 15 | df_ideal = pd.DataFrame(0.0, index=models, columns=models) 16 | for hp in hp_list: 17 | results = [] 18 | for run in range(num_runs): 19 | input_file = f"results/ideal/{hp}_{run}_hp.json" 20 | with open(input_file, 'r') as f: 21 | data = json.load(f) 22 | results.append(float(data['p95_latency'])) 23 | print(hp, results) 24 | for be in be_list: 25 | df_ideal.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 26 | df_ideal.to_csv(f'results/ideal_latency.csv') 27 | print(df_ideal) 28 | 29 | # mps 30 | df_mps = pd.DataFrame(0.0, index=models, columns=models) 31 | for be,hp in itertools.product(be_list, hp_list): 32 | results = [] 33 | for run in range(num_runs): 34 | input_file = f"results/mps/{hp}_{be}_{run}.json" 35 | with open(input_file, 'r') as f: 36 | data = json.load(f) 37 | results.append(float(data['p95-latency-0'])) 38 | df_mps.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 39 | df_mps.to_csv(f'results/mps_latency.csv') 40 | print(df_mps) 41 | 42 | # get rest baselines 43 | for baseline in baselines[:-2]: 44 | df = pd.DataFrame(0.0, index=models, columns=models) 45 | for be,hp in itertools.product(be_list, hp_list): 46 | results = [] 47 | for run in range(num_runs): 48 | input_file = f"results/{baseline}/{be}_{hp}_{run}_hp.json" 49 | with open(input_file, 'r') as f: 50 | data = json.load(f) 51 | 
results.append(float(data['p95_latency'])) 52 | df.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 53 | df.to_csv(f'results/{baseline}_latency.csv') 54 | print(baseline, df) 55 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/gather_throughput.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import json 5 | import itertools 6 | 7 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 8 | baselines = ['reef', 'orion', 'mps', 'ideal'] 9 | 10 | hp_list = ['ResNet50', 'MobileNetV2'] 11 | be_list = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 12 | num_runs = 3 13 | 14 | df_hp_ideal_throughput = pd.DataFrame("0", index=models, columns=models) 15 | df_be_ideal_throughput = pd.DataFrame("0", index=models, columns=models) 16 | for hp in hp_list: 17 | res_hp = [] 18 | for run in range(num_runs): 19 | input_file_hp = f"results/ideal/{hp}_{run}_hp.json" 20 | with open(input_file_hp, 'r') as f: 21 | data = json.load(f) 22 | res_hp.append(float(data['throughput'])) 23 | for be in be_list: 24 | print(round(np.average(res_hp),2)) 25 | df_hp_ideal_throughput.at[be, hp] = f"{round(np.average(res_hp),2)}/{round(np.std(res_hp),2)}" 26 | 27 | for be in be_list: 28 | res_be = [] 29 | for run in range(num_runs): 30 | input_file_be = f"results/ideal/{be}_{run}_be.json" 31 | with open(input_file_be, 'r') as f: 32 | data = json.load(f) 33 | res_be.append(float(data['throughput'])) 34 | for hp in hp_list: 35 | df_be_ideal_throughput.at[be, hp] = f"{round(np.average(res_be),2)}/{round(np.std(res_be),2)}" 36 | 37 | df_hp_ideal_throughput.to_csv(f'results/inf_throughput_ideal.csv') 38 | df_be_ideal_throughput.to_csv(f'results/train_throughput_ideal.csv') 39 | print("ideal") 40 | print(df_hp_ideal_throughput) 41 | print(df_be_ideal_throughput) 42 | 43 | df_hp_mps_throughput = pd.DataFrame("0", index=models, columns=models) 44 | df_be_mps_throughput = pd.DataFrame("0", index=models, columns=models) 45 | for be,hp in itertools.product(be_list, hp_list): 46 | res_hp = [] 47 | res_be = [] 48 | for run in range(num_runs): 49 | input_file_hp = f"results/mps/{hp}_{be}_{run}.json" 50 | with open(input_file_hp, 'r') as f: 51 | data = json.load(f) 52 | res_be.append(float(data['throughput-1'])) 53 | res_hp.append(float(data['throughput-0'])) 54 | 55 | df_hp_mps_throughput.at[be, hp] = f"{round(np.average(res_hp),2)}/{round(np.std(res_hp),2)}" 56 | df_be_mps_throughput.at[be, hp] = f"{round(np.average(res_be),2)}/{round(np.std(res_be),2)}" 57 | 58 | df_hp_mps_throughput.to_csv(f'results/inf_throughput_mps.csv') 59 | df_be_mps_throughput.to_csv(f'results/train_throughput_mps.csv') 60 | print("mps") 61 | print(df_hp_mps_throughput) 62 | print(df_be_mps_throughput) 63 | 64 | for baseline in baselines[:-2]: 65 | df_hp_throughput = pd.DataFrame("0", index=models, columns=models) 66 | df_be_throughput = pd.DataFrame("0", index=models, columns=models) 67 | for be,hp in itertools.product(be_list, hp_list): 68 | res_hp = [] 69 | res_be = [] 70 | for run in range(num_runs): 71 | input_file_hp = f"results/{baseline}/{be}_{hp}_{run}_hp.json" 72 | with open(input_file_hp, 'r') as f: 73 | data = json.load(f) 74 | res_hp.append(float(data['throughput'])) 75 | 76 | input_file_be = f"results/{baseline}/{be}_{hp}_{run}_be.json" 77 | with open(input_file_be, 'r') as f: 78 | data = json.load(f) 79 | 
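# best-effort (training) clients only report a "throughput" field
# (cf. benchmarking/be.json), so no latency percentiles are read here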
res_be.append(float(data['throughput'])) 80 | 81 | df_hp_throughput.at[be, hp] = f"{round(np.average(res_hp),2)}/{round(np.std(res_hp),2)}" 82 | df_be_throughput.at[be, hp] = f"{round(np.average(res_be),2)}/{round(np.std(res_be),2)}" 83 | 84 | print(baseline) 85 | print(df_hp_throughput) 86 | print(df_be_throughput) 87 | 88 | df_hp_throughput.to_csv(f'results/inf_throughput_{baseline}.csv') 89 | df_be_throughput.to_csv(f'results/train_throughput_{baseline}.csv') 90 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/plot_latency.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 6 | model2id = { 7 | 'ResNet50': 0, 8 | 'MobileNetV2': 1, 9 | 'ResNet101': 2, 10 | 'BERT': 3, 11 | 'Transformer': 4 12 | } 13 | 14 | # %% 15 | 16 | def get_data(csv_file, error=False): 17 | df = pd.read_csv(csv_file) 18 | df = df.drop(df.columns[0], axis=1) 19 | df.index = models 20 | 21 | for model_row in models: 22 | for model_col in models[:2]: 23 | cell = df.at[model_row, model_col] 24 | df.at[model_row, model_col] = float(cell.split('/')[0]) #float(cell.split('/')[1]) if error else float(cell.split('/')[0]) 25 | if not error: 26 | return df.mean() 27 | 28 | df = df.std() 29 | return df 30 | 31 | 32 | 33 | # %% 34 | method2file = { 35 | #'Temporal Sharing': 'results/latency/sequential.csv', 36 | #'Streams': 'results/latency/streams.csv', 37 | 'MPS': 'results/mps_latency.csv', 38 | 'REEF policy': 'results/reef_latency.csv', 39 | 'Orion': 'results/orion_latency.csv', 40 | 'Ideal': 'results/ideal_latency.csv' 41 | } 42 | 43 | label_font_size = 22 44 | methods = list(method2file.keys()) 45 | 46 | method2data = {} 47 | method2err = {} 48 | 49 | for method, file in method2file.items(): 50 | method2data[method] = get_data(file) 51 | method2err[method] = get_data(file, error=True) 52 | 53 | width = 0.15 54 | fig, ax = plt.subplots(figsize=(14, 8)) 55 | x = np.arange(len(models[:2])) 56 | bars = [] 57 | for method_id, method in enumerate(methods): 58 | 59 | print(x,method2data[method]) 60 | bar = ax.bar( 61 | x + width * method_id, method2data[method][:2], width, 62 | label=method, yerr=method2err[method][:2], 63 | align='edge' 64 | ) 65 | bars.append(bar) 66 | 67 | #for i,r in enumerate(bars[0]): 68 | #plt.text(r.get_x() + r.get_width()/2.0, 300, f"{method2data['Temporal Sharing'][i]:.0f}", ha='center', va='bottom', fontsize=13) 69 | #print(r.get_height()) 70 | 71 | x_tick_positions = x + width * len(methods) / 2 72 | ax.set_xticks( 73 | ticks=x_tick_positions, 74 | labels=models[:2], fontsize=22 75 | ) 76 | plt.yticks(fontsize=22) 77 | #ax.set_ylim(0,300) 78 | ax.set_ylabel('Average p95 inference latency (ms)', fontsize=label_font_size) 79 | ax.set_xlabel('High-priority inference job', fontsize=label_font_size) 80 | 81 | plt.tight_layout() 82 | handles, labels = ax.get_legend_handles_labels() 83 | fig.legend(handles, labels, loc='upper right', bbox_to_anchor=(1.0, 1.08),ncols=6, fontsize=18) 84 | 85 | #plt.show() 86 | plt.savefig("fig7a.png", bbox_inches="tight") 87 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/plot_throughput.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 
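# fig7b is a stacked bar chart: per method, inference throughput (hatch "\\") forms
# the bottom segment and training throughput (hatch "/") is stacked on top of it;
# error bars come from the std half of each "mean/std" cell in the CSVs written by
# gather_throughput.py, and the figure is saved as fig7b.png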
5 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 6 | model2id = { 7 | 'ResNet50': 0, 8 | 'MobileNetV2': 1, 9 | 'ResNet101': 2, 10 | 'BERT': 3, 11 | 'Transformer': 4 12 | } 13 | 14 | # %% 15 | 16 | def get_data(csv_files, error=False): 17 | 18 | df_train_input = pd.read_csv(csv_files[0]) 19 | df_train_input = df_train_input.drop(df_train_input.columns[0], axis=1) 20 | df_train_input.index = models 21 | 22 | df_inf_input = pd.read_csv(csv_files[1]) 23 | df_inf_input = df_inf_input.drop(df_inf_input.columns[0], axis=1) 24 | df_inf_input.index = models 25 | 26 | df_train = pd.DataFrame() 27 | df_inf_new = pd.DataFrame() 28 | 29 | for model_row in models: 30 | for model_col in models[:2]: 31 | cell_train = df_train_input.at[model_row, model_col] 32 | cell_inf = df_inf_input.at[model_row, model_col] 33 | 34 | df_train.at[model_row, model_col] = float(cell_train.split('/')[0]) 35 | df_inf_new.at[model_row, model_col] = float(cell_inf.split('/')[0]) 36 | if error: 37 | return df_train.std(), df_inf_new.std() 38 | else: 39 | return df_train.mean(), df_inf_new.mean() 40 | 41 | # %% 42 | method2file = { 43 | 'MPS': ['results/train_throughput_mps.csv', 'results/inf_throughput_mps.csv'], 44 | 'REEF policy': ['results/train_throughput_reef.csv', 'results/inf_throughput_reef.csv'], 45 | 'Orion': ['results/train_throughput_orion.csv', 'results/inf_throughput_orion.csv'], 46 | 'Ideal': ['results/train_throughput_ideal.csv', 'results/inf_throughput_ideal.csv'], 47 | } 48 | 49 | label_font_size = 22 50 | methods = list(method2file.keys()) 51 | 52 | method2data = {} 53 | method2err = {} 54 | 55 | for method, file in method2file.items(): 56 | method2data[method] = get_data(file) 57 | method2err[method] = get_data(file, error=True) 58 | 59 | width = 0.15 60 | fig, ax = plt.subplots(figsize=(14, 8)) 61 | x = np.arange(len(models[:2])) 62 | colors = ["royalblue", "darkorange", "green", "red", "mediumpurple", "saddlebrown"] 63 | 64 | for method_id, method in enumerate(methods): 65 | 66 | ax.bar( 67 | x + width * method_id, method2data[method][1][:2], width, yerr=method2err[method][1][:2], 68 | align='edge', hatch="\\", color = colors[method_id], 69 | ) 70 | ax.bar( 71 | x + width * method_id, method2data[method][0][:2], width, 72 | label=method, yerr=method2err[method][0][:2], bottom=method2data[method][1][:2], 73 | align='edge', hatch="/", color = colors[method_id], alpha=0.6 74 | ) 75 | 76 | x_tick_positions = x + width * len(methods) / 2 77 | ax.set_xticks( 78 | ticks=x_tick_positions, 79 | labels=models[:2], fontsize=22 80 | ) 81 | plt.yticks(fontsize=22) 82 | ax.set_ylabel('Total Throughput (requests/sec)', fontsize=label_font_size) 83 | ax.set_xlabel('High-priority Inference job', fontsize=label_font_size) 84 | 85 | plt.tight_layout() 86 | handles, labels = ax.get_legend_handles_labels() 87 | fig.legend(handles, labels, loc='upper right', prop={'size': 20}, borderaxespad=2) 88 | 89 | #plt.show() 90 | plt.savefig("fig7b.png", bbox_inches="tight") 91 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/prep_dirs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir results 4 | mkdir results/ideal 5 | mkdir results/reef 6 | mkdir results/orion 7 | mkdir results/mps -------------------------------------------------------------------------------- /artifact_evaluation/fig7/run_ideal.py: -------------------------------------------------------------------------------- 1 | 
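# Ideal baseline for Figure 7: run each high-priority inference model and each
# best-effort training model alone on the GPU, num_runs times, and copy every
# run's client_0.json into results/ideal/ for gather_latency.py and
# gather_throughput.py to aggregate. Run from artifact_evaluation/fig7 after
# ./prep_dirs.sh (e.g. python3.8 run_ideal.py).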
import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files_hp = [ 6 | ("ResNet50", "rnet"), 7 | ("MobileNetV2", "mnet"), 8 | ] 9 | 10 | trace_files_be = [ 11 | ("ResNet50", "rnet"), 12 | ("MobileNetV2", "mnet"), 13 | ("ResNet101", "rnet101"), 14 | ("BERT", "bert"), 15 | ("Transformer", "trans") 16 | ] 17 | 18 | for (model, f) in trace_files_hp: 19 | for run in range(num_runs): 20 | print(model, run, flush=True) 21 | # run 22 | file_path = f"config_files/ideal/{f}_inf.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path}") 24 | 25 | # copy results 26 | os.system(f"cp client_0.json results/ideal/{model}_{run}_hp.json") 27 | os.system("rm client_0.json") 28 | 29 | for (model, f) in trace_files_be: 30 | for run in range(num_runs): 31 | print(model, run, flush=True) 32 | # run 33 | file_path = f"config_files/ideal/{f}_train.json" 34 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path}") 35 | 36 | # copy results 37 | os.system(f"cp client_0.json results/ideal/{model}_{run}_be.json") 38 | os.system("rm client_0.json") 39 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/run_orion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files = [ 6 | ("ResNet50", "ResNet50", "rnet_rnet", 160000), 7 | ("ResNet50", "MobileNetV2", "rnet_mnet", 100000), 8 | ("MobileNetV2", "ResNet50", "mnet_rnet", 160000), 9 | ("MobileNetV2", "MobileNetV2", "mnet_mnet", 100000), 10 | ("ResNet101", "ResNet50", "rnet101_rnet", 160000), 11 | ("ResNet101", "MobileNetV2", "rnet101_mnet", 100000), 12 | ("BERT", "ResNet50", "bert_rnet", 160000), 13 | ("BERT", "MobileNetV2", "bert_mnet", 100000), 14 | ("Transformer", "ResNet50", "trans_rnet", 160000), 15 | ("Transformer", "MobileNetV2", "trans_mnet", 100000), 16 | ] 17 | 18 | for (be, hp, f, max_be_duration) in trace_files: 19 | for run in range(num_runs): 20 | print(be, hp, run, flush=True) 21 | # run 22 | file_path = f"config_files/{f}.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path} --orion_max_be_duration {max_be_duration}") 24 | 25 | # copy results 26 | os.system(f"cp client_1.json results/orion/{be}_{hp}_{run}_hp.json") 27 | os.system(f"cp client_0.json results/orion/{be}_{hp}_{run}_be.json") 28 | 29 | os.system("rm client_1.json") 30 | os.system("rm client_0.json") -------------------------------------------------------------------------------- /artifact_evaluation/fig7/run_reef.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files = [ 6 | ("ResNet50", "ResNet50", "rnet_rnet"), 7 | ("ResNet50", "MobileNetV2", "rnet_mnet"), 8 | ("MobileNetV2", "ResNet50", "mnet_rnet"), 9 | ("MobileNetV2", "MobileNetV2", "mnet_mnet"), 10 | ("ResNet101", "ResNet50", "rnet101_rnet"), 11 | ("ResNet101", "MobileNetV2", "rnet101_mnet"), 12 | ("BERT", "ResNet50", "bert_rnet"), 13 | ("BERT", "MobileNetV2", "bert_mnet"), 14 | ("Transformer", "ResNet50", "trans_rnet"), 15 | ("Transformer", "MobileNetV2", "trans_mnet"), 16 | ] 17 | 18 | for (be, hp, f) in trace_files: 19 | 
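# each (best-effort, high-priority) pair is replayed num_runs times under the
# REEF scheduling policy (--reef_depth 12 is the depth setting the artifact
# fixes for this figure); launch_jobs.py emits client_1.json (high-priority)
# and client_0.json (best-effort), which are copied into results/reef/ below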
for run in range(num_runs): 20 | print(be, hp, run, flush=True) 21 | # run 22 | file_path = f"config_files/{f}.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo reef --config_file {file_path} --reef_depth 12") 24 | 25 | # copy results 26 | os.system(f"cp client_1.json results/reef/{be}_{hp}_{run}_hp.json") 27 | os.system(f"cp client_0.json results/reef/{be}_{hp}_{run}_be.json") 28 | 29 | os.system("rm client_1.json") 30 | os.system("rm client_0.json") 31 | -------------------------------------------------------------------------------- /benchmarking/be.json: -------------------------------------------------------------------------------- 1 | {"throughput": 4.434374429312142} -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/compute_optimal.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | ifile = sys.argv[1] 4 | dur_total = 0 5 | 6 | with open(ifile, 'r') as f: 7 | lines = f.readlines() 8 | for l in lines[1:]: 9 | tokens = l.split(",") 10 | sms_used = int(tokens[-2]) 11 | dur = float(tokens[-1])/1000 12 | if (sms_used>80): 13 | dur_total += dur 14 | 15 | dur_total_ms = dur_total/1000 16 | print(dur_total_ms*2) 17 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/conv_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.conv = torch.nn.Conv2d(3,64,kernel_size=7, stride=2, padding=3, bias=False) 24 | 25 | def forward(self, x): 26 | for i in range(25): 27 | y = self.conv(x) 28 | 29 | 30 | def conv_loop(batchsize, train, local_rank, barriers, tid): 31 | 32 | print(batchsize, local_rank, barriers, tid) 33 | barriers[0].wait() 34 | 35 | data = torch.rand([batchsize, 3, 224, 224]).to(local_rank).contiguous() 36 | model = Model() 37 | model = model.to(0) 38 | 39 | if train: 40 | model.train() 41 | else: 42 | model.eval() 43 | 44 | for i in range(10): 45 | print("Start epoch: ", i) 46 | 47 | start = time.time() 48 | start_iter = time.time() 49 | 50 | batch_idx = 0 51 | 52 | while batch_idx < 1: 53 | 54 | print(f"submit!, batch_idx is {batch_idx}") 55 | 56 | if train: 57 | output = model(data) 58 | else: 59 | with torch.no_grad(): 60 | output = model(data) 61 | 62 | 63 | batch_idx += 1 64 | 65 | start_iter = time.time() 66 | 67 | #barriers[0].wait() 68 | if i < 9: 69 | barriers[0].wait() 70 | print(f"{tid}, Epoch done!") 71 | 72 | print("Finished! 
Ready to join!") 73 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/examples/basic_config_bert.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | { 4 | "arch": "bert", 5 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb0", 6 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb1", 7 | "num_kernels": 4777, 8 | "additional_num_kernels": 4777, 9 | "num_iters": 100, 10 | "args": { 11 | "batchsize": 8, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": false, 15 | "train": true 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/examples/basic_config_transformer.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | { 4 | "arch": "transformer", 5 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb0", 6 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb1", 7 | "num_kernels": 4396, 8 | "additional_num_kernels": 4354, 9 | "num_iters": 100, 10 | "args": { 11 | "batchsize": 8, 12 | "rps": 0, 13 | "dummy_data": true, 14 | "uniform": false, 15 | "train": true 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/examples/basic_config_vision.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 2000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 30, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | } 16 | 17 | ] 18 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/extract_meas.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | input_file = sys.argv[1] 4 | 5 | i=0 6 | ar = [0, 0] 7 | ar_str = [] 8 | 9 | 10 | # with open(input_file, 'r') as f: 11 | # while (1): 12 | # l = f.readline() 13 | # i+=1 14 | # print(i, l) 15 | 16 | 17 | tt = [] 18 | iters = [] 19 | with open(input_file, 'r') as f: 20 | lines = f.readlines() 21 | for i,l in enumerate(lines): 22 | 23 | # if 'p50' in l: 24 | # tokens = l.split(",") 25 | # #print(l) 26 | # if "Client 0" in tokens[0]: 27 | # ar[0] = round(float(tokens[2].split(" ")[-2])*1000, 2) 28 | # else: 29 | # ar[1] = round(float(tokens[2].split(" ")[-2])*1000, 2) 30 | # i += 1 31 | # if (i%1==0): 32 | # #s = f"{ar[0]}/{ar[1]}" 33 | # s = f"{ar[1]}" 34 | # ar_str.append(s) 35 | # if (i==10): 36 | # i=0 37 | 38 | if 'Total loop' in l and 'Client' not in l: 39 | tokens = l.split(" ") 40 | tt.append(round(float(tokens[-2]),2)) 41 | if '=======' in l: 42 | tokens = l.split(" ") 43 | iters.append(int(tokens[-2])) 44 | print(len(tt)) 45 | 46 | #for i in range(5): 47 | # print(f"{ar_str[5*i]},{ar_str[5*i+1]},{ar_str[5*i+2]},{ar_str[5*i+3]},{ar_str[5*i+4]}") 48 | 49 | # for i in range(5): 50 | # print(f"{tt[5*i]},{tt[5*i+1]},{tt[5*i+2]},{tt[5*i+3]},{tt[5*i+4]}") 51 | 52 | # for i in range(5): 53 | # print(f"{iters[5*i]},{iters[5*i+1]},{iters[5*i+2]},{iters[5*i+3]},{iters[5*i+4]}") 54 | 55 | inf_requests = [9200, 12000, 5500, 1200, 3400] 56 | 57 | #print("--------- High Priority Throughput:") 58 | # 
hp_th = [] 59 | # for i in range(len(tt)): 60 | # hp_th.append(round(inf_requests[i%5]/tt[i],2)) 61 | # for i in range(5): 62 | # print(f"{hp_th[5*i]},{hp_th[5*i+1]},{hp_th[5*i+2]},{hp_th[5*i+3]},{hp_th[5*i+4]}") 63 | 64 | # #print("--------- Low Priority Throughput:") 65 | 66 | hp_th = [] 67 | for i in range(len(tt)): 68 | hp_th.append(round(iters[i]/tt[i],2)) 69 | for i in range(5): 70 | print(f"{iters[5*i]},{iters[5*i+1]},{iters[5*i+2]},{iters[5*i+3]},{iters[5*i+4]}") 71 | 72 | 73 | # print("--------- High Priority Throughput:") 74 | # hp_th = [] 75 | # for i in range(len(tt)): 76 | # hp_th.append(round(1000/tt[i],2)) 77 | # for i in range(5): 78 | # print(f"{hp_th[5*i]},{hp_th[5*i+1]},{hp_th[5*i+2]},{hp_th[5*i+3]},{hp_th[5*i+4]}") 79 | 80 | # print("--------- Low Priority Throughput:") 81 | # hp_th = [] 82 | # for i in range(len(tt)): 83 | # hp_th.append(round(iters[i]/tt[i],2)) 84 | # for i in range(5): 85 | # print(f"{hp_th[5*i]},{hp_th[5*i+1]},{hp_th[5*i+2]},{hp_th[5*i+3]},{hp_th[5*i+4]}") 86 | 87 | 88 | print("--------- Total Throughput:") 89 | hp_th = [] 90 | for i in range(len(tt)): 91 | hp_th.append(round((iters[i]+1000)/tt[i],2)) 92 | for i in range(5): 93 | print(f"{hp_th[5*i]},{hp_th[5*i+1]},{hp_th[5*i+2]},{hp_th[5*i+3]},{hp_th[5*i+4]}") 94 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/toy_models/bnorm_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.bn = torch.nn.BatchNorm2d(256) 24 | 25 | def forward(self, x): 26 | for i in range(25): 27 | y = self.bn(x) 28 | 29 | 30 | def bnorm_loop(batchsize, train, local_rank, barriers, tid): 31 | 32 | print(batchsize, local_rank, barriers, tid) 33 | barriers[0].wait() 34 | 35 | data = torch.rand([batchsize, 256, 112, 112]).to(local_rank).contiguous() 36 | model = Model() 37 | model = model.to(0) 38 | 39 | if train: 40 | model.train() 41 | else: 42 | model.eval() 43 | 44 | for i in range(10): 45 | print("Start epoch: ", i) 46 | 47 | start = time.time() 48 | start_iter = time.time() 49 | 50 | batch_idx = 0 51 | 52 | while batch_idx < 1: 53 | 54 | print(f"submit!, batch_idx is {batch_idx}") 55 | 56 | if train: 57 | output = model(data) 58 | else: 59 | with torch.no_grad(): 60 | output = model(data) 61 | 62 | 63 | batch_idx += 1 64 | 65 | start_iter = time.time() 66 | 67 | # barriers[0].wait() 68 | if i < 9: 69 | barriers[0].wait() 70 | print(f"{tid}, Epoch done!") 71 | 72 | print("Finished! 
Ready to join!") 73 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/toy_models/conv_bn_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.conv = torch.nn.Conv2d(3,64,kernel_size=7, stride=2, padding=3, bias=False) 24 | self.bn = torch.nn.BatchNorm2d(64) 25 | self.x = torch.rand([32, 64, 112, 112]).to(0) 26 | 27 | def forward(self, x): 28 | for i in range(25): 29 | y = self.conv(x) 30 | z = self.bn(y) 31 | 32 | 33 | def conv_bn_loop(batchsize, train, local_rank, barriers, tid): 34 | 35 | print(batchsize, local_rank, barriers, tid) 36 | barriers[0].wait() 37 | 38 | data = torch.rand([batchsize, 3, 224, 224]).to(local_rank) 39 | model = Model() 40 | model = model.to(0) 41 | 42 | if train: 43 | model.train() 44 | else: 45 | model.eval() 46 | 47 | for i in range(10): 48 | print("Start epoch: ", i) 49 | 50 | start = time.time() 51 | start_iter = time.time() 52 | 53 | batch_idx = 0 54 | 55 | while batch_idx < 1: 56 | 57 | print(f"submit!, batch_idx is {batch_idx}") 58 | 59 | if train: 60 | output = model(data) 61 | else: 62 | with torch.no_grad(): 63 | output = model(data) 64 | 65 | 66 | batch_idx += 1 67 | 68 | start_iter = time.time() 69 | 70 | #barriers[0].wait() 71 | if i < 9: 72 | barriers[0].wait() 73 | print(f"{tid}, Epoch done!") 74 | 75 | print("Finished! 
Ready to join!") 76 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/transformer_trainer_torch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | 5 | from mem_transformer import MemTransformerLM 6 | import lamb 7 | import numpy as np 8 | 9 | class DummyDataLoader(): 10 | def __init__(self, batchsize): 11 | self.batchsize = batchsize 12 | self.data = torch.ones((192, self.batchsize), pin_memory=True).to(torch.int64) 13 | self.target = torch.ones((192, self.batchsize), pin_memory=True).to(torch.int64) 14 | 15 | def __iter__(self): 16 | return self 17 | 18 | def __next__(self): 19 | return self.data, self.target 20 | 21 | def transformer_loop(batchsize, train, default, num_iters, rps, uniform, dummy_data, local_rank, start_barriers, end_barriers, tid): 22 | 23 | start_barriers[0].wait() 24 | 25 | if rps > 0: 26 | if uniform: 27 | sleep_times = [1/rps]*num_iters 28 | else: 29 | sleep_times = np.random.exponential(scale=1/rps, size=num_iters) 30 | else: 31 | sleep_times = [0]*num_iters 32 | 33 | if default: 34 | s = torch.cuda.default_stream() 35 | else: 36 | s = torch.cuda.Stream() 37 | timings = [] 38 | 39 | model_config = { 40 | 'n_token': 267735, 41 | 'n_layer': 16, 42 | 'n_head': 8, 43 | 'd_model': 512, 44 | 'd_head': 64, 45 | 'd_inner': 2048, 46 | 'dropout': 0.1, 47 | 'dropatt': 0.0, 48 | 'dtype': None, 49 | 'tie_weight': True, 50 | 'd_embed': 512, 51 | 'div_val': 1, 52 | 'tie_projs': [False, True, True, True], 53 | 'pre_lnorm': False, 54 | 'tgt_len': 192, 55 | 'ext_len': 0, 56 | 'mem_len': 192, 57 | 'cutoffs': [19997, 39997, 199997], 58 | 'same_length': False, 59 | 'attn_type': 0, 60 | 'clamp_len': -1, 61 | 'sample_softmax': -1 62 | } 63 | 64 | train_loader = DummyDataLoader(batchsize) 65 | train_iter = enumerate(train_loader) 66 | batch_idx, batch = next(train_iter) 67 | 68 | model = MemTransformerLM(**model_config).to(0) 69 | 70 | if train: 71 | model.train() 72 | optimizer = lamb.Lamb(model.parameters(), lr=0.1) 73 | else: 74 | model.eval() 75 | 76 | next_startup = time.time() 77 | open_loop = False 78 | timings = [0 for _ in range(num_iters)] 79 | 80 | mems = None 81 | print("before while") 82 | with torch.cuda.stream(s): 83 | for i in range(1): 84 | print("Start epoch: ", i) 85 | 86 | while batch_idx < num_iters: 87 | start = time.time() 88 | 89 | if train: 90 | optimizer.zero_grad() 91 | start_iter = time.time() 92 | data, target = batch[0].to(local_rank), batch[1].to(local_rank) 93 | loss, mems = model(data, target, mems) 94 | loss = loss.float().mean().type_as(loss) 95 | loss.backward() 96 | optimizer.step() 97 | #s.synchronize() 98 | print(f"Client {tid}, iter {batch_idx} took {time.time()-start_iter} sec") 99 | batch_idx,batch = next(train_iter) 100 | if (batch_idx==10): 101 | starttime = time.time() 102 | else: 103 | with torch.no_grad(): 104 | cur_time = time.time() 105 | ###### OPEN LOOP ##### 106 | if (cur_time >= next_startup): 107 | print(f"Client {tid} submit!, batch_idx is {batch_idx}") 108 | data, target = batch[0].to(local_rank), batch[1].to(local_rank) 109 | output, mems = model(data, target, mems) 110 | s.synchronize() 111 | timings[batch_idx] = time.time()-next_startup 112 | print(f"It took {timings[batch_idx]} sec") 113 | next_startup += sleep_times[batch_idx] 114 | batch_idx,batch = next(train_iter) 115 | if (batch_idx==10): 116 | starttime = time.time() 117 | 118 | print(f"FINISHED! 
It took {time.time()-starttime} sec") 119 | end_barriers[0].wait() 120 | 121 | #print(f"Time is {time.time()-starttime} sec") 122 | if not train: 123 | timings = timings[2:] 124 | p50 = np.percentile(timings, 50) 125 | p95 = np.percentile(timings, 95) 126 | p99 = np.percentile(timings, 99) 127 | 128 | print(f"Client {tid} finished! p50: {p50} sec, p95: {p95} sec, p99: {p99} sec") 129 | print(f"Total time is {time.time()-starttime} sec") 130 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/utility_scripts/check_unknown.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | ifile = sys.argv[1] 5 | 6 | sms_used = [] 7 | durations = [] 8 | durations_all = [] 9 | with open(ifile, 'r') as f: 10 | lines = f.readlines() 11 | for l in lines[1:]: 12 | tokens = l.split(",") 13 | profile = int(tokens[1]) 14 | if profile==-1: 15 | sms_used.append(int(tokens[-2])) 16 | durations.append(float(tokens[-1])/1000) 17 | durations_all.append(float(tokens[-1])/1000) 18 | 19 | np.set_printoptions(threshold=np.inf) 20 | print(np.sort(sms_used)) 21 | print(np.sort(durations)) 22 | print(len(sms_used)/len(durations_all)) 23 | #print(f"average: {np.average(np.asarray(durations))}") 24 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/utility_scripts/compute_average.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | inf_durations={ 5 | #"resnet50": 6280, 6 | # "mobilenet": 4940, 7 | # "resnet101": 10500, 8 | # "bert": 49020, 9 | "transformer": 17000 10 | } 11 | 12 | ifile = sys.argv[1] 13 | 14 | durations = [] 15 | sms_used = [] 16 | 17 | in_between = [] 18 | 19 | with open(ifile, 'r') as f: 20 | lines = f.readlines() 21 | for l in lines[1:]: 22 | tokens = l.split(",") 23 | sms_used.append(int(tokens[-2])) 24 | dur = float(tokens[-1])/1000 25 | durations.append(dur) 26 | if (dur>=320 and dur<=350): 27 | print(l) 28 | in_between.append(dur) 29 | 30 | avg_duration = np.average(np.asarray(durations)) 31 | max_duration = max(durations) 32 | # print(np.sort(durations)) 33 | # print(np.sort(sms_used)) 34 | 35 | p50 = np.percentile(durations, 50) 36 | p75 = np.percentile(durations, 75) 37 | p95 = np.percentile(durations, 95) 38 | p99 = np.percentile(durations, 99) 39 | 40 | print(len(in_between)) 41 | 42 | for hp_inference in inf_durations: 43 | D = (0.025 * inf_durations[hp_inference])/avg_duration 44 | print(f"{hp_inference}, Average duration: {avg_duration} us, max duration is {max_duration} us, hp duration is {inf_durations[hp_inference]} us, D is {D}") -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/utility_scripts/download_imagenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # a script to download the image net dataset 3 | set -x -e 4 | DATA_DIR=${1:-/cluster/scratch/xianma/vision} 5 | aria2c -c -x 10 -s 10 -d "$DATA_DIR" --download-result=full https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar 6 | # now the tar lives in DATA_DIR 7 | cd "$DATA_DIR" 8 | mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train 9 | tar -xvf ILSVRC2012_img_train.tar 10 | # move the entire compressed file out of the train folder as the folder should contain actual data 11 | mv ILSVRC2012_img_train.tar ../ 12 | 13 | # the 
last line should be executed non-interactively (e.g. as a sbatch job) because it is really time-consuming, to unzip each tar 14 | # find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done 15 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/utility_scripts/get_avg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | l1 = [[0,0,0,0,0] for _ in range(5)] 4 | l2 = [[0,0,0,0,0] for _ in range(5)] 5 | 6 | with open('case1', 'r') as f1: 7 | f1.readline() 8 | #f1.readline() 9 | l = f1.readlines() 10 | for i in range(5): 11 | t = l[i] 12 | s = t.split(",") 13 | for j in range(5): 14 | l1[i][j] = float(s[j]) 15 | 16 | with open('case2', 'r') as f2: 17 | f2.readline() 18 | #f2.readline() 19 | l = f2.readlines() 20 | 21 | for i in range(5): 22 | t = l[i] 23 | s = t.split(",") 24 | for j in range(5): 25 | l2[i][j] = float(s[j]) 26 | print(l1, l2) 27 | 28 | l_total = [[0,0,0,0,0] for _ in range(5)] 29 | l_std = [[0,0,0,0,0] for _ in range(5)] 30 | l_str = [[0,0,0,0,0] for _ in range(5)] 31 | 32 | for i in range(5): 33 | for j in range(5): 34 | l_total[i][j] = round(np.average([l1[i][j],l2[i][j]]),2) 35 | l_std[i][j] = round(np.std([l1[i][j],l2[i][j]]),2) 36 | l_str[i][j] = str(l_total[i][j]) + "/" + str(l_std[i][j]) 37 | for i in range(5): 38 | print(f"{l_total[i][0]}/{l_std[i][0]},{l_total[i][1]}/{l_std[i][1]},{l_total[i][2]}/{l_std[i][2]},{l_total[i][3]}/{l_std[i][3]},{l_total[i][4]}/{l_std[i][4]}") 39 | -------------------------------------------------------------------------------- /benchmarking/hp.json: -------------------------------------------------------------------------------- 1 | {"p50_latency": 12.213349342346191, "p95_latency": 21.53183221817016, "p99_latency": 24.332609176635742, "throughput": 20.809929229394967} -------------------------------------------------------------------------------- /benchmarking/multi_client_example.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 20000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 30, 11 | "uniform": false, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "resnet50", 18 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 19 | "num_kernels": 175, 20 | "num_iters": 20000, 21 | "args": { 22 | "model_name": "resnet50", 23 | "batchsize": 4, 24 | "rps": 30, 25 | "uniform": false, 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | }, 30 | { 31 | "arch": "resnet50", 32 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 33 | "num_kernels": 175, 34 | "num_iters": 20000, 35 | "args": { 36 | "model_name": "resnet50", 37 | "batchsize": 4, 38 | "rps": 30, 39 | "uniform": false, 40 | "dummy_data": true, 41 | "train": false 42 | } 43 | }, 44 | { 45 | "arch": "resnet50", 46 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 47 | "num_kernels": 175, 48 | "num_iters": 20000, 49 | "args": { 50 | "model_name": "resnet50", 51 | "batchsize": 4, 52 | "rps": 30, 53 | "uniform": false, 54 | "dummy_data": true, 55 | "train": false 56 | } 57 | }, 58 | { 59 | "arch": "resnet50", 60 | "kernel_file": 
"/root/orion/artifact_evaluation/example/resnet50_4_fwd", 61 | "num_kernels": 175, 62 | "num_iters": 2000, 63 | "args": { 64 | "model_name": "resnet50", 65 | "batchsize": 4, 66 | "rps": 30, 67 | "uniform": false, 68 | "dummy_data": true, 69 | "train": false 70 | } 71 | } 72 | ] 73 | -------------------------------------------------------------------------------- /benchmarking/scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run_traces.py > results/inf_inf_res/reef_updated_poisson_1 4 | python run_traces.py > results/inf_inf_res/reef_updated_poisson_2 5 | -------------------------------------------------------------------------------- /benchmarking/scripts/run_traces.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | trace_files = [ 5 | "rnet_rnet_ti", 6 | "rnet_mnet_ti", 7 | "mnet_rnet_ti", 8 | "mnet_mnet_ti", 9 | "rnet101_rnet_ti", 10 | "rnet101_mnet_ti", 11 | "bert_rnet_ti", 12 | "bert_mnet_ti", 13 | "trans_rnet_ti", 14 | "trans_mnet_ti", 15 | ] 16 | 17 | 18 | 19 | #orion, hp is inference - uniform 20 | # depths = [110000, 100000, 150000, 1250000, 400000, 21 | # 110000, 100000, 150000, 1250000, 400000, 22 | # 110000, 100000, 150000, 1250000, 400000, 23 | # 110000, 100000, 150000, 1250000, 400000, 24 | # 110000, 100000, 150000, 1250000, 400000] 25 | 26 | # orion, hp is inference, threshold is 0.05 27 | # depths = [320000, 300000, 400000, 2500000, 800000, 28 | # 320000, 300000, 400000, 2500000, 800000, 29 | # 320000, 300000, 400000, 2500000, 800000, 30 | # 320000, 300000, 400000, 2500000, 800000, 31 | # 320000, 300000, 400000, 2500000, 800000] 32 | 33 | depths = [ 34 | 6,5,10,48,16, 35 | 8,6,13,58,21, 36 | 8,6,13,60,21, 37 | 16,13,27,123,43, 38 | 22,17,36,170,59, 39 | ] 40 | 41 | limits = [1,1,1,1,1, 42 | 1,1,1,1,1, 43 | 1,1,1,1,1, 44 | 1,1,1,1,1, 45 | 1,1,1,1,1] 46 | updates = [1,1,1,1,1, 47 | 1,1,1,1,1, 48 | 1,1,1,1,1, 49 | 1,1,1,1,1, 50 | 1,1,1,1,1] 51 | 52 | 53 | # # orion, hp is training 54 | # depths = [ 55 | # 1000000, 1000000, 1000000, 40000000, 32000000, 56 | # 1000000, 1000000, 1000000, 40000000, 32000000, 57 | # 1000000, 1000000, 1000000, 40000000, 32000000, 58 | # 1000000, 1000000, 1000000, 40000000, 32000000, 59 | # 1000000, 1000000, 1000000, 40000000, 32000000 60 | # ] 61 | # limits = [ 62 | # 135, 120, 235, 250, 250, 63 | # 135, 120, 235, 250, 250, 64 | # 135, 120, 235, 250, 250, 65 | # 135, 120, 235, 250, 250, 66 | # 135, 120, 235, 250, 250 67 | # ] 68 | # updates = [ 69 | # 768, 733, 1534, 2669, 1622, 70 | # 768, 733, 1534, 2669, 1622, 71 | # 768, 733, 1534, 2669, 1622, 72 | # 768, 733, 1534, 2669, 1622, 73 | # 768, 733, 1534, 2669, 1622 74 | # ] 75 | 76 | 77 | 78 | print(len(trace_files), len(depths)) 79 | assert len(trace_files) == len(depths) 80 | for f,d,l,u in zip(trace_files, depths, limits, updates): 81 | print(f,d, flush=True) 82 | file_path = f"eval/inf_inf/poisson/{f}.json" 83 | os.system(f"python launch_jobs.py {file_path} {d} {l} {u}") 84 | time.sleep(10) 85 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd src/cuda_capture && make libinttemp.so && cd ../../ 4 | cd src/scheduler && make scheduler_eval.so && cd ../../ 5 | -------------------------------------------------------------------------------- /orion_architecture.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/orion_architecture.png -------------------------------------------------------------------------------- /profiling/benchmarks/bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | import modeling 5 | 6 | from optimization import BertAdam 7 | 8 | def bert(batchsize, local_rank, do_eval=True, profile=True): 9 | 10 | model_config = { 11 | "attention_probs_dropout_prob": 0.1, 12 | "hidden_act": "gelu", 13 | "hidden_dropout_prob": 0.1, 14 | "hidden_size": 768, 15 | "initializer_range": 0.02, 16 | "intermediate_size": 3072, 17 | "max_position_embeddings": 512, 18 | "num_attention_heads": 12, 19 | "num_hidden_layers": 12, 20 | "type_vocab_size": 2, 21 | "vocab_size": 30522 22 | } 23 | 24 | config = modeling.BertConfig.from_dict(model_config) 25 | # Padding for divisibility by 8 26 | if config.vocab_size % 8 != 0: 27 | config.vocab_size += 8 - (config.vocab_size % 8) 28 | 29 | 30 | input_ids = torch.ones((batchsize, 384)).to(torch.int64).to(0) 31 | segment_ids = torch.ones((batchsize, 384)).to(torch.int64).to(0) 32 | input_mask = torch.ones((batchsize, 384)).to(torch.int64).to(0) 33 | start_positions = torch.zeros((batchsize)).to(torch.int64).to(0) 34 | end_positions = torch.ones((batchsize)).to(torch.int64).to(0) 35 | 36 | 37 | model = modeling.BertForQuestionAnswering(config).to(0) 38 | 39 | if do_eval: 40 | model.eval() 41 | else: 42 | model.train() 43 | param_optimizer = list(model.named_parameters()) 44 | 45 | param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] 46 | 47 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 48 | optimizer_grouped_parameters = [ 49 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 50 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 51 | ] 52 | optimizer = BertAdam(optimizer_grouped_parameters, lr=5e-5, warmup=0.1, t_total=100) 53 | 54 | batch_idx = 0 55 | torch.cuda.synchronize() 56 | 57 | while batch_idx < 1: 58 | 59 | if batch_idx == 0: 60 | if profile == 'ncu': 61 | torch.cuda.nvtx.range_push("start") 62 | elif profile == 'nsys': 63 | torch.cuda.profiler.cudart().cudaProfilerStart() 64 | 65 | if do_eval: 66 | with torch.no_grad(): 67 | output = model(input_ids, segment_ids, input_mask) 68 | else: 69 | optimizer.zero_grad() 70 | start_logits, end_logits = model(input_ids, segment_ids, input_mask) 71 | ignored_index = start_logits.size(1) 72 | loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index) 73 | start_loss = loss_fct(start_logits, start_positions) 74 | end_loss = loss_fct(end_logits, end_positions) 75 | loss = (start_loss + end_loss) / 2 76 | loss.backward() 77 | optimizer.step() 78 | 79 | if batch_idx == 0: 80 | if profile == 'ncu': 81 | torch.cuda.nvtx.range_pop() 82 | elif profile == 'nsys': 83 | torch.cuda.profiler.cudart().cudaProfilerStop() 84 | 85 | batch_idx += 1 86 | 87 | print("Done!") 88 | 89 | if __name__ == "__main__": 90 | bert(8, 0,False, 'nsys') 91 | -------------------------------------------------------------------------------- /profiling/benchmarks/bnorm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | 
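# bnorm.py: toy BatchNorm2d microbenchmark (each forward runs 25 back-to-back
# bn calls); only iteration 9 is wrapped in NVTX / CUDA-profiler ranges, so
# ncu/nsys capture a single warmed-up iteration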
import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | 21 | class Model(torch.nn.Module): 22 | def __init__(self): 23 | super().__init__() 24 | self.bn = torch.nn.BatchNorm2d(64) 25 | 26 | def forward(self, x): 27 | for i in range(25): 28 | x = self.bn(x) 29 | 30 | 31 | def bnorm_loop(batchsize, local_rank, do_eval=True, profile=None): 32 | 33 | print("-------------- thread id: ", threading.get_native_id()) 34 | 35 | data = torch.rand([batchsize, 64, 112, 112]).to(local_rank).contiguous() 36 | model = Model() 37 | model = model.to(0) 38 | 39 | if do_eval: 40 | model.eval() 41 | else: 42 | model.train() 43 | 44 | print("Enter loop!") 45 | 46 | batch_idx = 0 47 | torch.cuda.synchronize() 48 | 49 | while batch_idx < 10: 50 | 51 | if batch_idx == 9: 52 | if profile == 'ncu': 53 | torch.cuda.nvtx.range_push("start") 54 | elif profile == 'nsys': 55 | torch.cuda.profiler.cudart().cudaProfilerStart() 56 | 57 | if do_eval: 58 | with torch.no_grad(): 59 | output = model(data) 60 | else: 61 | output = model(data) 62 | 63 | if batch_idx == 9: 64 | if profile == 'ncu': 65 | torch.cuda.nvtx.range_pop() 66 | elif profile == 'nsys': 67 | torch.cuda.profiler.cudart().cudaProfilerStop() 68 | 69 | batch_idx += 1 70 | 71 | print("Done!") 72 | 73 | if __name__ == "__main__": 74 | bnorm_loop(32, 0, False, 'nsys') -------------------------------------------------------------------------------- /profiling/benchmarks/conv.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.conv = torch.nn.Conv2d(3,64,kernel_size=7, stride=2, padding=3, bias=False) 24 | 25 | def forward(self, x): 26 | for i in range(25): 27 | y = self.conv(x) 28 | 29 | 30 | 31 | def conv_loop(batchsize, local_rank, do_eval=True, profile=None): 32 | 33 | print("-------------- thread id: ", threading.get_native_id()) 34 | 35 | data = torch.rand([batchsize, 3, 224, 224]).to(local_rank).contiguous() 36 | model = Model() 37 | model = model.to(0) 38 | 39 | if do_eval: 40 | model.eval() 41 | else: 42 | model.train() 43 | 44 | print("Enter loop!") 45 | 46 | batch_idx = 0 47 | torch.cuda.synchronize() 48 | 49 | while batch_idx < 10: 50 | 51 | if batch_idx == 9: 52 | if profile == 'ncu': 53 | torch.cuda.nvtx.range_push("start") 54 | elif profile == 'nsys': 55 | torch.cuda.profiler.cudart().cudaProfilerStart() 56 | 57 | if do_eval: 58 | with torch.no_grad(): 59 | output = model(data) 60 | else: 61 | output = model(data) 62 | 63 | if 
batch_idx == 9: 63 | if profile == 'ncu': 64 | torch.cuda.nvtx.range_pop() 65 | elif profile == 'nsys': 66 | torch.cuda.profiler.cudart().cudaProfilerStop() 67 | 68 | batch_idx += 1 69 | 70 | print("Done!") 71 | 72 | if __name__ == "__main__": 73 | conv_loop(32, 0, False, 'nsys') -------------------------------------------------------------------------------- /profiling/benchmarks/conv_bnorm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.conv = torch.nn.Conv2d(3,64,kernel_size=7, stride=2, padding=3, bias=False) 24 | self.bn = torch.nn.BatchNorm2d(64) 25 | 26 | def forward(self, x): 27 | for i in range(25): 28 | y = self.conv(x) 29 | z = self.bn(y) 30 | 31 | 32 | def conv_bnorm_loop(batchsize, local_rank, do_eval=True, profile=None): 33 | 34 | print("-------------- thread id: ", threading.get_native_id()) 35 | 36 | data = torch.rand([batchsize, 3, 224, 224]).to(local_rank).contiguous() 37 | model = Model() 38 | model = model.to(0) 39 | 40 | if do_eval: 41 | model.eval() 42 | else: 43 | model.train() 44 | 45 | print("Enter loop!") 46 | 47 | batch_idx = 0 48 | torch.cuda.synchronize() 49 | 50 | while batch_idx < 10: 51 | 52 | if batch_idx == 9: 53 | if profile == 'ncu': 54 | torch.cuda.nvtx.range_push("start") 55 | elif profile == 'nsys': 56 | torch.cuda.profiler.cudart().cudaProfilerStart() 57 | 58 | if do_eval: 59 | with torch.no_grad(): 60 | output = model(data) 61 | else: 62 | output = model(data) 63 | 64 | if batch_idx == 9: 65 | if profile == 'ncu': 66 | torch.cuda.nvtx.range_pop() 67 | elif profile == 'nsys': 68 | torch.cuda.profiler.cudart().cudaProfilerStop() 69 | 70 | batch_idx += 1 71 | 72 | print("Done!") 73 | 74 | if __name__ == "__main__": 75 | conv_bnorm_loop(32, 0, False, 'nsys') -------------------------------------------------------------------------------- /profiling/benchmarks/gnmt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | import sys 5 | import os 6 | sys.path.insert(0, f"{os.path.expanduser( '~' )}/DeepLearningExamples/PyTorch/Translation/GNMT") 7 | 8 | 9 | from seq2seq.models.gnmt import GNMT 10 | 11 | def gnmt(batchsize, local_rank, do_eval=True, profile=None): 12 | 13 | model_config = { 14 | 15 | "hidden_size": 1024, 16 | "vocab_size": 32320, 17 | "num_layers": 4, 18 | "dropout": 0.2, 19 | "batch_first": False, 20 | "share_embedding": True 21 | } 22 | 23 | input0 = torch.ones([50, batchsize]).to(torch.int64).to(0) 24 | input1 = torch.ones([batchsize]).to(torch.int64).to(0) 25 | input2 = torch.ones([50, batchsize]).to(torch.int64).to(0) 26 | labels = input2 27 | 28 | model = GNMT(**model_config).to(local_rank) 29 | 30 | if do_eval: 31 | model.eval() 32 | else: 33 | model.train() 34 | #criterion = LabelSmoothing(0.1, 0).to(local_rank) 35 | #optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
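# Shape note: GNMT is constructed with batch_first=False, so the dummy inputs
# above are time-major. input0 and input2 are (seq_len=50, batch) int64 token
# tensors and input1 carries one source length per sentence, matching
# GNMT.forward(input_encoder, input_enc_len, input_decoder). An equivalent way
# to build them (values assumed; any int64 tensors of these shapes work):
#
#   src = torch.full((50, batchsize), 4, dtype=torch.int64, device=0)
#   src_len = torch.full((batchsize,), 50, dtype=torch.int64, device=0)
#   out = model(src, src_len, input2)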
36 | 37 | batch_idx = 0 38 | torch.cuda.synchronize() 39 | 40 | while batch_idx < 10: 41 | 42 | #if not do_eval: 43 | # optimizer.zero_grad() 44 | 45 | if batch_idx == 0: 46 | if profile == 'ncu': 47 | torch.cuda.nvtx.range_push("start") 48 | elif profile == 'nsys': 49 | torch.cuda.profiler.cudart().cudaProfilerStart() 50 | 51 | if do_eval: 52 | with torch.no_grad(): 53 | output = model(input0, input1, input2) 54 | else: 55 | output = model(input0, input1, input2) 56 | #T, B = output.size(0), output.size(1) 57 | #loss = criterion(output.view(T * B, -1), labels.contiguous().view(-1)) 58 | #loss.backward() 59 | #optimizer.step() 60 | 61 | 62 | if batch_idx == 9: 63 | if profile == 'ncu': 64 | torch.cuda.nvtx.range_pop() 65 | elif profile == 'nsys': 66 | torch.cuda.profiler.cudart().cudaProfilerStop() 67 | 68 | batch_idx += 1 69 | 70 | print("Done!") 71 | 72 | if __name__ == "__main__": 73 | gnmt(128, 0, False, 'nsys') 74 | -------------------------------------------------------------------------------- /profiling/benchmarks/retinanet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | import sys 5 | import os 6 | sys.path.append(f"{os.path.expanduser( '~' )}/mlcommons/single_stage_detector/ssd") 7 | from model.retinanet import retinanet_from_backbone 8 | 9 | def retinanet(batchsize, local_rank, do_eval=True, profile=None): 10 | 11 | model = retinanet_from_backbone( 12 | backbone="resnext50_32x4d", 13 | num_classes=264, 14 | image_size=[800, 800], 15 | data_layout='channels_last', 16 | pretrained=False, 17 | trainable_backbone_layers=3).cuda() 18 | images = [torch.ones((3,768,1024)).to(torch.float32).cuda() for _ in range(batchsize)] 19 | # just a dummy example 20 | targets = [ 21 | { 22 | 'boxes': torch.tensor([[ 3.8400, 42.2873, 597.1200, 660.5751], 23 | [ 367.3600, 2.5626, 1008.6400, 682.3594]]).cuda(), 24 | 'labels': torch.tensor([148, 257]).cuda(), 25 | 'image_id': torch.tensor([299630]).cuda(), 26 | 'area': torch.tensor([366817.7812, 435940.0625]).cuda(), 27 | 'iscrowd': torch.tensor([0, 0]).cuda(), 28 | } 29 | for _ in range(batchsize) 30 | ] 31 | 32 | if do_eval: 33 | model.eval() 34 | else: 35 | model.train() 36 | params = [p for p in model.parameters() if p.requires_grad] 37 | optimizer = torch.optim.Adam(params, lr=0.1) 38 | 39 | batch_idx = 0 40 | 41 | while batch_idx < 10: 42 | 43 | print(f"run {batch_idx}") 44 | 45 | if batch_idx == 9: 46 | if profile == 'ncu': 47 | torch.cuda.nvtx.range_push("start") 48 | elif profile == 'nsys': 49 | torch.cuda.profiler.cudart().cudaProfilerStart() 50 | 51 | if do_eval: 52 | with torch.no_grad(): 53 | output = model(images) 54 | else: 55 | optimizer.zero_grad() 56 | loss_dict = model(images, targets) 57 | losses = sum(loss for loss in loss_dict.values()) 58 | losses.backward() 59 | optimizer.step() 60 | 61 | if batch_idx == 9: 62 | if profile == 'ncu': 63 | torch.cuda.nvtx.range_pop() 64 | elif profile == 'nsys': 65 | torch.cuda.profiler.cudart().cudaProfilerStop() 66 | 67 | batch_idx += 1 68 | 69 | print("Done!") 70 | 71 | if __name__ == "__main__": 72 | retinanet(4, 0, True, 'nsys') 73 | -------------------------------------------------------------------------------- /profiling/benchmarks/transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | import sys 5 | import os 6 | sys.path.append(f"{os.path.expanduser( '~' 
)}/DeepLearningExamples/PyTorch/LanguageModeling/Transformer-XL/pytorch") 7 | 8 | from mem_transformer import MemTransformerLM 9 | import lamb 10 | 11 | def transformer(batchsize, local_rank, do_eval=True, profile=None): 12 | 13 | model_config = { 14 | 'n_token': 267735, 15 | 'n_layer': 16, 16 | 'n_head': 8, 17 | 'd_model': 512, 18 | 'd_head': 64, 19 | 'd_inner': 2048, 20 | 'dropout': 0.1, 21 | 'dropatt': 0.0, 22 | 'dtype': None, 23 | 'tie_weight': True, 24 | 'd_embed': 512, 25 | 'div_val': 1, 26 | 'tie_projs': [False, True, True, True], 27 | 'pre_lnorm': False, 28 | 'tgt_len': 192, 29 | 'ext_len': 0, 30 | 'mem_len': 192, 31 | 'cutoffs': [19997, 39997, 199997], 32 | 'same_length': False, 33 | 'attn_type': 0, 34 | 'clamp_len': -1, 35 | 'sample_softmax': -1 36 | } 37 | 38 | data = torch.ones((192, batchsize)).to(torch.int64).cuda() 39 | target = torch.ones((192, batchsize)).to(torch.int64).cuda() 40 | 41 | model = MemTransformerLM(**model_config).to(0) 42 | 43 | if do_eval: 44 | model.eval() 45 | else: 46 | model.train() 47 | optimizer = lamb.Lamb(model.parameters(), lr=0.1) 48 | 49 | torch.cuda.synchronize() 50 | batch_idx = 0 51 | mems = None 52 | 53 | while batch_idx < 10: 54 | 55 | start_iter = time.time() 56 | if batch_idx == 0: 57 | if profile == 'ncu': 58 | torch.cuda.nvtx.range_push("start") 59 | elif profile == 'nsys': 60 | torch.cuda.profiler.cudart().cudaProfilerStart() 61 | 62 | if do_eval: 63 | with torch.no_grad(): 64 | output = model(data, target, mems) 65 | else: 66 | optimizer.zero_grad() 67 | loss, mems = model(data, target, mems) 68 | loss = loss.float().mean().type_as(loss) 69 | loss.backward() 70 | optimizer.step() 71 | 72 | if batch_idx == 9: 73 | if profile == 'ncu': 74 | torch.cuda.nvtx.range_pop() 75 | elif profile == 'nsys': 76 | torch.cuda.profiler.cudart().cudaProfilerStop() 77 | 78 | batch_idx += 1 79 | print(f"It took {time.time()-start_iter} sec") 80 | 81 | print("Done!") 82 | 83 | if __name__ == "__main__": 84 | transformer(4, 0, True, 'ncu') 85 | -------------------------------------------------------------------------------- /profiling/benchmarks/vision_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | import torchvision 9 | from torchvision import models, datasets, transforms 10 | from torch.nn.parallel import DistributedDataParallel as DDP 11 | import torch.nn.functional as F 12 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 13 | from datetime import timedelta 14 | import random 15 | import numpy as np 16 | import time 17 | import os 18 | import argparse 19 | import threading 20 | 21 | print(torchvision.__file__) 22 | 23 | 24 | def vision(model_name, batchsize, local_rank, do_eval=True, profile=None): 25 | 26 | data = torch.ones([batchsize, 3, 224, 224], pin_memory=True).to(local_rank) 27 | target = torch.ones([batchsize], pin_memory=True).to(torch.long).to(local_rank) 28 | #data = torch.rand([batchsize, 2048]).to(local_rank) 29 | model = models.__dict__[model_name](num_classes=1000) 30 | model = model.to(local_rank) 31 | 32 | ''' 33 | train_dir = "/mnt/data/home/fot/imagenet/imagenet-raw-euwest4/" 34 | train_transform = transforms.Compose([ 35 | transforms.RandomResizedCrop(224), 36 | transforms.RandomHorizontalFlip(), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406),(0.229, 
0.224, 0.225))]) 39 | train_dataset = \ 40 | datasets.ImageFolder(train_dir,transform=train_transform) 41 | 42 | train_loader = torch.utils.data.DataLoader( 43 | train_dataset, batch_size=batchsize, num_workers=8) 44 | 45 | train_iter = enumerate(train_loader) 46 | ''' 47 | 48 | if do_eval: 49 | model.eval() 50 | else: 51 | model.train() 52 | optimizer = torch.optim.SGD(model.parameters(), lr=0.1) 53 | criterion = torch.nn.CrossEntropyLoss().to(local_rank) 54 | 55 | batch_idx = 0 56 | torch.cuda.synchronize() 57 | start_all = time.time() 58 | 59 | 60 | for batch_idx in range(1000): #batch in train_iter: 61 | 62 | #data, target = batch[0].to(local_rank), batch[1].to(local_rank) 63 | start = time.time() 64 | if batch_idx == 9: 65 | if profile == 'ncu': 66 | torch.cuda.nvtx.range_push("start") 67 | elif profile == 'nsys': 68 | torch.cuda.profiler.cudart().cudaProfilerStart() 69 | if do_eval: 70 | with torch.no_grad(): 71 | output = model(data) 72 | else: 73 | optimizer.zero_grad() 74 | output = model(data) 75 | loss = criterion(output, target) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | torch.cuda.synchronize() 80 | if batch_idx == 9: 81 | if profile == 'ncu': 82 | torch.cuda.nvtx.range_pop() 83 | elif profile == 'nsys': 84 | torch.cuda.profiler.cudart().cudaProfilerStop() 85 | #batch_idx += 1 86 | 87 | print(f"Iteration took {time.time()-start} sec") 88 | 89 | print(f"Done! It took {time.time()-start_all} sec") 90 | 91 | if __name__ == "__main__": 92 | vision('mobilenet_v2', 4, 0, True, 'ncu') 93 | -------------------------------------------------------------------------------- /profiling/postprocessing/get_num_blocks.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from math import ceil, floor 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--results_dir', type=str, required=True, 7 | help='path to directory containing the profiling files') 8 | parser.add_argument('--max_threads_sm', type=int, default=2048, 9 | help='maximum number of threads that can be active in an SM') 10 | parser.add_argument('--max_blocks_sm', type=int, default=80, 11 | help='maximum number of blocks that can be active in an SM') 12 | parser.add_argument('--max_shmem_sm', type=int, default=65536, 13 | help='maximum amount of shared memory (in bytes) per SM') 14 | parser.add_argument('--max_regs_sm', type=int, default=65536, 15 | help='maximum number of registers per SM') 16 | args = parser.parse_args() 17 | 18 | df = pd.read_csv(f'{args.results_dir}/output_ncu_processed.csv', index_col=0) 19 | 20 | max_threads_sm = args.max_threads_sm 21 | max_blocks_sm = args.max_blocks_sm 22 | max_shmem_sm = args.max_shmem_sm 23 | max_regs_sm = args.max_regs_sm 24 | 25 | sm_needed = [] 26 | 27 | for index, row in df.iterrows(): 28 | num_blocks = row['Grid'] 29 | num_threads = row['Number_of_threads'] 30 | threads_per_block = row['Block'] 31 | shmem_per_block = row['Static_shmem_per_block'] 32 | regs_per_thread = row['Registers_Per_Thread'] 33 | 34 | # from threads (floor: a partially resident block is not possible) 35 | blocks_per_sm_threads = floor(max_threads_sm/threads_per_block) 36 | 37 | # from shmem 38 | if shmem_per_block > 0: 39 | blocks_per_sm_shmem = floor(max_shmem_sm/shmem_per_block) 40 | else: 41 | blocks_per_sm_shmem = blocks_per_sm_threads 42 | 43 | # from registers (allocated per warp, rounded up to 256-register granularity) 44 | regs_per_warp = ceil(32*regs_per_thread/256) * 256 45 | warps_per_sm = floor((max_regs_sm/regs_per_warp)/4) * 4 46 | warps_per_block = ceil(threads_per_block/32) 47 | blocks_per_sm_regs = int(warps_per_sm/warps_per_block)
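    # Worked example for the three per-resource limits above (kernel values
    # assumed: 256 threads/block, no static shared memory, 32 registers/thread,
    # grid of 1000 blocks, default limits from the command-line arguments):
    #   threads:   floor(2048 / 256) = 8 blocks/SM
    #   shmem:     no static shared memory, falls back to the thread limit = 8
    #   registers: regs_per_warp = ceil(32 * 32 / 256) * 256 = 1024
    #              warps_per_sm = floor((65536 / 1024) / 4) * 4 = 64
    #              warps_per_block = ceil(256 / 32) = 8, so int(64 / 8) = 8
    # giving blocks_per_sm = min(8, 8, 8) = 8 and
    # sm_needed_kernel = ceil(1000 / 8) = 125 SMs' worth of work for this kernel.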
48 | 49 | blocks_per_sm = min(blocks_per_sm_threads, blocks_per_sm_shmem, blocks_per_sm_regs) 50 | sm_needed_kernel = ceil(num_blocks/blocks_per_sm) 51 | 52 | #print(blocks_per_sm, sm_needed_kernel) 53 | sm_needed.append(sm_needed_kernel) 54 | 55 | 56 | less = [x for x in sm_needed if x < 108] 57 | print(len(less), len(sm_needed)) 58 | 59 | df['SM_needed'] = sm_needed 60 | #print(df) 61 | df.to_csv(f'{args.results_dir}/output_ncu_sms.csv', index=0) 62 | -------------------------------------------------------------------------------- /profiling/postprocessing/process_ncu.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--results_dir', type=str, required=True, 6 | help='path to directory containing the profiling files') 7 | args = parser.parse_args() 8 | 9 | df = pd.read_csv(f'{args.results_dir}/output_ncu.csv', index_col=0) 10 | kernels = [] 11 | metrics_to_get = ['Duration', 'Block Size', 'Grid Size', 'Compute (SM) [%]', 'DRAM Throughput', 'Registers Per Thread', 'Static Shared Memory Per Block'] 12 | 13 | unique_kernel_names = set() 14 | 15 | for index, row in df.iterrows(): 16 | kernel = row['Kernel Name'] 17 | metric_name = row['Metric Name'] 18 | 19 | if metric_name == 'DRAM Frequency': 20 | kernels.append([kernel]) 21 | unique_kernel_names.add(kernel) 22 | elif metric_name in metrics_to_get: 23 | kernels[-1].append(row['Metric Value']) 24 | 25 | for x in unique_kernel_names: 26 | print(x) 27 | print("------------------------------------") 28 | 29 | 30 | for kernel in kernels: 31 | num_threads = int(kernel[-2]) * int(kernel[-3]) 32 | num_registers = num_threads * int(kernel[-1]) 33 | kernel += [num_threads, num_registers] 34 | 35 | 36 | print(len(kernels)) 37 | #print(kernels[0]) 38 | labels = ['Kernel_Name', 'DRAM_Throughput(%)', 'Duration(ns)', 'Compute(SM)(%)', 'Block', 'Grid', 'Registers_Per_Thread', 'Static_shmem_per_block', 'Number_of_threads', 'Number_of_registers'] 39 | 40 | 41 | 42 | df_new = pd.DataFrame(kernels, columns=labels) 43 | print(df_new) 44 | df_new.to_csv(f'{args.results_dir}/output_ncu_processed.csv') 45 | -------------------------------------------------------------------------------- /profiling/postprocessing/roofline_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--results_dir', type=str, required=True, 6 | help='path to directory containing the profiling files') 7 | parser.add_argument('--ai_threshold', type=float, default=9.72, 8 | help='arithmetic intensity that separates compute from memory bound kernels') 9 | args = parser.parse_args() 10 | 11 | df_raw = pd.read_csv(f'{args.results_dir}/raw_ncu.csv') 12 | 13 | startp = 0 14 | df_raw = df_raw.iloc[startp:] 15 | 16 | l = list(df_raw.iloc[0]) 17 | print(l) 18 | df_basic = pd.read_csv(f'{args.results_dir}/output_ncu_sms.csv', index_col=0) 19 | 20 | 21 | dram_throughput = df_basic['DRAM_Throughput(%)'] 22 | comp_throughput = df_basic['Compute(SM)(%)'] 23 | 24 | fadd = 'smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.per_cycle_elapsed [inst/cycle]' 25 | fmul = 'smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.per_cycle_elapsed [inst/cycle]' 26 | ffma = 'smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed [inst/cycle]' 27 | cycles_sec = 'smsp__cycles_elapsed.avg.per_second [cycle/nsecond]' 28 | bytes_sec = 'dram__bytes.sum.per_second [Gbyte/second]'
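# How the classification below works: arithmetic intensity is estimated from
# the per-cycle FP instruction counters as
#     AI = flops_per_sec / dram_bytes_per_sec
#        = (fadd + fmul + 2 * ffma) * cycles_per_sec / bytes_per_sec
# and a kernel is labeled compute-bound when AI exceeds --ai_threshold, the
# ridge point of the roofline (the 9.72 default presumably encodes the
# peak-FLOPs to peak-DRAM-bandwidth ratio of the GPU that was profiled).
# Worked example with assumed counter values: fadd = 1, fmul = 1, ffma = 4
# inst/cycle at 1.4 cycle/ns and 500 Gbyte/s of DRAM traffic gives
#     flops/cycle = 1 + 1 + 2 * 4 = 10
#     AI = 10 * 1.4 / 500 = 0.028 flops/byte, i.e. memory-bound (0.028 < 9.72).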
29 | 30 | ai_list = [] 31 | roofline_prof = [] # 1: comp, 0: mem, -1: invalid 32 | 33 | comp_bound = 0 34 | mem_bound = 0 35 | rest = 0 36 | 37 | for index, row in df_raw.iterrows(): 38 | add = str(row[fadd]) 39 | mul = str(row[fmul]) 40 | fma = row[ffma] 41 | cycles = row[cycles_sec] 42 | bytes = row[bytes_sec] 43 | #print(add, mul, fma, cycles, bytes) 44 | 45 | if not isinstance(fma, float): 46 | fma = float(fma.replace("'", '')) 47 | add = float(add.replace("'", '')) 48 | mul = float(mul.replace("'", '')) 49 | 50 | 51 | if add or mul or fma: 52 | flops_cycle = add+mul+fma*2 53 | flops_sec = flops_cycle * cycles 54 | ai = flops_sec/bytes 55 | ai_list.append(ai) 56 | print(index, ai) 57 | if ai > args.ai_threshold: 58 | roofline_prof.append(1) 59 | comp_bound += 1 60 | else: 61 | roofline_prof.append(0) 62 | mem_bound += 1 63 | else: 64 | ai_list.append(0.0) 65 | if comp_throughput[index-startp] >= 60.0: 66 | roofline_prof.append(1) 67 | elif dram_throughput[index-startp] >= 60.0: 68 | roofline_prof.append(0) 69 | else: 70 | roofline_prof.append(-1) 71 | rest += 1 72 | 73 | 74 | print(df_basic) 75 | df_basic['AI(flops/bytes)'] = ai_list 76 | df_basic['Roofline_prof'] = roofline_prof 77 | df_basic.to_csv(f'{args.results_dir}/output_ncu_sms_roofline.csv') 78 | 79 | print(f"comp bound: {comp_bound}, mem bound: {mem_bound}, rest: {rest}, total: {comp_bound+mem_bound+rest}") 80 | -------------------------------------------------------------------------------- /related/Tick-Tock/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "duration0": 96.88879942893982, 3 | "duration": 96.88879942893982 4 | } -------------------------------------------------------------------------------- /related/baselines/README.md: -------------------------------------------------------------------------------- 1 | # GPU Sharing Baselines 2 | This directory contains evaluations of GPU sharing techniques between two workloads. 3 | Supported baselines are `MPS`, `TickTock`, `Streams`, `Isolated`, and `Sequential`. 4 | 5 | [main.py](./main.py) is the entry point of the evaluation, and all configurations live in [config.yaml](./config.yaml). 6 | 7 | To evaluate a baseline, change the `policy` field in `config.yaml` to the baseline name. 8 | Then, run `python main.py --config config.yaml`. 9 | 10 | If no `--config` argument is provided, [config.yaml](./config.yaml) is used by default. 11 | 12 | 13 | ## Supported Baselines 14 | ### MPS 15 | [Multi-Process Service (MPS)](https://docs.nvidia.com/deploy/mps/index.html) is a feature of NVIDIA GPUs that allows multiple processes to share a single GPU. 16 | 17 | **Caveat!** There are a few extra steps to complete before executing the Python program: 18 | 1. Execute `./start_MPS_control_daemon.sh` to start the MPS server. 19 | 2. Export these two environment variables: 20 | ```shell 21 | export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps 22 | export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log 23 | ``` 24 | 3. Within the same shell session where you exported the environment variables, execute the Python program normally. 25 | 26 | ### TICK-TOCK scheduling 27 | 28 | This directory contains a basic implementation of TICK-TOCK scheduling using Python threads together with torch.cuda streams and events; the alternation idea is sketched below.
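A minimal sketch of the alternation idea, with two threads handing the GPU back and forth via `threading.Event`s (hypothetical code, for illustration only; see [utils/sync_control.py](./utils/sync_control.py) for the synchronization primitives the trainers actually use):

```python
import threading
import torch

def run_job(fwd, bwd, my_turn: threading.Event, peer_turn: threading.Event, iters: int):
    # One of two threads sharing the GPU tick-tock style: run our forward pass
    # ("tick"), hand over to the peer, then run our backward pass ("tock")
    # while the peer's forward pass executes on its own CUDA stream.
    stream = torch.cuda.Stream()
    for _ in range(iters):
        my_turn.wait()
        my_turn.clear()
        with torch.cuda.stream(stream):
            loss = fwd()
        stream.synchronize()
        peer_turn.set()  # the peer's "tick" now overlaps our "tock" below
        with torch.cuda.stream(stream):
            bwd(loss)

# Setting turn0 before starting both threads makes job 0 tick first:
#   turn0, turn1 = threading.Event(), threading.Event(); turn0.set()
#   threading.Thread(target=run_job, args=(fwd0, bwd0, turn0, turn1, 100)).start()
#   threading.Thread(target=run_job, args=(fwd1, bwd1, turn1, turn0, 100)).start()
```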
29 | The implementation is based on the description provided in [WAVELET: EFFICIENT DNN TRAINING WITH TICK-TOCK SCHEDULING (MLSys'21)](https://proceedings.mlsys.org/paper/2021/file/c81e728d9d4c2f636f067f89cc14862c-Paper.pdf). 30 | 31 | An interesting next step would be to implement the memory management support described in [Zico: Efficient GPU Memory Sharing for 32 | Concurrent DNN Training (ATC'21)](https://www.usenix.org/system/files/atc21-lim.pdf). 33 | 34 | ### Streams 35 | GPU streams provide a way to execute workloads concurrently on a single GPU. 36 | One stream captures a linear sequence of operations to be executed, and multiple streams can execute concurrently. 37 | 38 | ### Sequential 39 | `Sequential` represents the temporal sharing baseline where the GPU is time-sliced between the two workloads. 40 | 41 | ### Isolated 42 | To analyze the overhead of GPU sharing, we compare the performance of GPU sharing with the performance of executing 43 | the workload on a single GPU without sharing. For `Isolated`, we first execute workload A and start workload B only after A has finished. 44 | -------------------------------------------------------------------------------- /related/baselines/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/bert/__init__.py -------------------------------------------------------------------------------- /related/baselines/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | policy: "MPS" # "MPS", "TickTock", "Streams", "Isolated", or "Sequential" 3 | models: 4 | model0: 5 | mode: eval # train or eval 6 | name: resnet50 # these two names should strictly correspond to the model names below 7 | model1: 8 | mode: train # train or eval 9 | name: mobilenet_v2 10 | shared_config: 11 | distribution: poisson # poisson, uniform, or trace 12 | trace_path: './inter_arrival_times.json' # only used when distribution is trace 13 | pin_memory: true 14 | seed: 42 15 | 16 | # configuration for each model 17 | resnet50: 18 | arch: resnet50 19 | batch_size: 4 20 | num_iterations: 100 21 | request_rate: 15 # measured in 1/seconds. If 0 it means no sleep 22 | resnet101: 23 | arch: resnet101 24 | batch_size: 32 25 | num_iterations: 500 26 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 27 | mobilenet_v2: 28 | arch: mobilenet_v2 29 | batch_size: 64 30 | num_iterations: 10000 31 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 32 | bert: 33 | batch_size: 8 34 | arch: base # either base or large 35 | num_iterations: 500 36 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 37 | # large_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16' 38 | # base_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-12_H-768_A-12' 39 | transformer: 40 | arch: base # either base or large 41 | batch_size: 8 42 | num_iterations: 500 43 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 44 | 45 | 46 | 47 | 48 | resnet50-1: 49 | arch: resnet50 50 | batch_size: 32 51 | num_iterations: 1000 52 | request_rate: 80 # measured in 1/seconds. If 0 it means no sleep 53 | resnet101-1: 54 | arch: resnet101 55 | batch_size: 32 56 | num_iterations: 1000 57 | request_rate: 40 # measured in 1/seconds. 
If 0 it means no sleep 58 | mobilenet_v2-1: 59 | arch: mobilenet_v2 60 | batch_size: 64 61 | num_iterations: 1000 62 | request_rate: 100 # measured in 1/seconds. If 0 it means no sleep 63 | bert-1: 64 | batch_size: 8 65 | arch: base # either base or large 66 | num_iterations: 1000 67 | request_rate: 8 # measured in 1/seconds. If 0 it means no sleep 68 | # large_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16' 69 | # base_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-12_H-768_A-12' 70 | transformer-1: 71 | arch: base # either base or large 72 | batch_size: 8 73 | num_iterations: 1000 74 | request_rate: 20 # measured in 1/seconds. If 0 it means no sleep 75 | -------------------------------------------------------------------------------- /related/baselines/dcgan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/dcgan/__init__.py -------------------------------------------------------------------------------- /related/baselines/dcgan/dcgan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | # code from https://github.com/pytorch/examples/blob/main/dcgan/main.py 5 | 6 | def weights_init(m): 7 | classname = m.__class__.__name__ 8 | if classname.find('Conv') != -1: 9 | torch.nn.init.normal_(m.weight, 0.0, 0.02) 10 | elif classname.find('BatchNorm') != -1: 11 | torch.nn.init.normal_(m.weight, 1.0, 0.02) 12 | torch.nn.init.zeros_(m.bias) 13 | 14 | # only training on one gpu 15 | # ngf = number of filters in the generator 16 | # nz = size of the latent z vector 17 | # nc = number of channels 18 | class Generator(nn.Module): 19 | def __init__(self, ngf: int, nc: int, nz: int): 20 | super(Generator, self).__init__() 21 | self.main = nn.Sequential( 22 | # input is Z, going into a convolution 23 | nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False), 24 | nn.BatchNorm2d(ngf * 8), 25 | nn.ReLU(True), 26 | # state size. (ngf*8) x 4 x 4 27 | nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False), 28 | nn.BatchNorm2d(ngf * 4), 29 | nn.ReLU(True), 30 | # state size. (ngf*4) x 8 x 8 31 | nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False), 32 | nn.BatchNorm2d(ngf * 2), 33 | nn.ReLU(True), 34 | # state size. (ngf*2) x 16 x 16 35 | nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False), 36 | nn.BatchNorm2d(ngf), 37 | nn.ReLU(True), 38 | # state size. (ngf) x 32 x 32 39 | nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False), 40 | nn.Tanh() 41 | # state size. (nc) x 64 x 64 42 | ) 43 | 44 | def forward(self, input): 45 | output = self.main(input) 46 | return output 47 | 48 | # ndf = number of filters in the discriminator 49 | # nc = number of channels 50 | class Discriminator(nn.Module): 51 | def __init__(self, ndf, nc): 52 | super(Discriminator, self).__init__() 53 | self.main = nn.Sequential( 54 | # input is (nc) x 64 x 64 55 | nn.Conv2d(nc, ndf, 4, 2, 1, bias=False), 56 | nn.LeakyReLU(0.2, inplace=True), 57 | # state size. (ndf) x 32 x 32 58 | nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False), 59 | nn.BatchNorm2d(ndf * 2), 60 | nn.LeakyReLU(0.2, inplace=True), 61 | # state size. (ndf*2) x 16 x 16 62 | nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False), 63 | nn.BatchNorm2d(ndf * 4), 64 | nn.LeakyReLU(0.2, inplace=True), 65 | # state size. 
(ndf*4) x 8 x 8 66 | nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False), 67 | nn.BatchNorm2d(ndf * 8), 68 | nn.LeakyReLU(0.2, inplace=True), 69 | # state size. (ndf*8) x 4 x 4 70 | nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False), 71 | nn.Sigmoid() 72 | ) 73 | 74 | def forward(self, input): 75 | output = self.main(input) 76 | return output.view(-1, 1).squeeze(1) 77 | 78 | -------------------------------------------------------------------------------- /related/baselines/gnmt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/gnmt/__init__.py -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/data/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Elad Hoffer 2 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | # SOFTWARE. 
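# The token strings and index constants below are kept in sync by convention:
# a vocabulary reserves its first four slots for the special tokens. Layout
# sketch (corpus words are made up):
#
#   vocab = [PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN, 'hello', 'world']
#   assert vocab[PAD] == PAD_TOKEN and vocab[EOS] == EOS_TOKEN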
21 | 22 | PAD_TOKEN = '<pad>' 23 | UNK_TOKEN = '<unk>' 24 | BOS_TOKEN = '<s>' 25 | EOS_TOKEN = '<\s>' 26 | 27 | # special PAD, UNKNOWN, BEGIN-OF-STRING, END-OF-STRING tokens 28 | PAD, UNK, BOS, EOS = [0, 1, 2, 3] 29 | 30 | # path to the moses detokenizer, relative to the data directory 31 | DETOKENIZER = 'mosesdecoder/scripts/tokenizer/detokenizer.perl' 32 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/gpu_affinity.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import math 3 | import os 4 | import pathlib 5 | import re 6 | 7 | import pynvml 8 | 9 | pynvml.nvmlInit() 10 | 11 | 12 | def systemGetDriverVersion(): 13 | return pynvml.nvmlSystemGetDriverVersion() 14 | 15 | 16 | def deviceGetCount(): 17 | return pynvml.nvmlDeviceGetCount() 18 | 19 | 20 | class device: 21 | # assume nvml returns list of 64 bit ints 22 | _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) 23 | 24 | def __init__(self, device_idx): 25 | super().__init__() 26 | self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) 27 | 28 | def getName(self): 29 | return pynvml.nvmlDeviceGetName(self.handle) 30 | 31 | def getCpuAffinity(self): 32 | affinity_string = '' 33 | for j in pynvml.nvmlDeviceGetCpuAffinity( 34 | self.handle, device._nvml_affinity_elements 35 | ): 36 | # assume nvml returns list of 64 bit ints 37 | affinity_string = '{:064b}'.format(j) + affinity_string 38 | affinity_list = [int(x) for x in affinity_string] 39 | affinity_list.reverse() # so core 0 is in 0th element of list 40 | 41 | ret = [i for i, e in enumerate(affinity_list) if e != 0] 42 | return ret 43 | 44 | 45 | def set_socket_affinity(gpu_id): 46 | dev = device(gpu_id) 47 | affinity = dev.getCpuAffinity() 48 | os.sched_setaffinity(0, affinity) 49 | 50 | 51 | def set_single_affinity(gpu_id): 52 | dev = device(gpu_id) 53 | affinity = dev.getCpuAffinity() 54 | os.sched_setaffinity(0, affinity[:1]) 55 | 56 | 57 | def set_single_unique_affinity(gpu_id, nproc_per_node): 58 | devices = [device(i) for i in range(nproc_per_node)] 59 | socket_affinities = [dev.getCpuAffinity() for dev in devices] 60 | 61 | siblings_list = get_thread_siblings_list() 62 | siblings_dict = dict(siblings_list) 63 | 64 | # remove siblings 65 | for idx, socket_affinity in enumerate(socket_affinities): 66 | socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) 67 | 68 | affinities = [] 69 | assigned = [] 70 | 71 | for socket_affinity in socket_affinities: 72 | for core in socket_affinity: 73 | if core not in assigned: 74 | affinities.append([core]) 75 | assigned.append(core) 76 | break 77 | os.sched_setaffinity(0, affinities[gpu_id]) 78 | 79 | 80 | def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): 81 | device_ids = [device(i) for i in range(nproc_per_node)] 82 | socket_affinities = [dev.getCpuAffinity() for dev in device_ids] 83 | 84 | siblings_list = get_thread_siblings_list() 85 | siblings_dict = dict(siblings_list) 86 | 87 | # remove siblings 88 | for idx, socket_affinity in enumerate(socket_affinities): 89 | socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) 90 | 91 | socket_affinities_to_device_ids = collections.defaultdict(list) 92 | 93 | for idx, socket_affinity in enumerate(socket_affinities): 94 | socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) 95 | 96 | for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): 97 | devices_per_group = 
len(device_ids) 98 | cores_per_device = len(socket_affinity) // devices_per_group 99 | for group_id, device_id in enumerate(device_ids): 100 | if device_id == gpu_id: 101 | if mode == 'interleaved': 102 | affinity = list(socket_affinity[group_id::devices_per_group]) 103 | elif mode == 'continuous': 104 | affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) 105 | else: 106 | raise RuntimeError('Unknown set_socket_unique_affinity mode') 107 | 108 | # reintroduce siblings 109 | affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] 110 | os.sched_setaffinity(0, affinity) 111 | 112 | 113 | def get_thread_siblings_list(): 114 | path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' 115 | thread_siblings_list = [] 116 | pattern = re.compile(r'(\d+)\D(\d+)') 117 | for fname in pathlib.Path(path[0]).glob(path[1:]): 118 | with open(fname) as f: 119 | content = f.read().strip() 120 | res = pattern.findall(content) 121 | if res: 122 | pair = tuple(map(int, res[0])) 123 | thread_siblings_list.append(pair) 124 | return thread_siblings_list 125 | 126 | 127 | def set_affinity(gpu_id, nproc_per_node, mode='socket'): 128 | if mode == 'socket': 129 | set_socket_affinity(gpu_id) 130 | elif mode == 'single': 131 | set_single_affinity(gpu_id) 132 | elif mode == 'single_unique': 133 | set_single_unique_affinity(gpu_id, nproc_per_node) 134 | elif mode == 'socket_unique_interleaved': 135 | set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') 136 | elif mode == 'socket_unique_continuous': 137 | set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') 138 | else: 139 | raise RuntimeError('Unknown affinity mode') 140 | 141 | affinity = os.sched_getaffinity(0) 142 | return affinity 143 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/inference/tables.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
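# Usage sketch for the table helpers defined below (values are made up; each
# write() call prints a Markdown table through pytablewriter):
#
#   acc = AccuracyTable(unit='BLEU')
#   acc.add((128, 5), {'fp32': 24.3, 'fp16': 24.1})   # key = (batch size, beam size)
#   acc.write('GNMT accuracy', write_math=['fp32', 'fp16'])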
20 | 21 | import collections 22 | import itertools 23 | 24 | import numpy as np 25 | from pytablewriter import MarkdownTableWriter 26 | 27 | 28 | def interleave(*args): 29 | return list(itertools.chain(*zip(*args))) 30 | 31 | 32 | class AccuracyTable: 33 | def __init__(self, unit): 34 | self.data = collections.defaultdict(dict) 35 | self.unit = unit 36 | 37 | def add(self, key, data): 38 | self.data[key].update(data) 39 | 40 | def write(self, title, write_math): 41 | writer = MarkdownTableWriter() 42 | writer.table_name = f'{title}' 43 | main_header = ['**Batch Size**', '**Beam Size**'] 44 | data_header = [] 45 | if 'fp32' in write_math: 46 | data_header += [f'**Accuracy - FP32 ({self.unit})**'] 47 | if 'tf32' in write_math: 48 | data_header += [f'**Accuracy - TF32 ({self.unit})**'] 49 | if 'fp16' in write_math: 50 | data_header += [f'**Accuracy - FP16 ({self.unit})**'] 51 | writer.headers = main_header + data_header 52 | 53 | writer.value_matrix = [] 54 | for k, v in self.data.items(): 55 | batch_size, beam_size = k 56 | row = [batch_size, beam_size] 57 | if 'fp32' in write_math: 58 | row.append(v['fp32']) 59 | if 'tf32' in write_math: 60 | row.append(v['tf32']) 61 | if 'fp16' in write_math: 62 | row.append(v['fp16']) 63 | writer.value_matrix.append(row) 64 | writer.write_table() 65 | 66 | 67 | class PerformanceTable: 68 | def __init__(self, percentiles, unit, reverse_percentiles=False): 69 | self.percentiles = percentiles 70 | self.data = collections.defaultdict(dict) 71 | self.unit = unit 72 | self.reverse_percentiles = reverse_percentiles 73 | 74 | def add(self, key, value): 75 | math, value = next(iter(value.items())) 76 | value = np.array(value) 77 | 78 | if self.reverse_percentiles: 79 | percentiles = [100 - p for p in self.percentiles] 80 | else: 81 | percentiles = self.percentiles 82 | 83 | stats = [] 84 | for p in percentiles: 85 | val = np.percentile(value, p) 86 | stats.append(val * self.unit_convert[self.unit]) 87 | 88 | avg = value.mean() * self.unit_convert[self.unit] 89 | 90 | self.data[key].update({math: (avg, stats)}) 91 | 92 | def write(self, title, math, relative=None, reverse_speedup=False): 93 | writer = MarkdownTableWriter() 94 | writer.table_name = f'{title} - {math.upper()}' 95 | main_header = ['**Batch Size**', '**Beam Size**'] 96 | data_header = [f'**Avg ({self.unit})**'] 97 | data_header += [f'**{p}% ({self.unit})**' for p in self.percentiles] 98 | 99 | if relative: 100 | speedup_header = ['**Speedup**'] * len(data_header) 101 | data_header = interleave(data_header, speedup_header) 102 | 103 | writer.headers = main_header + data_header 104 | 105 | writer.value_matrix = [] 106 | for k, v in self.data.items(): 107 | batch_size, beam_size = k 108 | avg, res_percentiles = v[math] 109 | main = [batch_size, beam_size] 110 | data = [avg, *res_percentiles] 111 | 112 | if relative: 113 | rel = self.data[k][relative] 114 | rel_avg, rel_res_percentiles = rel 115 | rel = [rel_avg, *rel_res_percentiles] 116 | speedup = [d / r for (r, d) in zip(rel, data)] 117 | if reverse_speedup: 118 | speedup = [1 / s for s in speedup] 119 | data = interleave(data, speedup) 120 | 121 | writer.value_matrix.append(main + data) 122 | writer.write_table() 123 | 124 | 125 | class LatencyTable(PerformanceTable): 126 | def __init__(self, percentiles, unit='ms'): 127 | super().__init__(percentiles, unit) 128 | self.unit_convert = {'s': 1, 'ms': 1e3, 'us': 1e6} 129 | 130 | 131 | class ThroughputTable(PerformanceTable): 132 | def __init__(self, percentiles, unit='tok/s', reverse_percentiles=True): 
133 | super().__init__(percentiles, unit, reverse_percentiles) 134 | self.unit_convert = {'tok/s': 1} 135 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/models/encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Elad Hoffer 2 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | # SOFTWARE. 21 | 22 | import torch.nn as nn 23 | from torch.nn.utils.rnn import pack_padded_sequence 24 | from torch.nn.utils.rnn import pad_packed_sequence 25 | 26 | import gnmt.seq2seq.data.config as config 27 | from gnmt.seq2seq.utils import init_lstm_ 28 | 29 | 30 | class ResidualRecurrentEncoder(nn.Module): 31 | """ 32 | Encoder with Embedding, LSTM layers, residual connections and optional 33 | dropout. 34 | 35 | The first LSTM layer is bidirectional and uses variable sequence length 36 | API, the remaining (num_layers-1) layers are unidirectional. Residual 37 | connections are enabled after third LSTM layer, dropout is applied on 38 | inputs to LSTM layers. 39 | """ 40 | def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2, 41 | batch_first=False, embedder=None, init_weight=0.1): 42 | """ 43 | Constructor for the ResidualRecurrentEncoder. 
44 | 45 | :param vocab_size: size of vocabulary 46 | :param hidden_size: hidden size for LSTM layers 47 | :param num_layers: number of LSTM layers, 1st layer is bidirectional 48 | :param dropout: probability of dropout (on input to LSTM layers) 49 | :param batch_first: if True the model uses (batch,seq,feature) tensors, 50 | if false the model uses (seq, batch, feature) 51 | :param embedder: instance of nn.Embedding, if None constructor will 52 | create new embedding layer 53 | :param init_weight: range for the uniform initializer 54 | """ 55 | super(ResidualRecurrentEncoder, self).__init__() 56 | self.batch_first = batch_first 57 | self.rnn_layers = nn.ModuleList() 58 | # 1st LSTM layer, bidirectional 59 | self.rnn_layers.append( 60 | nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True, 61 | batch_first=batch_first, bidirectional=True)) 62 | 63 | # 2nd LSTM layer, with 2x larger input_size 64 | self.rnn_layers.append( 65 | nn.LSTM((2 * hidden_size), hidden_size, num_layers=1, bias=True, 66 | batch_first=batch_first)) 67 | 68 | # Remaining LSTM layers 69 | for _ in range(num_layers - 2): 70 | self.rnn_layers.append( 71 | nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True, 72 | batch_first=batch_first)) 73 | 74 | for lstm in self.rnn_layers: 75 | init_lstm_(lstm, init_weight) 76 | 77 | self.dropout = nn.Dropout(p=dropout) 78 | 79 | if embedder is not None: 80 | self.embedder = embedder 81 | else: 82 | self.embedder = nn.Embedding(vocab_size, hidden_size, 83 | padding_idx=config.PAD) 84 | nn.init.uniform_(self.embedder.weight.data, -init_weight, 85 | init_weight) 86 | 87 | def forward(self, inputs, lengths): 88 | """ 89 | Execute the encoder. 90 | 91 | :param inputs: tensor with indices from the vocabulary 92 | :param lengths: vector with sequence lengths (excluding padding) 93 | 94 | returns: tensor with encoded sequences 95 | """ 96 | x = self.embedder(inputs) 97 | 98 | # bidirectional layer 99 | x = self.dropout(x) 100 | x = pack_padded_sequence(x, lengths.cpu(), 101 | batch_first=self.batch_first) 102 | x, _ = self.rnn_layers[0](x) 103 | x, _ = pad_packed_sequence(x, batch_first=self.batch_first) 104 | 105 | # 1st unidirectional layer 106 | x = self.dropout(x) 107 | x, _ = self.rnn_layers[1](x) 108 | 109 | # the rest of unidirectional layers, 110 | # with residual connections starting from 3rd layer 111 | for i in range(2, len(self.rnn_layers)): 112 | residual = x 113 | x = self.dropout(x) 114 | x, _ = self.rnn_layers[i](x) 115 | x = x + residual 116 | 117 | return x 118 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/models/gnmt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Elad Hoffer 2 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software. 
13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | # SOFTWARE. 21 | 22 | import torch.nn as nn 23 | 24 | import gnmt.seq2seq.data.config as config 25 | from gnmt.seq2seq.models.decoder import ResidualRecurrentDecoder 26 | from gnmt.seq2seq.models.encoder import ResidualRecurrentEncoder 27 | from gnmt.seq2seq.models.seq2seq_base import Seq2Seq 28 | 29 | 30 | class GNMT(Seq2Seq): 31 | """ 32 | GNMT v2 model 33 | """ 34 | def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2, 35 | batch_first=False, share_embedding=True): 36 | """ 37 | Constructor for the GNMT v2 model. 38 | 39 | :param vocab_size: size of vocabulary (number of tokens) 40 | :param hidden_size: internal hidden size of the model 41 | :param num_layers: number of layers, applies to both encoder and 42 | decoder 43 | :param dropout: probability of dropout (in encoder and decoder) 44 | :param batch_first: if True the model uses (batch,seq,feature) tensors, 45 | if false the model uses (seq, batch, feature) 46 | :param share_embedding: if True embeddings are shared between encoder 47 | and decoder 48 | """ 49 | 50 | super(GNMT, self).__init__(batch_first=batch_first) 51 | 52 | if share_embedding: 53 | embedder = nn.Embedding(vocab_size, hidden_size, 54 | padding_idx=config.PAD) 55 | nn.init.uniform_(embedder.weight.data, -0.1, 0.1) 56 | else: 57 | embedder = None 58 | 59 | self.encoder = ResidualRecurrentEncoder(vocab_size, hidden_size, 60 | num_layers, dropout, 61 | batch_first, embedder) 62 | 63 | self.decoder = ResidualRecurrentDecoder(vocab_size, hidden_size, 64 | num_layers, dropout, 65 | batch_first, embedder) 66 | 67 | def forward(self, input_encoder, input_enc_len, input_decoder): 68 | context = self.encode(input_encoder, input_enc_len) 69 | context = (context, input_enc_len, None) 70 | output, _, _ = self.decode(input_decoder, context) 71 | 72 | return output 73 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/models/seq2seq_base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Elad Hoffer 2 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | # SOFTWARE. 21 | 22 | import torch.nn as nn 23 | from torch.nn.functional import log_softmax 24 | 25 | 26 | class Seq2Seq(nn.Module): 27 | """ 28 | Generic Seq2Seq module, with an encoder and a decoder. 29 | """ 30 | def __init__(self, encoder=None, decoder=None, batch_first=False): 31 | """ 32 | Constructor for the Seq2Seq module. 33 | 34 | :param encoder: encoder module 35 | :param decoder: decoder module 36 | :param batch_first: if True the model uses (batch, seq, feature) 37 | tensors, if false the model uses (seq, batch, feature) tensors 38 | """ 39 | super(Seq2Seq, self).__init__() 40 | self.encoder = encoder 41 | self.decoder = decoder 42 | self.batch_first = batch_first 43 | 44 | def encode(self, inputs, lengths): 45 | """ 46 | Applies the encoder to inputs with a given input sequence lengths. 47 | 48 | :param inputs: tensor with inputs (batch, seq_len) if 'batch_first' 49 | else (seq_len, batch) 50 | :param lengths: vector with sequence lengths (excluding padding) 51 | """ 52 | return self.encoder(inputs, lengths) 53 | 54 | def decode(self, inputs, context, inference=False): 55 | """ 56 | Applies the decoder to inputs, given the context from the encoder. 57 | 58 | :param inputs: tensor with inputs (batch, seq_len) if 'batch_first' 59 | else (seq_len, batch) 60 | :param context: context from the encoder 61 | :param inference: if True inference mode, if False training mode 62 | """ 63 | return self.decoder(inputs, context, inference) 64 | 65 | def generate(self, inputs, context, beam_size): 66 | """ 67 | Autoregressive generator, works with SequenceGenerator class. 68 | Executes decoder (in inference mode), applies log_softmax and topK for 69 | inference with beam search decoding. 70 | 71 | :param inputs: tensor with inputs to the decoder 72 | :param context: context from the encoder 73 | :param beam_size: beam size for the generator 74 | 75 | returns: (words, logprobs, scores, new_context) 76 | words: indices of topK tokens 77 | logprobs: log probabilities of topK tokens 78 | scores: scores from the attention module (for coverage penalty) 79 | new_context: new decoder context, includes new hidden states for 80 | decoder RNN cells 81 | """ 82 | logits, scores, new_context = self.decode(inputs, context, True) 83 | logprobs = log_softmax(logits, dim=-1) 84 | logprobs, words = logprobs.topk(beam_size, dim=-1) 85 | return words, logprobs, scores, new_context 86 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/train/smoothing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | import torch 22 | import torch.nn as nn 23 | 24 | 25 | class LabelSmoothing(nn.Module): 26 | """ 27 | NLL loss with label smoothing. 28 | """ 29 | def __init__(self, padding_idx, smoothing=0.0): 30 | """ 31 | Constructor for the LabelSmoothing module. 32 | 33 | :param padding_idx: index of the PAD token 34 | :param smoothing: label smoothing factor 35 | """ 36 | super(LabelSmoothing, self).__init__() 37 | self.padding_idx = padding_idx 38 | self.confidence = 1.0 - smoothing 39 | self.smoothing = smoothing 40 | 41 | def forward(self, x, target): 42 | logprobs = torch.nn.functional.log_softmax(x, dim=-1, 43 | dtype=torch.float32) 44 | 45 | non_pad_mask = (target != self.padding_idx) 46 | nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)) 47 | nll_loss = nll_loss.squeeze(1)[non_pad_mask] 48 | smooth_loss = -logprobs.mean(dim=-1)[non_pad_mask] 49 | loss = self.confidence * nll_loss + self.smoothing * smooth_loss 50 | return loss.sum() 51 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/train/table.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
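# On LabelSmoothing (smoothing.py above): for a target class t and smoothing
# factor eps, each non-padding token contributes
#     loss = (1 - eps) * nll(t) + eps * mean_over_vocab(-log p)
# and forward() returns the sum over tokens. Numeric sketch with assumed
# values eps = 0.1, nll(t) = 0.5 and mean negative log-probability 2.0:
#     loss = 0.9 * 0.5 + 0.1 * 2.0 = 0.65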
20 | 21 | from pytablewriter import MarkdownTableWriter 22 | 23 | 24 | class TrainingTable: 25 | def __init__(self, acc_unit='BLEU', time_unit='min', perf_unit='tok/s'): 26 | self.data = [] 27 | self.acc_unit = acc_unit 28 | self.time_unit = time_unit 29 | self.perf_unit = perf_unit 30 | self.time_unit_convert = {'s': 1, 'min': 1/60, 'h': 1/3600} 31 | 32 | def add(self, gpus, batch_size, accuracy, perf, time_to_train): 33 | time_to_train *= self.time_unit_convert[self.time_unit] 34 | if not accuracy: 35 | accuracy = 0.0 36 | accuracy = round(accuracy, 2) 37 | self.data.append([gpus, batch_size, accuracy, perf, time_to_train]) 38 | 39 | def write(self, title, math): 40 | writer = MarkdownTableWriter() 41 | writer.table_name = f'{title}' 42 | 43 | header = [f'**GPUs**', 44 | f'**Batch Size / GPU**', 45 | f'**Accuracy - {math.upper()} ({self.acc_unit})**', 46 | f'**Throughput - {math.upper()} ({self.perf_unit})**', 47 | f'**Time to Train - {math.upper()} ({self.time_unit})**', 48 | ] 49 | writer.headers = header 50 | 51 | writer.value_matrix = self.data 52 | writer.write_table() 53 | -------------------------------------------------------------------------------- /related/baselines/nasnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/nasnet/__init__.py -------------------------------------------------------------------------------- /related/baselines/nasnet/train_nasnet.py: -------------------------------------------------------------------------------- 1 | import time 2 | from torchvision import models, datasets, transforms 3 | import torch 4 | import torch.nn.functional as F 5 | from nasnet.nasnet import NASNetALarge 6 | from nasnet.nasnet_mobile import NASNetAMobile 7 | from utils.sync_control import * 8 | 9 | 10 | def train_wrapper(sync_info, tid: int, model_config, shared_config): 11 | device = torch.device("cuda:0") 12 | my_stream = torch.cuda.Stream(device=device) 13 | arc = model_config['arc'] 14 | model = NASNetALarge(num_classes=1000) if arc == 'large' else NASNetAMobile(num_classes=1000) 15 | model = model.to(device) 16 | model.train() 17 | 18 | train_transform = transforms.Compose([ 19 | transforms.RandomResizedCrop(331 if arc == 'large' else 224), 20 | transforms.RandomHorizontalFlip(), 21 | transforms.ToTensor(), 22 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) 23 | 24 | train_dataset = \ 25 | datasets.ImageFolder(shared_config['imagenet_root'], transform=train_transform) 26 | 27 | train_loader = torch.utils.data.DataLoader( 28 | train_dataset, batch_size=model_config['batch_size'], shuffle=True, num_workers=model_config['num_workers']) 29 | metric_fn = F.cross_entropy 30 | optimizer_func = getattr(torch.optim, model_config['optimizer']) 31 | optimizer = optimizer_func(model.parameters(), lr=0.001) 32 | 33 | for batch_idx, batch in enumerate(train_loader): 34 | data, target = batch[0].to(device), batch[1].to(device) 35 | with ForwardControl(thread_id=tid, batch_idx=batch_idx, sync_info=sync_info, stream=my_stream): 36 | with torch.cuda.stream(my_stream): 37 | output = model(data) 38 | loss = metric_fn(output, target) 39 | 40 | with BackwardControl(thread_id=tid, batch_idx=batch_idx, sync_info=sync_info, stream=my_stream): 41 | with torch.cuda.stream(my_stream): 42 | loss.backward() 43 | optimizer.step() 44 | optimizer.zero_grad() 45 | 
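# The baseline trainers above (train_nasnet.py, and the other train_* files in
# this directory) all follow the same pattern: each phase of an iteration runs
# inside a ForwardControl / BackwardControl context manager from
# utils/sync_control.py, with work submitted on a private torch.cuda.Stream.
# A plausible minimal shape of such a guard is sketched below; it is
# illustrative only, not the repository's actual implementation.

from contextlib import contextmanager

@contextmanager
def phase_control(turn_event, done_event, stream):
    turn_event.wait()         # block until the scheduler grants this phase
    turn_event.clear()
    try:
        yield                 # the caller launches its kernels on `stream`
    finally:
        stream.synchronize()  # make sure this phase's kernels have finished
        done_event.set()      # signal the peer/scheduler that the phase ended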
-------------------------------------------------------------------------------- /related/baselines/requirements.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | wget 3 | -------------------------------------------------------------------------------- /related/baselines/retinanet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/retinanet/__init__.py -------------------------------------------------------------------------------- /related/baselines/retinanet/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/retinanet/model/__init__.py -------------------------------------------------------------------------------- /related/baselines/retinanet/model/focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def sigmoid_focal_loss( 6 | inputs: torch.Tensor, 7 | targets: torch.Tensor, 8 | alpha: float = 0.25, 9 | gamma: float = 2, 10 | reduction: str = "none", 11 | ): 12 | """ 13 | Original implementation from https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py . 14 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 15 | 16 | Args: 17 | inputs: A float tensor of arbitrary shape. 18 | The predictions for each example. 19 | targets: A float tensor with the same shape as inputs. Stores the binary 20 | classification label for each element in inputs 21 | (0 for the negative class and 1 for the positive class). 22 | alpha: (optional) Weighting factor in range (0,1) to balance 23 | positive vs negative examples or -1 for ignore. Default = 0.25 24 | gamma: Exponent of the modulating factor (1 - p_t) to 25 | balance easy vs hard examples. 26 | reduction: 'none' | 'mean' | 'sum' 27 | 'none': No reduction will be applied to the output. 28 | 'mean': The output will be averaged. 29 | 'sum': The output will be summed. 30 | Returns: 31 | Loss tensor with the reduction option applied. 32 | """ 33 | p = torch.sigmoid(inputs) 34 | ce_loss = F.binary_cross_entropy_with_logits( 35 | inputs, targets, reduction="none" 36 | ) 37 | p_t = p * targets + (1 - p) * (1 - targets) 38 | loss = ce_loss * ((1 - p_t) ** gamma) 39 | 40 | if alpha >= 0: 41 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 42 | loss = alpha_t * loss 43 | 44 | if reduction == "mean": 45 | loss = loss.mean() 46 | elif reduction == "sum": 47 | loss = loss.sum() 48 | 49 | return loss 50 | -------------------------------------------------------------------------------- /related/baselines/retinanet/model/image_list.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from typing import List, Tuple 4 | 5 | 6 | class ImageList(object): 7 | """ 8 | Structure that holds a list of images (of possibly 9 | varying sizes) as a single tensor. 
10 | This works by padding the images to the same size, 11 | and storing in a field the original sizes of each image 12 | """ 13 | 14 | def __init__(self, tensors: Tensor, image_sizes: List[Tuple[int, int]]): 15 | """ 16 | Args: 17 | tensors (tensor) 18 | image_sizes (list[tuple[int, int]]) 19 | """ 20 | self.tensors = tensors 21 | self.image_sizes = image_sizes 22 | 23 | def to(self, device: torch.device) -> 'ImageList': 24 | cast_tensor = self.tensors.to(device) 25 | return ImageList(cast_tensor, self.image_sizes) 26 | -------------------------------------------------------------------------------- /related/baselines/retinanet/model/roi_heads.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | 4 | import torch.nn.functional as F 5 | from torch import nn, Tensor 6 | 7 | from torchvision.ops import boxes as box_ops 8 | from torchvision.ops import roi_align 9 | 10 | from typing import Optional, List, Dict, Tuple 11 | 12 | from retinanet.model.utils import BoxCoder, Matcher 13 | 14 | 15 | def expand_boxes(boxes, scale): 16 | # type: (Tensor, float) -> Tensor 17 | w_half = (boxes[:, 2] - boxes[:, 0]) * .5 18 | h_half = (boxes[:, 3] - boxes[:, 1]) * .5 19 | x_c = (boxes[:, 2] + boxes[:, 0]) * .5 20 | y_c = (boxes[:, 3] + boxes[:, 1]) * .5 21 | 22 | w_half *= scale 23 | h_half *= scale 24 | 25 | boxes_exp = torch.zeros_like(boxes) 26 | boxes_exp[:, 0] = x_c - w_half 27 | boxes_exp[:, 2] = x_c + w_half 28 | boxes_exp[:, 1] = y_c - h_half 29 | boxes_exp[:, 3] = y_c + h_half 30 | return boxes_exp 31 | 32 | 33 | def expand_masks(mask, padding): 34 | # type: (Tensor, int) -> Tuple[Tensor, float] 35 | M = mask.shape[-1] 36 | scale = float(M + 2 * padding) / M 37 | padded_mask = F.pad(mask, (padding,) * 4) 38 | return padded_mask, scale 39 | 40 | 41 | def paste_mask_in_image(mask, box, im_h, im_w): 42 | # type: (Tensor, Tensor, int, int) -> Tensor 43 | TO_REMOVE = 1 44 | w = int(box[2] - box[0] + TO_REMOVE) 45 | h = int(box[3] - box[1] + TO_REMOVE) 46 | w = max(w, 1) 47 | h = max(h, 1) 48 | 49 | # Set shape to [batchxCxHxW] 50 | mask = mask.expand((1, 1, -1, -1)) 51 | 52 | # Resize mask 53 | mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False) 54 | mask = mask[0][0] 55 | 56 | im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device) 57 | x_0 = max(box[0], 0) 58 | x_1 = min(box[2] + 1, im_w) 59 | y_0 = max(box[1], 0) 60 | y_1 = min(box[3] + 1, im_h) 61 | 62 | im_mask[y_0:y_1, x_0:x_1] = mask[ 63 | (y_0 - box[1]):(y_1 - box[1]), (x_0 - box[0]):(x_1 - box[0]) 64 | ] 65 | return im_mask 66 | 67 | 68 | def paste_masks_in_image(masks, boxes, img_shape, padding=1): 69 | # type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor 70 | masks, scale = expand_masks(masks, padding=padding) 71 | boxes = expand_boxes(boxes, scale).to(dtype=torch.int64) 72 | im_h, im_w = img_shape 73 | res = [ 74 | paste_mask_in_image(m[0], b, im_h, im_w) 75 | for m, b in zip(masks, boxes) 76 | ] 77 | if len(res) > 0: 78 | ret = torch.stack(res, dim=0)[:, None] 79 | else: 80 | ret = masks.new_empty((0, 1, im_h, im_w)) 81 | return ret 82 | -------------------------------------------------------------------------------- /related/baselines/retinanet/presets.py: -------------------------------------------------------------------------------- 1 | import retinanet.transforms as T 2 | 3 | 4 | class DetectionPresetTrain: 5 | def __init__(self, data_augmentation, hflip_prob=0.5, mean=(123., 117., 104.)): 6 | if data_augmentation 
== 'hflip': 7 | self.transforms = T.Compose([ 8 | T.RandomHorizontalFlip(p=hflip_prob), 9 | T.ToTensor(), 10 | ]) 11 | elif data_augmentation == 'ssd': 12 | self.transforms = T.Compose([ 13 | T.RandomPhotometricDistort(), 14 | T.RandomZoomOut(fill=list(mean)), 15 | T.RandomIoUCrop(), 16 | T.RandomHorizontalFlip(p=hflip_prob), 17 | T.ToTensor(), 18 | ]) 19 | elif data_augmentation == 'ssdlite': 20 | self.transforms = T.Compose([ 21 | T.RandomIoUCrop(), 22 | T.RandomHorizontalFlip(p=hflip_prob), 23 | T.ToTensor(), 24 | ]) 25 | else: 26 | raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"') 27 | 28 | def __call__(self, img, target): 29 | return self.transforms(img, target) 30 | 31 | 32 | class DetectionPresetEval: 33 | def __init__(self): 34 | self.transforms = T.ToTensor() 35 | 36 | def __call__(self, img, target): 37 | return self.transforms(img, target) 38 | 39 | -------------------------------------------------------------------------------- /related/baselines/retinanet/train_retinanet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | import numpy as np 4 | import retinanet.presets as presets 5 | import time 6 | from utils.sync_control import * 7 | from retinanet.model.retinanet import retinanet_from_backbone 8 | from retinanet.coco_utils import get_openimages, get_coco 9 | import utils 10 | 11 | def get_dataset_fn(name, shared_config): 12 | paths = { 13 | "coco": (get_coco, 91, shared_config['coco_root']), 14 | "openimages": (get_openimages, 601, None), # Full openimages dataset 15 | "openimages-mlperf": (get_openimages, None, None), # L0 classes with more than 1000 samples; num_classes and root unset here 16 | } 17 | return paths[name] 18 | 19 | 20 | def get_transform(train, data_augmentation): 21 | return presets.DetectionPresetTrain(data_augmentation) if train else presets.DetectionPresetEval() 22 | 23 | 24 | def collate_fn(batch): 25 | return tuple(zip(*batch)) 26 | 27 | 28 | def train_wrapper(sync_info, tid: int, model_config, shared_config): 29 | device = torch.device("cuda:0") 30 | my_stream = torch.cuda.Stream(device=device) 31 | seed = int(time.time()) 32 | torch.manual_seed(seed) 33 | np.random.seed(seed=seed) 34 | 35 | dataset_fn, num_classes, data_path = get_dataset_fn(model_config['dataset_name'], shared_config) 36 | data_layout = "channels_last" 37 | batch_size = model_config['batch_size'] 38 | model = retinanet_from_backbone(backbone='resnext50_32x4d', 39 | num_classes=num_classes, 40 | image_size=[800, 800], 41 | data_layout=data_layout, 42 | pretrained=False, 43 | trainable_backbone_layers=3) 44 | model.to(device) 45 | if data_layout == 'channels_last': 46 | model = model.to(memory_format=torch.channels_last) 47 | 48 | params = [p for p in model.parameters() if p.requires_grad] 49 | optimizer = torch.optim.Adam(params, lr=0.0001) 50 | 51 | # GradScaler for AMP 52 | scaler = torch.cuda.amp.GradScaler(enabled=model_config['use_amp']) 53 | 54 | dataset = dataset_fn(name=model_config['dataset_name'], 55 | root=data_path, 56 | image_set="train", 57 | transforms=get_transform(True, 'hflip')) 58 | train_sampler = torch.utils.data.RandomSampler(dataset) 59 | train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, batch_size, drop_last=True) 60 | data_loader = torch.utils.data.DataLoader( 61 | dataset, batch_sampler=train_batch_sampler, num_workers=model_config['num_workers'], 62 | pin_memory=False, collate_fn=collate_fn) 63 | 64 | model.train() 65 | 66 | num_iterations = model_config['num_iterations']
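# num_iterations bounds the whole loop below; the first warm_up_iters batches are untimed warm-up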
67 | warm_up_iters = model_config['warm_up_iters'] 68 | if shared_config['use_dummy_data']: 69 | train_dataloader_iter = iter(data_loader) 70 | images, targets = next(train_dataloader_iter) 71 | images = list(image.to(device) for image in images) 72 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 73 | virtual_loader = utils.DummyDataLoader(batch=(images, targets)) 74 | else: 75 | virtual_loader = data_loader 76 | 77 | logging.info(f'retinanet is set up with {num_iterations} iterations') 78 | 79 | for batch_idx, (images, targets) in enumerate(virtual_loader): 80 | if batch_idx == warm_up_iters: 81 | # finish previous work 82 | torch.cuda.synchronize(device) 83 | if not sync_info.no_sync_control: 84 | sync_info.barrier.wait() 85 | # start timer 86 | start_time = time.time() 87 | 88 | images = list(image.to(device) for image in images) 89 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 90 | with ForwardControl(thread_id=tid, batch_idx=batch_idx, sync_info=sync_info, stream=my_stream): 91 | with torch.cuda.stream(my_stream): 92 | loss_dict = model(images, targets) 93 | losses = sum(loss for loss in loss_dict.values()) 94 | 95 | with BackwardControl(thread_id=tid, batch_idx=batch_idx, sync_info=sync_info, stream=my_stream): 96 | with torch.cuda.stream(my_stream): 97 | scaler.scale(losses).backward() 98 | scaler.step(optimizer) 99 | scaler.update() 100 | optimizer.zero_grad() 101 | 102 | if batch_idx == num_iterations - 1: 103 | # reached the last iteration 104 | break 105 | 106 | sync_info.no_sync_control = True 107 | torch.cuda.synchronize(device) 108 | 109 | duration = time.time() - start_time 110 | logging.info(f'tid {tid}: it took {duration} seconds to train retinanet') 111 | return duration 112 | -------------------------------------------------------------------------------- /related/baselines/run_wrapper.sh: -------------------------------------------------------------------------------- 1 | datestr=$(date '+%H-%M-%S-%Y-%m-%d') 2 | 3 | python run.py > ${datestr}_output.log 2>&1 & 4 | disown 5 | -------------------------------------------------------------------------------- /related/baselines/start_MPS_control_daemon.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 # Select GPU 0. 2 | 3 | export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps # Select a location that’s accessible to the given $UID 4 | 5 | export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log # Select a location that’s accessible to the given $UID 6 | 7 | nvidia-cuda-mps-control -d # Start the daemon.
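# Clients started after this point are funneled through MPS as long as they
# see the same pipe directory, e.g. (hypothetical invocation, not part of this script):
#   CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps python3 run.py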
8 | -------------------------------------------------------------------------------- /related/baselines/stop_MPS_control_daemon.sh: -------------------------------------------------------------------------------- 1 | echo quit | nvidia-cuda-mps-control 2 | -------------------------------------------------------------------------------- /related/baselines/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/transformer/__init__.py -------------------------------------------------------------------------------- /related/baselines/transformer/transformer_consts.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # these configs serve as constants and are not supposed to be tuned; not all fields are used 3 | base: 4 | cuda: true 5 | n_layer: 16 6 | d_model: 512 7 | n_head: 8 8 | d_head: 64 9 | d_inner: 2048 10 | dropout: 0.1 11 | dropatt: 0.0 12 | optim: jitlamb 13 | lr: 0.01 14 | eta_min: 0.001 15 | roll: true 16 | warmup_step: 1000 17 | max_step: 40000 18 | tgt_len: 192 19 | mem_len: 192 20 | init_std: 0.02 21 | eval_tgt_len: 192 22 | log_interval: 10 23 | eval_interval: 5000 24 | vocab: word 25 | adaptive: true 26 | div_val: 1 27 | 28 | large: 29 | cuda: true 30 | n_layer: 18 31 | d_model: 1024 32 | n_head: 16 33 | d_head: 64 34 | d_inner: 4096 35 | dropout: 0.2 36 | dropatt: 0.2 37 | optim: jitlamb 38 | lr: 0.01 39 | eta_min: 0.0001 40 | roll: true 41 | warmup_step: 16000 42 | max_step: 100000 43 | tgt_len: 384 44 | mem_len: 384 45 | init_std: 0.005 46 | eval_tgt_len: 128 47 | log_interval: 100 48 | eval_interval: 5000 49 | vocab: word 50 | adaptive: true 51 | div_val: 4 52 | -------------------------------------------------------------------------------- /related/baselines/transformer/transformer_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/transformer/transformer_utils/__init__.py -------------------------------------------------------------------------------- /related/baselines/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | import json 4 | import itertools 5 | import torch 6 | from numpy import random 7 | import numpy as np 8 | import time 9 | from statistics import mean 10 | from utils.sync_info import BasicSyncInfo 11 | def pretty_time(): 12 | return datetime.now().strftime('%d-%m-%Y-%H-%M-%S') 13 | 14 | 15 | def dict2pretty_str(dict_data): 16 | return json.dumps(dict_data, indent=4) 17 | 18 | 19 | class DummyDataLoader: 20 | def __init__(self, batch): 21 | self.batch = batch 22 | 23 | def __iter__(self): 24 | return itertools.repeat(self.batch) 25 | 26 | 27 | percentile_positions = [50, 90, 95, 99] 28 | def measure(func, num_requests, num_warm_up_reqs, request_rate, tid, shared_config, stream, sync_info: BasicSyncInfo): 29 | """ 30 | Invoke the func {num_requests} times with first {num_warm_up_reqs} iterations as warm up. 31 | Measure how long each invocation takes and calculate statistics (average and percentiles) over them, 32 | and finally write all data via {sync_info}. 
33 | """ 34 | distribution = shared_config['distribution'] 35 | if distribution=='trace' and tid==1: 36 | # uniform distribution for tid 1 37 | distribution = 'uniform' 38 | 39 | if request_rate == 0: 40 | intervals = [0] * num_requests 41 | else: 42 | scale = 1 / request_rate 43 | if distribution == 'trace': 44 | with open(shared_config['trace_path']) as f: 45 | intervals = json.load(f) 46 | num_requests = len(intervals) 47 | elif distribution == 'poisson': 48 | intervals = random.exponential(scale=scale, size=(num_requests,)) 49 | elif distribution == 'uniform': 50 | intervals = [scale] * num_requests 51 | else: 52 | raise NotImplementedError(f'unsupported distribution {distribution}') 53 | 54 | 55 | latency_history = [] 56 | 57 | with torch.no_grad(): 58 | next_startup = time.time() 59 | iteration = 0 60 | while True: 61 | if time.time() >= next_startup: 62 | if iteration == num_warm_up_reqs: 63 | sync_info.pre_measurement_prep(tid) 64 | entire_inference_start_time = time.time() 65 | # reset next_startup to have clear setup 66 | next_startup = entire_inference_start_time 67 | 68 | with torch.cuda.stream(stream): 69 | func() 70 | stream.synchronize() 71 | latency_history.append(1000 * (time.time() - next_startup)) 72 | 73 | if not sync_info.should_continue_loop(tid, iteration, num_requests): 74 | break 75 | 76 | next_startup += intervals[iteration] 77 | 78 | duration = next_startup - time.time() 79 | 80 | if duration > 0: 81 | time.sleep(duration) 82 | iteration += 1 83 | 84 | inference_duration = time.time() - entire_inference_start_time 85 | sync_info.post_measurement_prep(tid) 86 | # discard the first {num_warm_up_reqs} latencies 87 | latency_history = latency_history[num_warm_up_reqs:] 88 | mean_latency = mean(latency_history) 89 | percentiles = np.percentile(latency_history, percentile_positions) 90 | 91 | # data_to_record = { 92 | # f'latencies{tid}': latency_history, 93 | # f'mean_latency{tid}': mean_latency, 94 | # f'duration{tid}': inference_duration, 95 | # f'iterations{tid}': iteration + 1, 96 | # } 97 | # record percentiles 98 | data_to_record = {} 99 | for idx, percentile_pos in enumerate(percentile_positions): 100 | data_to_record[f'p{percentile_pos}-latency-{tid}'] = percentiles[idx] 101 | data_to_record[f'throughput-{tid}'] = (iteration-num_warm_up_reqs)/inference_duration 102 | # write all data to the data file 103 | sync_info.write_kvs(data_to_record) 104 | 105 | 106 | 107 | def seed_everything(seed: int): 108 | import random, os 109 | import numpy as np 110 | 111 | random.seed(seed) 112 | os.environ['PYTHONHASHSEED'] = str(seed) 113 | np.random.seed(seed) 114 | torch.manual_seed(seed) 115 | torch.cuda.manual_seed(seed) 116 | -------------------------------------------------------------------------------- /related/baselines/utils/data_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class DataManager: 5 | """ 6 | A class to encapsulate all the logic regarding writing the structured experiment results to a json file. 7 | """ 8 | 9 | def __init__(self, experiment_data_json_file): 10 | self.experiment_data_json_file = experiment_data_json_file 11 | # init the file 12 | self._dump_dict({}) 13 | 14 | def write_kv(self, key, value): 15 | """ 16 | Write the key-value pair to the json data file. 17 | 18 | This method is NOT thread/process-safe, the caller needs a 19 | synchronization mechanism, e.g. a lock, to ensure at most one writer exists at any time. 
20 | """ 21 | with open(self.experiment_data_json_file, 'r') as f: 22 | dict_data = json.load(f) 23 | 24 | dict_data[key] = value 25 | self._dump_dict(dict_data) 26 | 27 | def write_kvs(self, kv_pairs): 28 | """ 29 | Write many key-value pairs to the json data file. 30 | 31 | This method is NOT thread/process-safe, the caller needs a 32 | synchronization mechanism, e.g. a lock, to eusure at most one writer exists at any time. 33 | """ 34 | dict_data = self.read_dict() 35 | 36 | dict_data.update(kv_pairs) 37 | self._dump_dict(dict_data) 38 | 39 | def _dump_dict(self, dict_data): 40 | with open(self.experiment_data_json_file, 'w') as f: 41 | json.dump(dict_data, f, indent=4) 42 | 43 | def read_dict(self): 44 | with open(self.experiment_data_json_file, 'r') as f: 45 | dict_data = json.load(f) 46 | 47 | return dict_data 48 | -------------------------------------------------------------------------------- /related/baselines/utils/sync_control.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from utils.sync_info import BasicSyncInfo 4 | import torch 5 | import logging 6 | 7 | 8 | # These classes make use of the `with` pattern in Python 9 | # to centralize tick-tock synchronization logic 10 | 11 | class ForwardControl: 12 | 13 | def __init__(self, thread_id: int, batch_idx: int, sync_info: BasicSyncInfo, stream: torch.cuda.Stream) -> None: 14 | # we assume thread 0 starts first 15 | if thread_id not in {0, 1}: 16 | raise ValueError("thread_id can be either zero or one") 17 | 18 | self.sync_info = sync_info 19 | self.thread_id = thread_id 20 | self.batch_idx = batch_idx 21 | self.stream = stream 22 | 23 | def __enter__(self) -> None: 24 | if self.sync_info.no_sync_control: 25 | return 26 | logging.debug(f'thread {self.thread_id} starts FORWARD {self.batch_idx}') 27 | if self.thread_id == 0: 28 | self.sync_info.eventf1.wait() 29 | self.sync_info.event_cudaf1.wait(self.stream) 30 | self.sync_info.eventf1.clear() 31 | else: 32 | self.sync_info.eventf0.wait() 33 | self.sync_info.event_cudaf0.wait(self.stream) 34 | self.sync_info.eventf0.clear() 35 | 36 | 37 | def __exit__(self, exc_type, exc_val, exc_tb) -> bool: 38 | if self.sync_info.no_sync_control: 39 | return exc_type is None 40 | logging.debug(f'thread {self.thread_id} ends FORWARD {self.batch_idx}') 41 | if self.thread_id == 0: 42 | self.sync_info.event_cudaf0.record(self.stream) 43 | self.sync_info.eventf0.set() 44 | else: 45 | self.sync_info.event_cudaf1.record(self.stream) 46 | self.sync_info.eventf1.set() 47 | # raise the exception as is if there is any 48 | return exc_type is None 49 | 50 | 51 | class BackwardControl: 52 | 53 | def __init__(self, thread_id: int, batch_idx: int, sync_info: BasicSyncInfo, stream: torch.cuda.Stream) -> None: 54 | # we assume thread 0 starts first 55 | if thread_id not in {0, 1}: 56 | raise ValueError("thread_id can be either zero or one") 57 | 58 | self.sync_info = sync_info 59 | self.thread_id = thread_id 60 | self.batch_idx = batch_idx 61 | self.stream = stream 62 | 63 | def __enter__(self) -> None: 64 | if self.sync_info.no_sync_control: 65 | return 66 | logging.debug(f'thread {self.thread_id} starts BACKWARD {self.batch_idx}') 67 | if self.thread_id == 0: 68 | self.sync_info.eventb1.wait() 69 | self.sync_info.event_cudab1.wait(self.stream) 70 | self.sync_info.eventb1.clear() 71 | else: 72 | self.sync_info.eventb0.wait() 73 | self.sync_info.event_cudab0.wait(self.stream) 74 | self.sync_info.eventb0.clear() 75 | 76 | def __exit__(self, exc_type, 
exc_val, exc_tb) -> bool: 77 | if self.sync_info.no_sync_control: 78 | return exc_type is None 79 | logging.debug(f'thread {self.thread_id} ends BACKWARD {self.batch_idx}') 80 | if self.thread_id == 0: 81 | self.sync_info.event_cudab0.record(self.stream) 82 | self.sync_info.eventb0.set() 83 | else: 84 | self.sync_info.event_cudab1.record(self.stream) 85 | self.sync_info.eventb1.set() 86 | 87 | # raise the exception as is if there is any 88 | return exc_type is None 89 | 90 | -------------------------------------------------------------------------------- /related/baselines/utils/sync_info.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import torch 3 | import multiprocessing 4 | import time 5 | from utils.data_manager import DataManager 6 | 7 | class BasicSyncInfo: 8 | def __init__(self, data_manager: DataManager, no_sync_control: bool): 9 | self.no_sync_control = no_sync_control 10 | self.data_manager = data_manager 11 | 12 | def pre_measurement_prep(self, tid): 13 | return 14 | 15 | def post_measurement_prep(self, tid): 16 | return 17 | 18 | def write_kv(self, key, value): 19 | self.data_manager.write_kv(key, value) 20 | 21 | def write_kvs(self, kv_pairs): 22 | self.data_manager.write_kvs(kv_pairs) 23 | 24 | def should_continue_loop(self, tid: int, current_iteration: int, total_iterations: int): 25 | return current_iteration < total_iterations - 1 26 | 27 | 28 | class TickTockSyncInfo(BasicSyncInfo): 29 | 30 | def __init__(self, data_manager: DataManager) -> None: 31 | super().__init__(data_manager, no_sync_control=False) 32 | self.barrier = threading.Barrier(2) 33 | self.lock = threading.Lock() 34 | # thread events - for thread synchronization 35 | eventf0 = threading.Event() 36 | eventb0 = threading.Event() 37 | 38 | eventf1 = threading.Event() 39 | eventb1 = threading.Event() 40 | 41 | event_cudaf0 = torch.cuda.Event() 42 | event_cudab0 = torch.cuda.Event() 43 | 44 | event_cudaf1 = torch.cuda.Event() 45 | event_cudab1 = torch.cuda.Event() 46 | 47 | eventf1.set() # t0 starts 48 | eventb1.set() 49 | 50 | self.eventf0 = eventf0 51 | self.eventf1 = eventf1 52 | self.eventb0 = eventb0 53 | self.eventb1 = eventb1 54 | self.event_cudaf0 = event_cudaf0 55 | self.event_cudab0 = event_cudab0 56 | self.event_cudaf1 = event_cudaf1 57 | self.event_cudab1 = event_cudab1 58 | self.start_time = None 59 | 60 | def pre_measurement_prep(self, tid): 61 | self.barrier.wait() 62 | 63 | if tid == 0: 64 | self.start_time = time.time() 65 | 66 | def post_measurement_prep(self, tid): 67 | self.no_sync_control = True 68 | # the other thread might already enter next forward control 69 | # before setting `no_sync_control`; set the flags to make it continue 70 | self.eventf0.set() 71 | self.eventf1.set() 72 | self.barrier.wait() 73 | if tid == 0: 74 | duration = time.time() - self.start_time 75 | self.write_kv('duration', duration) 76 | 77 | def write_kv(self, key, value): 78 | with self.lock: 79 | super().write_kv(key, value) 80 | 81 | def write_kvs(self, kv_pairs): 82 | with self.lock: 83 | super().write_kvs(kv_pairs) 84 | 85 | 86 | class ConcurrentSyncInfo(BasicSyncInfo): 87 | def __init__(self, data_manager: DataManager, num_clients, isolation_level): 88 | super().__init__(data_manager, no_sync_control=True) 89 | self.isolation_level = isolation_level 90 | assert isolation_level in ['thread', 'process'] 91 | if isolation_level == 'thread': 92 | self.barrier = threading.Barrier(num_clients) 93 | self.lock = threading.Lock() 94 | self.stop_signal =
threading.Event() 95 | else: 96 | self.barrier = multiprocessing.Barrier(num_clients) 97 | self.lock = multiprocessing.Lock() 98 | self.stop_signal = multiprocessing.Event() 99 | self.start_time = None 100 | 101 | def pre_measurement_prep(self, tid): 102 | self.barrier.wait() 103 | if tid == 0: 104 | self.start_time = time.time() 105 | 106 | def post_measurement_prep(self, tid): 107 | # let the other part break out of the loop 108 | self.stop_signal.set() 109 | self.barrier.wait() 110 | if tid == 0: 111 | duration = time.time() - self.start_time 112 | self.write_kv("duration", duration) 113 | 114 | def write_kv(self, key, value): 115 | with self.lock: 116 | super().write_kv(key, value) 117 | 118 | def write_kvs(self, kv_pairs): 119 | with self.lock: 120 | super().write_kvs(kv_pairs) 121 | 122 | def should_continue_loop(self, tid: int, current_iteration: int, total_iterations: int): 123 | if tid == 0: 124 | return super().should_continue_loop(tid, current_iteration, total_iterations) 125 | else: 126 | return not self.stop_signal.is_set() 127 | -------------------------------------------------------------------------------- /related/baselines/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/vision/__init__.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='orion', 5 | packages=find_packages(), 6 | version='0.1.0', 7 | description='Orion library', 8 | author='EASL', 9 | ) 10 | -------------------------------------------------------------------------------- /setup/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | WORKDIR /root 3 | 4 | RUN rm /etc/apt/sources.list.d/cuda.list 5 | RUN rm /etc/apt/sources.list.d/nvidia-ml.list 6 | RUN apt-get -y update 7 | RUN apt install -y software-properties-common 8 | RUN apt-get install -y vim wget git 9 | RUN apt install -y libjpeg-dev zlib1g-dev 10 | 11 | 12 | RUN apt -y install build-essential libssl-dev 13 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.19.6/cmake-3.19.6.tar.gz 14 | RUN tar -zxvf cmake-3.19.6.tar.gz 15 | RUN cd cmake-3.19.6 && ./bootstrap && make && make install && cp bin/cmake /bin/ && cd .. 16 | 17 | RUN apt update -y 18 | RUN apt install software-properties-common -y 19 | RUN add-apt-repository ppa:deadsnakes/ppa 20 | RUN apt install python3.8-dev -y 21 | 22 | RUN apt-get -y install python3-pip 23 | RUN python3.8 -m pip install --upgrade pip 24 | RUN python3.8 -m pip install pyyaml typing_extensions 25 | RUN python3.8 -m pip install Pillow 26 | RUN python3.8 -m pip install numpy 27 | 28 | RUN git clone --recursive https://github.com/pytorch/pytorch 29 | COPY orion-torch-changes.patch /root/pytorch/ 30 | RUN cd pytorch && git reset --hard 67ece03c8cd632cce9523cd96efde6f2d1cc8121 && git apply orion-torch-changes.patch && git submodule sync && git submodule update --init --recursive --jobs 0 && python3.8 setup.py develop && cd .. 31 | 32 | RUN git clone https://github.com/pytorch/vision.git 33 | RUN cd vision && git reset --hard da3794e90c7cf69348f5446471926729c55f243e && python3.8 setup.py develop && cd .. 
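# Optional sanity check: uncomment to verify that the patched torch and torchvision builds import cleanly.
# RUN python3.8 -c "import torch, torchvision; print(torch.__version__)"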
34 | 35 | RUN echo "alias python=python3.8" >> /root/.bashrc 36 | SHELL ["source" , "/root/.bashrc"] 37 | SHELL ["/bin/sh", "-c"] 38 | 39 | 40 | RUN git clone https://github.com/NVIDIA/DeepLearningExamples.git 41 | COPY nvidia_deeplearning_changes.patch /root/DeepLearningExamples/ 42 | 43 | RUN cd DeepLearningExamples/ && git reset --hard 6610c05c330b887744993fca30532cbb9561cbde && git apply nvidia_deeplearning_changes.patch 44 | RUN cd /root/DeepLearningExamples/PyTorch/LanguageModeling/BERT && export BERT_PREP_WORKING_DIR=/root/DeepLearningExamples/PyTorch/LanguageModeling/BERT && python3.8 -m pip install -r requirements.txt && python3.8 -m pip install wget && bash data/create_datasets_from_start.sh 45 | 46 | RUN cd /root/DeepLearningExamples/PyTorch/LanguageModeling/Transformer-XL && pip install -r requirements.txt && bash getdata.sh 47 | -------------------------------------------------------------------------------- /setup/README.md: -------------------------------------------------------------------------------- 1 | We have set up a docker image: [fotstrt/orion-ae](https://hub.docker.com/repository/docker/fotstrt/orion-ae/general) with all packages pre-installed. 2 | This directory contains the Dockerfile used to create the image. 3 | 4 | If the user does not want to use this image, then please follow these steps: 5 | 6 | * Install CUDA 10.2 and CUDNN 7.6.5 (or use a base image containing both, such as: `nvcr.io/nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04` ) 7 | * Run `install.sh` 8 | * Install PyTorch from source: 9 | * `git clone --recursive https://github.com/pytorch/pytorch` 10 | * `cd pytorch` 11 | * `git reset --hard 67ece03c8cd632cce9523cd96efde6f2d1cc8121` 12 | * Apply a patch of changes for Orion: `git apply orion-torch-changes.patch` 13 | * `git submodule sync` 14 | * `git submodule update --init --recursive --jobs 0` 15 | * `python3.8 setup.py develop` 16 | 17 | * Install Torchvision from source: 18 | * `git clone https://github.com/pytorch/vision.git` 19 | * `cd vision` 20 | * `git reset --hard da3794e90c7cf69348f5446471926729c55f243e` 21 | * `python3.8 setup.py develop` 22 | -------------------------------------------------------------------------------- /setup/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get update 4 | sudo apt install software-properties-common 5 | sudo apt-get install vim wget git 6 | sudo apt install libjpeg-dev zlib1g-dev 7 | 8 | # cmake 9 | 10 | sudo apt install build-essential libssl-dev 11 | wget https://github.com/Kitware/CMake/releases/download/v3.19.6/cmake-3.19.6.tar.gz 12 | tar -zxvf cmake-3.19.6.tar.gz 13 | cd cmake-3.19.6 14 | ./bootstrap 15 | make 16 | sudo make install 17 | cp bin/cmake /bin/ 18 | cd .. 
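# optional check: the freshly built cmake should now be on PATH
# cmake --version   # expect 3.19.6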
19 | 20 | # python 21 | 22 | sudo apt update 23 | sudo apt install software-properties-common 24 | sudo add-apt-repository ppa:deadsnakes/ppa 25 | sudo apt install python3.8-dev 26 | 27 | # pip 28 | 29 | sudo apt-get -y install python3-pip 30 | python3.8 -m pip install --upgrade pip 31 | python3.8 -m pip install pyyaml typing_extensions 32 | python3.8 -m pip install Pillow 33 | python3.8 -m pip install numpy -------------------------------------------------------------------------------- /src/cuda_capture/Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | CUDAINCLUDE=/usr/local/cuda-10.2/include/ 3 | CUDALIB=/usr/local/cuda-10.2/lib64 4 | 5 | libinttemp.so: utils_interc.cpp intercept_cudnn.cpp intercept_cublas.cpp intercept_temp.cpp 6 | $(CC) -O3 -fPIC -shared utils_interc.cpp intercept_cudnn.cpp intercept_cublas.cpp intercept_temp.cpp -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread -o libinttemp.so 7 | 8 | all: 9 | make libinttemp.so 10 | 11 | clean: 12 | rm -rf *.o libinttemp.so 13 | -------------------------------------------------------------------------------- /src/cuda_capture/README.md: -------------------------------------------------------------------------------- 1 | ### Basic library to capture CUDA calls 2 | 3 | This captures only cudaLaunchKernel and cudaMalloc for now. It is also applicable for PyTorch programs. 4 | 5 | ### Compile 6 | 7 | make all 8 | 9 | ### Run 10 | 11 | LD_PRELOAD="./libinttemp.so" python3 your_program.py 12 | 13 | -------------------------------------------------------------------------------- /src/cuda_capture/intercept_cublas.cpp: -------------------------------------------------------------------------------- 1 | /* Intercepts and overwrites CUBLAS calls */ 2 | 3 | #include "intercept_temp.h" 4 | 5 | cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc) { 6 | 7 | int idx = get_idx(); 8 | assert (idx >= 0); 9 | cublasStatus_t status = CUBLAS_STATUS_SUCCESS; 10 | 11 | cublasSgemm_record blassgemm_record = { 12 | handle, 13 | transa, 14 | transb, 15 | m, 16 | n, 17 | k, 18 | alpha, 19 | A, 20 | lda, 21 | B, 22 | ldb, 23 | beta, 24 | C, 25 | ldc 26 | }; 27 | 28 | union func_data new_func_data; 29 | new_func_data.cublasSgemmRecord = blassgemm_record; 30 | func_record new_record = {CUBLAS_SGEMM_RECORD, new_func_data}; 31 | 32 | if (idx < *num_total_clients) { 33 | 34 | pthread_mutex_lock(mutexes[idx]); 35 | DEBUG_PRINT("[INTERCEPTER-CATCH]-[%d] Caught cublasSgemm_v2, handle is %p, index %d, m is %d, n is %d, k is %d\n", func_indexes[idx], handle, idx, m, n, k); 36 | kqueues[idx]->push(new_record); 37 | func_indexes[idx] += 1; 38 | pthread_mutex_unlock(mutexes[idx]); 39 | 40 | block(idx, mutexes, kqueues); 41 | } 42 | else { 43 | 44 | if (cublas_sgemm_func==NULL) { 45 | *(void **)(&cublas_sgemm_func) = dlsym(RTLD_NEXT, "cublasSgemm_v2"); 46 | assert(cublas_sgemm_func != NULL); 47 | } 48 | status = (*cublas_sgemm_func)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); 49 | assert (status == CUBLAS_STATUS_SUCCESS); 50 | DEBUG_PRINT("CUBLAS status is %d\n", status); 51 | 52 | } 53 | 54 | return status; 55 | 56 | } 57 | 58 | 59 | 60 | cublasStatus_t cublasSgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float
*C, int ldc) { 61 | 62 | int idx = get_idx(); 63 | assert (idx >= 0); 64 | cublasStatus_t status = CUBLAS_STATUS_SUCCESS; 65 | 66 | cublasSgemm_record blassgemm_record = { 67 | handle, 68 | transa, 69 | transb, 70 | m, 71 | n, 72 | k, 73 | alpha, 74 | A, 75 | lda, 76 | B, 77 | ldb, 78 | beta, 79 | C, 80 | ldc 81 | }; 82 | 83 | union func_data new_func_data; 84 | new_func_data.cublasSgemmRecord = blassgemm_record; 85 | func_record new_record = {CUBLAS_SGEMM_RECORD, new_func_data}; 86 | 87 | if (idx < *num_total_clients) { 88 | 89 | pthread_mutex_lock(mutexes[idx]); 90 | DEBUG_PRINT("[INTERCEPTER-CATCH]-[%d] Caught cublasSgemm, handle is %p, index %d, m is %d, n is %d, k is %d\n", func_indexes[idx], handle, idx, m, n, k); 91 | kqueues[idx]->push(new_record); 92 | func_indexes[idx] += 1; 93 | pthread_mutex_unlock(mutexes[idx]); 94 | 95 | block(idx, mutexes, kqueues); 96 | } 97 | else { 98 | 99 | if (cublas_sgemm_func==NULL) { 100 | *(void **)(&cublas_sgemm_func) = dlsym(RTLD_NEXT, "cublasSgemm_v2"); 101 | assert(cublas_sgemm_func != NULL); 102 | } 103 | status = (*cublas_sgemm_func)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); 104 | assert (status == CUBLAS_STATUS_SUCCESS); 105 | DEBUG_PRINT("CUBLAS status is %d\n", status); 106 | 107 | } 108 | 109 | return status; 110 | 111 | } 112 | 113 | 114 | cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, long long int strideA, const float *B, int ldb, long long int strideB, const float *beta, float *C, int ldc, long long int strideC, int batchCount) { 115 | 116 | int idx = get_idx(); 117 | assert (idx >= 0); 118 | cublasStatus_t status = CUBLAS_STATUS_SUCCESS; 119 | 120 | cublasSgemmStridedBatched_record record = { 121 | handle, 122 | transa, 123 | transb, 124 | m, 125 | n, 126 | k, 127 | alpha, 128 | A, 129 | lda, 130 | strideA, 131 | B, 132 | ldb, 133 | strideB, 134 | beta, 135 | C, 136 | ldc, 137 | strideC, 138 | batchCount 139 | }; 140 | 141 | union func_data new_func_data; 142 | new_func_data.cublasSgemmStridedRecord = record; 143 | func_record new_record = {CUBLAS_SGEMM_STRIDED_RECORD, new_func_data}; 144 | 145 | if (idx < *num_total_clients) { 146 | 147 | pthread_mutex_lock(mutexes[idx]); 148 | DEBUG_PRINT("[INTERCEPTER-CATCH]-[%d] Caught cublasSgemmStridedBatched, handle is %p\n", func_indexes[idx], handle); 149 | kqueues[idx]->push(new_record); 150 | func_indexes[idx] += 1; 151 | pthread_mutex_unlock(mutexes[idx]); 152 | 153 | block(idx, mutexes, kqueues); 154 | 155 | } 156 | else { 157 | 158 | if (cublas_sgemm_strided_func==NULL) { 159 | *(void **)(&cublas_sgemm_strided_func) = dlsym(RTLD_NEXT, "cublasSgemmStridedBatched"); 160 | assert(cublas_sgemm_strided_func != NULL); 161 | } 162 | 163 | status = (*cublas_sgemm_strided_func)(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); 164 | assert (status == CUBLAS_STATUS_SUCCESS); 165 | DEBUG_PRINT("CUBLAS status is %d\n", status); 166 | 167 | } 168 | 169 | return status; 170 | } 171 | 172 | cublasStatus_t cublasDestroy(cublasHandle_t handle) { 173 | 174 | DEBUG_PRINT("Caught a cublasDestroy! 
Do nothing!\n"); 175 | return CUBLAS_STATUS_SUCCESS; 176 | } 177 | -------------------------------------------------------------------------------- /src/cuda_capture/utils_interc.cpp: -------------------------------------------------------------------------------- 1 | #include "intercept_temp.h" 2 | 3 | int get_idx() { 4 | 5 | // Each client thread has a unique ID in the scheduler. 6 | // Based on the thread id that is captured, find the proper index 7 | 8 | #ifdef SYS_gettid 9 | pid_t tid = syscall(SYS_gettid); 10 | #else 11 | #error "SYS_gettid unavailable on this system" 12 | #endif 13 | 14 | 15 | int idx = -1; 16 | int clients = *num_total_clients; 17 | int num_tids = 2*clients+1; 18 | 19 | // look this tid up in the table of registered thread ids 20 | for (int i=0; i<num_tids; i++) { 21 | if (thread_ids[i] == tid) { 22 | idx = i; 23 | break; 24 | } 25 | } 26 | 27 | // pin each registered thread to its own core the first time it is seen 28 | if (idx > -1 && !affinity_set[idx]) { 29 | cpu_set_t mask; 30 | CPU_ZERO(&mask); 31 | CPU_SET(idx+offset, &mask); 32 | int result = sched_setaffinity(0, sizeof(mask), &mask); 33 | assert (result==0); 34 | affinity_set[idx] = true; 35 | } 36 | return idx; 37 | } 38 | 39 | void block(int idx, pthread_mutex_t** mutexes, queue<func_record>** kqueues) { 40 | 41 | // make sure all pending operations have completed 42 | while (1) { 43 | pthread_mutex_lock(mutexes[idx]); 44 | volatile int sz = kqueues[idx]->size(); 45 | pthread_mutex_unlock(mutexes[idx]); 46 | if (sz==0) 47 | break; 48 | } 49 | 50 | } -------------------------------------------------------------------------------- /src/scheduler/Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | NVCC=/usr/local/cuda-10.2/bin/nvcc 3 | CFLAGS=-O3 4 | CUDAINCLUDE=/usr/local/cuda-10.2/include/ 5 | CUDALIB=/usr/local/cuda-10.2/lib64 6 | 7 | utils_sched.o: utils_sched.cpp 8 | $(NVCC) $(CFLAGS) -Xcompiler -fPIC -x cu -shared -c utils_sched.cpp -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread 9 | 10 | scheduler.o: scheduler.cpp 11 | $(NVCC) $(CFLAGS) -Xcompiler -fPIC -x cu -shared -c scheduler.cpp -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread 12 | 13 | scheduler_eval.o: scheduler_eval.cpp 14 | $(NVCC) $(CFLAGS) -Xcompiler -fPIC -x cu -shared -c scheduler_eval.cpp -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread 15 | 16 | scheduler_eval.so: scheduler_eval.o utils_sched.o 17 | $(CC) $(CFLAGS) -fPIC -shared utils_sched.o scheduler_eval.o -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread -o scheduler_eval.so 18 | 19 | all: 20 | make scheduler_eval.so 21 | 22 | clean: 23 | rm -rf *.o *.so 24 | -------------------------------------------------------------------------------- /src/scheduler/scheduler.h: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | //#include <cstdint> 3 | #include <stdlib.h> 4 | #include <string.h> 5 | #include <pthread.h> 6 | #include <dlfcn.h> 7 | #include <unistd.h> 8 | #include <queue> 9 | #include <vector> 10 | #include <cuda_runtime.h> 11 | #include <cudnn.h> 12 | 13 | #include "utils_sched.h" 14 | 15 | //void* sched_func(void* args); 16 | 17 | class Scheduler { 18 | 19 | public: 20 | void profile_prep(queue<func_record>** qbuffers, int num_clients, bool reef); 21 | void profile_reset(int num_clients); 22 | void* busy_wait_profile(int num_clients, int iter, bool warmup, int warmup_iters, bool reef, bool seq, int depth, int hp_limit, int update_start); 23 | void schedule_reef(vector<func_record*> frecords, int num_clients, int depth); 24 | int schedule_sequential(vector<func_record*> frecords, int num_clients, int start); 25 | 26 | }; 27 | 28 | //void* sched_func(void* sched); 29 | //Scheduler* sched_init(); -------------------------------------------------------------------------------- /src/scheduler_frontend.py:
-------------------------------------------------------------------------------- 1 | import ctypes 2 | from ctypes import * 3 | import torch 4 | import numpy as np 5 | import os 6 | import time 7 | 8 | class PyScheduler: 9 | 10 | def __init__(self, sched_lib, num_clients): 11 | 12 | torch.cuda.set_device(0) 13 | self._scheduler = sched_lib.sched_init() 14 | self._sched_lib = sched_lib 15 | self._num_clients = num_clients 16 | 17 | def run_scheduler( 18 | self, 19 | barriers, 20 | tids, 21 | model_names, 22 | kernel_files, 23 | additional_kernel_files, 24 | num_kernels, 25 | additional_num_kernels, 26 | num_iters, 27 | profile, 28 | run_eval, 29 | reef, 30 | sequential, 31 | reef_depth, 32 | hp_limit, 33 | update_start, 34 | train 35 | ): 36 | 37 | print(f"REEF IS {reef}, SEQUENTIAL IS {sequential}") 38 | 39 | model_names_ctypes = [x.encode('utf-8') for x in model_names] 40 | lib_names = [x.encode('utf-8') for x in kernel_files] 41 | 42 | # convert 43 | IntAr = c_int * self._num_clients 44 | tids_ar = IntAr(*tids) 45 | num_kernels_ar = IntAr(*num_kernels) 46 | num_iters_ar = IntAr(*num_iters) 47 | 48 | CharAr = c_char_p * self._num_clients 49 | model_names_ctypes_ar = CharAr(*model_names_ctypes) 50 | lib_names_ar = CharAr(*lib_names) 51 | 52 | BoolAr = c_bool * self._num_clients 53 | train_ar = BoolAr(*train) 54 | 55 | print(train) 56 | self._sched_lib.argtypes = [c_void_p, c_int, POINTER(c_int), POINTER(c_char_p), POINTER(c_char_p), POINTER(c_int), POINTER(c_bool)] 57 | 58 | print(model_names, lib_names, tids) 59 | 60 | self._sched_lib.setup(self._scheduler, self._num_clients, tids_ar, model_names_ctypes_ar, lib_names_ar, num_kernels_ar, num_iters_ar, train_ar, reef) 61 | 62 | num_clients = len(tids) 63 | print(f"Num clients is {num_clients}") 64 | 65 | print(f"before starting, profile is {profile}") 66 | timings=[] 67 | 68 | if run_eval: 69 | if profile: 70 | barriers[0].wait() 71 | # run once to warm-up and setup 72 | self._sched_lib.schedule(self._scheduler, num_clients, True, 0, True, 1, reef, sequential, reef_depth, hp_limit, update_start) 73 | torch.cuda.synchronize() 74 | 75 | for j in range(num_clients): 76 | if (additional_kernel_files[j] is not None): 77 | new_kernel_file = additional_kernel_files[j].encode('utf-8') 78 | self._sched_lib.setup_change(self._scheduler, j, new_kernel_file, additional_num_kernels[j]) 79 | 80 | print("wait here") 81 | barriers[0].wait() #FIXME 82 | print("done!") 83 | 84 | # warmup 85 | self._sched_lib.schedule(self._scheduler, num_clients, True, 0, True, 10, reef, sequential, reef_depth, hp_limit, update_start) 86 | torch.cuda.synchronize() 87 | barriers[0].wait() 88 | 89 | start = time.time() 90 | print("call schedule") 91 | self._sched_lib.schedule(self._scheduler, num_clients, True, 0, False, 0, reef, sequential, reef_depth, hp_limit, update_start) 92 | barriers[0].wait() 93 | torch.cuda.synchronize() 94 | print(f"Total time is {time.time()-start}") 95 | 96 | else: 97 | for i in range(num_iters[0]): 98 | 99 | print(f"Start {i} iteration") 100 | if profile: 101 | barriers[0].wait() 102 | # needed for backward 103 | if (i==1): 104 | for j in range(num_clients): 105 | if (additional_kernel_files[j] is not None): 106 | new_kernel_file = additional_kernel_files[j].encode('utf-8') 107 | self._sched_lib.setup_change(self._scheduler, j, new_kernel_file, additional_num_kernels[j]) 108 | barriers[0].wait() #FIXME 109 | 110 | start = time.time() 111 | print("call schedule") 112 | self._sched_lib.schedule(self._scheduler, num_clients, True, i) 113 | 
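# wait for every kernel scheduled in this iteration to drain before reading the timer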
torch.cuda.synchronize() 114 | 115 | # alternative path: schedule each client's queue one at a time 116 | else: 117 | start = time.time() 118 | for j in range(num_clients): 119 | barriers[j].wait() 120 | self._sched_lib.schedule_one(self._scheduler, j) 121 | torch.cuda.synchronize() 122 | 123 | total_time = time.time()-start 124 | print(f"Iteration {i} took {total_time} sec") 125 | timings.append(total_time) 126 | timings = timings[3:] # drop the first three iterations as warm-up 127 | print(f"Median is {np.median(np.asarray(timings))} sec, Min is {min(timings)} sec") 128 | -------------------------------------------------------------------------------- /src/system_utils.h: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | 3 | // DEBUG_PRINT expands to fprintf(stdout, ...) only when compiled with -DDEBUG; otherwise it is a no-op 4 | #ifdef DEBUG 5 | # define DEBUG_PRINT(...) fprintf(stdout, __VA_ARGS__) 6 | #else 7 | # define DEBUG_PRINT(...) do {} while (0) 8 | #endif 9 | 10 | using namespace std; --------------------------------------------------------------------------------