├── .gitignore ├── ARCHITECTURE.md ├── DEBUGGING.md ├── INSTALL.md ├── LICENSE ├── PROFILE.md ├── README.md ├── artifact_evaluation ├── README.md ├── example │ ├── README.md │ ├── config.json │ └── resnet50_4_fwd ├── fig10 │ ├── client_0.json │ ├── config_files │ │ ├── bert_mnet.json │ │ ├── bert_rnet.json │ │ ├── ideal │ │ │ ├── mnet_inf.json │ │ │ └── rnet_inf.json │ │ ├── mnet_mnet.json │ │ ├── mnet_rnet.json │ │ ├── mps │ │ │ ├── config.yaml │ │ │ └── run.py │ │ ├── rnet101_mnet.json │ │ ├── rnet101_rnet.json │ │ ├── rnet_mnet.json │ │ ├── rnet_rnet.json │ │ ├── trans_mnet.json │ │ └── trans_rnet.json │ ├── gather_results.py │ ├── inter_arrival_times.json │ ├── plot_latency.py │ ├── prep_dirs.sh │ ├── run_ideal.py │ ├── run_orion.py │ └── run_reef.py └── fig7 │ ├── config_files │ ├── bert_mnet.json │ ├── bert_rnet.json │ ├── ideal │ │ ├── bert_train.json │ │ ├── mnet_inf.json │ │ ├── mnet_train.json │ │ ├── rnet101_train.json │ │ ├── rnet_inf.json │ │ ├── rnet_train.json │ │ └── trans_train.json │ ├── mnet_mnet.json │ ├── mnet_rnet.json │ ├── mps │ │ ├── config.yaml │ │ ├── eval-resnet50train-resnet50-1.log │ │ ├── eval-resnet50train-resnet50-1.log.json │ │ ├── gen_conf_eval-resnet50train-resnet50.yaml │ │ └── run.py │ ├── rnet101_mnet.json │ ├── rnet101_rnet.json │ ├── rnet_mnet.json │ ├── rnet_rnet.json │ ├── trans_mnet.json │ └── trans_rnet.json │ ├── gather_latency.py │ ├── gather_throughput.py │ ├── kernel_files │ ├── mobilenetv2_4_fwd │ ├── mobilenetv2_64_fb0 │ ├── mobilenetv2_64_fb1 │ ├── resnet101_32_fb0 │ ├── resnet101_32_fb1 │ ├── resnet101_4_fwd │ ├── resnet50_32_fb0 │ ├── resnet50_32_fb1 │ └── resnet50_4_fwd │ ├── plot_latency.py │ ├── plot_throughput.py │ ├── prep_dirs.sh │ ├── run_ideal.py │ ├── run_orion.py │ └── run_reef.py ├── benchmarking ├── be.json ├── benchmark_suite │ ├── bert_trainer_mock.py │ ├── bert_trainer_mock_torch.py │ ├── compute_optimal.py │ ├── conv_trainer.py │ ├── examples │ │ ├── basic_config_bert.json │ │ ├── basic_config_transformer.json │ │ └── basic_config_vision.json │ ├── extract_meas.py │ ├── toy_models │ │ ├── bnorm_trainer.py │ │ └── conv_bn_trainer.py │ ├── train_imagenet.py │ ├── train_imagenet_torch.py │ ├── transformer_trainer.py │ ├── transformer_trainer_torch.py │ └── utility_scripts │ │ ├── check_unknown.py │ │ ├── compute_average.py │ │ ├── download_imagenet.sh │ │ └── get_avg.py ├── hp.json ├── launch_jobs.py ├── model_kernels │ ├── bert_2_fwd │ ├── bert_8_fb0 │ ├── bert_8_fb1 │ ├── mobilenetv2_32_fb0 │ ├── mobilenetv2_32_fb1 │ ├── mobilenetv2_4_fwd │ ├── mobilenetv2_64_fb0 │ ├── mobilenetv2_64_fb1 │ ├── mobilenetv2_96_fb0 │ ├── mobilenetv2_96_fb1 │ ├── resnet101_32_fb0 │ ├── resnet101_32_fb1 │ ├── resnet101_4_fwd │ ├── resnet50_32_fb0 │ ├── resnet50_32_fb1 │ ├── resnet50_4_fwd │ ├── transformer_xl_4_fwd │ ├── transformer_xl_8_fb0 │ └── transformer_xl_8_fb1 ├── multi_client_example.json └── scripts │ ├── run.sh │ ├── run_squad_test.py │ └── run_traces.py ├── compile.sh ├── orion_architecture.png ├── profiling ├── benchmarks │ ├── bert.py │ ├── bnorm.py │ ├── conv.py │ ├── conv_bnorm.py │ ├── gnmt.py │ ├── retinanet.py │ ├── transformer.py │ └── vision_models.py └── postprocessing │ ├── generate_file.py │ ├── get_num_blocks.py │ ├── process_ncu.py │ ├── process_nsys.py │ ├── profiles │ ├── bert_2_fwd_new │ ├── efficientnet_4_fwd_new │ ├── mobilenetv2_4_fwd_new │ ├── resnet101_4_fwd_new │ ├── resnet50_32_fb1_new │ ├── resnet50_4_fwd_new │ ├── retinanet_4_fwd_new │ └── transformer_4_fwd_new │ └── roofline_analysis.py ├── related ├── 
Tick-Tock │ └── test.json └── baselines │ ├── README.md │ ├── bert │ ├── __init__.py │ ├── modeling.py │ ├── optimization.py │ ├── schedulers.py │ ├── squad_example.py │ ├── tokenization.py │ └── train_bert_on_squad.py │ ├── config.yaml │ ├── dcgan │ ├── __init__.py │ ├── dcgan.py │ └── train_dcgan.py │ ├── gnmt │ ├── __init__.py │ ├── seq2seq │ │ ├── data │ │ │ ├── config.py │ │ │ ├── dataset.py │ │ │ ├── sampler.py │ │ │ └── tokenizer.py │ │ ├── gpu_affinity.py │ │ ├── inference │ │ │ ├── beam_search.py │ │ │ ├── tables.py │ │ │ └── translator.py │ │ ├── models │ │ │ ├── attention.py │ │ │ ├── decoder.py │ │ │ ├── encoder.py │ │ │ ├── gnmt.py │ │ │ └── seq2seq_base.py │ │ ├── train │ │ │ ├── fp_optimizers.py │ │ │ ├── lr_scheduler.py │ │ │ ├── smoothing.py │ │ │ ├── table.py │ │ │ └── trainer.py │ │ └── utils.py │ └── train_gnmt.py │ ├── inter_arrival_times.json │ ├── main.py │ ├── nasnet │ ├── __init__.py │ ├── nasnet.py │ ├── nasnet_mobile.py │ └── train_nasnet.py │ ├── requirements.txt │ ├── retinanet │ ├── __init__.py │ ├── coco_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── anchor_utils.py │ │ ├── backbone_utils.py │ │ ├── boxes.py │ │ ├── feature_pyramid_network.py │ │ ├── focal_loss.py │ │ ├── image_list.py │ │ ├── resnet.py │ │ ├── retinanet.py │ │ ├── roi_heads.py │ │ ├── transform.py │ │ └── utils.py │ ├── presets.py │ ├── train_retinanet.py │ └── transforms.py │ ├── run.py │ ├── run_wrapper.sh │ ├── start_MPS_control_daemon.sh │ ├── stop_MPS_control_daemon.sh │ ├── transformer │ ├── __init__.py │ ├── data_utils.py │ ├── lamb.py │ ├── mem_transformer.py │ ├── train_transformer.py │ ├── transformer_consts.yaml │ └── transformer_utils │ │ ├── __init__.py │ │ ├── log_uniform_sampler.py │ │ ├── proj_adaptive_softmax.py │ │ └── vocabulary.py │ ├── utils │ ├── __init__.py │ ├── data_manager.py │ ├── sync_control.py │ └── sync_info.py │ └── vision │ ├── __init__.py │ └── train_imagenet.py ├── setup.py ├── setup ├── Dockerfile ├── README.md ├── install.sh ├── nvidia_deeplearning_changes.patch └── orion-torch-changes.patch └── src ├── cuda_capture ├── Makefile ├── README.md ├── intercept_cublas.cpp ├── intercept_cudnn.cpp ├── intercept_temp.cpp ├── intercept_temp.h └── utils_interc.cpp ├── scheduler ├── Makefile ├── scheduler.h ├── scheduler_eval.cpp ├── utils_sched.cpp └── utils_sched.h ├── scheduler_frontend.py └── system_utils.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.sqlite 3 | *.qdrep 4 | *.pyc 5 | *.so 6 | *.o 7 | *.ncu-rep 8 | *.svg 9 | *.png 10 | .DS_Store 11 | .idea/ 12 | __pycache__/ 13 | orion.egg-info/ 14 | benchmarking/examples/* 15 | benchmarking/eval/* 16 | benchmarking/results/* 17 | results 18 | -------------------------------------------------------------------------------- /ARCHITECTURE.md: -------------------------------------------------------------------------------- 1 | The Orion system is depicted in the following image: 2 | 3 | ![Orion architecture](orion_architecture.png) 4 | 5 | CUDA/CUDNN/CUBLAS calls are intercepted and submitted into software queues managed by the scheduler. 6 | Each submitted workload is profiled before being run, and the resource profiles of each operator are given as inputs to the scheduler. As depicted in the image, Orion currently supports one high-priority client and multiple best-effort clients. 7 | 8 | ### Scheduling Policy 9 | 10 | The scheduler polls for new operations from the clients.
If an operator from a high-priority client is found, it is submitted directly to the GPU. 11 | If an operator from a best-effort client is found, Orion submits it based on its resource profile, the number of SMs it needs, and the duration of on-the-fly best-effort kernels; a simplified sketch of this decision logic is shown below.
-------------------------------------------------------------------------------- /DEBUGGING.md: -------------------------------------------------------------------------------- 1 | ### For CUDNN debugging: 2 | * export CUDNN_LOGDEST_DBG=stdout 3 | * export CUDNN_LOGINFO_DBG=1 4 | 5 | ### For CUBLAS debugging: 6 | * export CUBLAS_LOGDEST_DBG=stdout 7 | * export CUBLAS_LOGINFO_DBG=1 8 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ### Use Docker image 2 | 3 | We have set up a Docker image: [fotstrt/orion-ae](https://hub.docker.com/repository/docker/fotstrt/orion-ae/general) with all packages pre-installed. We assume NVIDIA drivers are installed on the host machine, and that Docker containers can use the host machine's GPUs. 4 | 5 | * Start a container with `docker run --gpus=1 -it fotstrt/orion-ae:v1 bash` 6 | * Download the Orion repo and install: 7 | * `git clone https://github.com/eth-easl/orion.git` 8 | * `cd orion` 9 | * `bash compile.sh` 10 | * `pip install -e .` 11 | 12 | 13 | ### Without Docker image 14 | 15 | To use Orion without our pre-built image, a user must install: 16 | * [NVIDIA CUDA](https://developer.nvidia.com/cuda-toolkit). We have tested Orion with CUDA 10.2 and CUDA 11.3. 17 | * (optionally) [NVIDIA CUDNN](https://developer.nvidia.com/cudnn) 18 | * PyTorch (from source) + TorchVision 19 | * Download the Orion repo and install: 20 | * `git clone https://github.com/eth-easl/orion.git` 21 | * `cd orion` 22 | * `bash compile.sh` 23 | * `pip install -e .` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 - present | ETH Zurich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PROFILE.md: -------------------------------------------------------------------------------- 1 | ## Instructions on kernel-level analysis with NVIDIA Nsight and PyTorch 2 | 3 | ### Notes: 4 | 1. The locations of `nsys` and `nsight-cu-cli` may vary from those shown in this guide. 5 | 2. This guide assumes the user has set up a `script.py` to profile. 6 | 7 | ### Profiling 8 | 1. Set up NVTX markers for NCU: Use `torch.cuda.nvtx.range_push("start")` and `torch.cuda.nvtx.range_pop()` around the region to profile. 9 | 2. Set up profiler start/stop hooks for NSYS: Use `torch.cuda.profiler.cudart().cudaProfilerStart()` and `torch.cuda.profiler.cudart().cudaProfilerStop()` around the region to profile (see the example skeleton after this list). 10 | 3. Enable NSYS profiling: `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'` 11 | 4. Profile with NCU: `sudo /opt/nvidia/nsight-compute/2021.2.0/nv-nsight-cu-cli -o output_ncu --set detailed --nvtx --nvtx-include "start/" python3 script.py` 12 | 5. Profile with NCU in CSV: `sudo /opt/nvidia/nsight-compute/2021.2.0/nv-nsight-cu-cli --csv --set detailed --nvtx --nvtx-include "start/" python3 script.py > output_ncu.csv` 13 | 6. Profile with NSYS: `nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas -s none -o output_nsys --cudabacktrace=true --capture-range=cudaProfilerApi --stop-on-range-end=true -f true -x true python3 script.py` 14 | 7. Convert NSYS output to CSV: `nsys stats --report gputrace --format csv,column --output .,- output_nsys.qdrep`
15 | 16 | At this point, 4 files should have been generated: 17 | * `output_ncu.ncu-rep` 18 | * `output_ncu.csv` 19 | * `output_nsys.qdrep` 20 | * `output_nsys_gputrace.csv` 21 | 22 | Using Nsight Compute, open the `output_ncu.ncu-rep` file, and export the raw CSV file as `raw_ncu.csv`. 23 | 24 | 25 | ### Extracting resource utilization info 26 | Extract the required information from the profiling files: 27 | * `python profiling/postprocessing/process_ncu.py --results_dir <results_dir>` 28 | 29 | If the `output_ncu.csv` file contains any program logs that do not conform to the `.csv` format, this command might throw errors. 30 | 31 | Make sure the file is in a correct `.csv` format: depending on the NVIDIA CUDA version and the type of profiling, the first line should look like this: 32 | 33 | `"ID","Process ID","Process Name","Host Name","thread Domain:Push/Pop_Range:PL_Type:PL_Value:CLR_Type:Color:Msg_Type:Msg","Id:Domain:Start/Stop_Range:PL_Type:PL_Value:CLR_Type:Color:Msg_Type:Msg","Kernel Name","Kernel Time","Context","Stream","Section Name","Metric Name","Metric Unit","Metric Value","Rule Name","Rule Type","Rule Description"` 34 | 35 | 36 | * `python profiling/postprocessing/get_num_blocks.py --results_dir <results_dir> --max_threads_sm <max_threads_sm> --max_blocks_sm <max_blocks_sm> --max_shmem_sm <max_shmem_sm> --max_regs_sm <max_regs_sm>` 37 | 38 | You can find the maximum number of threads, blocks, shared memory, and registers per SM in the GPU's architecture description. 39 | By default, `get_num_blocks.py` is configured for the [NVIDIA Tesla V100 GPU](https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf). 40 | 41 | * `python profiling/postprocessing/roofline_analysis.py --results_dir <results_dir> --ai_threshold <ai_threshold>` 42 | 43 | Note that `ai_threshold` stands for the 'knee' arithmetic intensity of the roofline plot taken from the Nsight Compute tool, and might differ for each GPU. 44 | 45 | After these steps, an `output_ncu_sms_roofline.csv` file should have been generated. 46 | 47 | ### (Optional) Plot traces 48 | You can use the `profiling/postprocessing/process_nsys.py` script to generate plots of resource utilization over time. 49 | * `python profiling/postprocessing/process_nsys.py --results_dir <results_dir> --max_sms <max_sms> --metric <metric>` 50 | 51 | ### Postprocessing to convert to a kernel info file for Orion to use 52 | This step reads the profiling file and keeps the information needed for each kernel (number of SMs, resource profile, duration). 53 | It also groups kernels into operators, e.g. if a CUDNN convolution operator consists of 2 kernels, it groups them into one operator. 54 | * `python profiling/postprocessing/generate_file.py --input_file_name <input_file> --output_file_name <output_file> --model_type <model_type>` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Orion 2 | 3 | Orion is a fine-grained scheduler for interference-free GPU sharing across ML workloads. It is based on our EuroSys'24 paper "Orion: Interference-aware, Fine-grained GPU Sharing for ML Applications". 4 | 5 | ## Table of Contents 6 | - [Introduction](#introduction) 7 | - [Example](#example) 8 | - [Project Structure](#project-structure) 9 | - [Hardware Requirements](#hardware-requirements) 10 | - [Hardware Configuration used in the paper](#hardware-configuration-used-in-the-paper) 11 | - [Installation](#installation) 12 | - [Debugging](#debugging) 13 | - [Paper](#paper) 14 | 15 | ## Introduction 16 | 17 | Orion is a fine-grained, interference-free scheduler for GPU sharing across ML workloads. We assume one of the clients is high-priority, while the rest of the clients are best-effort. 18 | 19 | Orion intercepts CUDA, CUDNN, and CUBLAS calls and submits them into software queues. 20 | The _Scheduler_ polls these queues and schedules operations based on their resource requirements and their priority. See [ARCHITECTURE](ARCHITECTURE.md) for more details on the system and the scheduling policy. 21 | 22 | Orion expects each submitted job to have a file listing all of its operations, along with their profiles and Streaming Multiprocessor (SM) requirements. See [PROFILE](PROFILE.md) for detailed instructions on how to profile a client application and how to generate the profile files. 23 | 24 | ## Example 25 | 26 | We have set up a Docker image: [fotstrt/orion-ae](https://hub.docker.com/repository/docker/fotstrt/orion-ae/general) with all packages pre-installed. 27 | Alternatively, follow the instructions in the 'setup' directory, and check [INSTALL](INSTALL.md), to install Orion and its dependencies. 28 | 29 | See [PROFILE](PROFILE.md) to generate profiling files for each workload. 30 | Create a JSON file containing all the info for the workloads that are about to share the GPU. See examples under 'artifact_evaluation/example'. 31 | 32 | The file 'launch_jobs.py' is responsible for spawning the scheduler and the application thread(s). 33 | 34 | ## Project Structure 35 | ``` 36 | > tree .
├── profiling # Scripts and instructions for profiling 38 | │ ├── benchmarks # Scripts of DNN models for profiling 39 | │ └── postprocessing # Scripts for processing of profile files 40 | ├── src # Source code 41 | │ ├── cuda_capture # Code to intercept CUDA/CUDNN/CUBLAS calls 42 | │ ├── scheduler # Implementation of the scheduling policy 43 | │ └── scheduler_frontend.py # Python interface for the Orion scheduler 44 | ├── benchmarking # Scripts and configuration files for benchmarking 45 | │ ├── benchmark_suite # Training and inference scripts 46 | │ └── model_kernels # Files containing profile information for the submitted models 47 | ├── related # Some of the related baselines: MPS, Streams, Tick-Tock 48 | ├── artifact_evaluation # Scripts and instructions for artifact evaluation 49 | │ ├── example # Basic example to test Orion functionality 50 | │ ├── fig7 # Scripts to reproduce Figure 7 of the paper 51 | │ └── fig10 # Scripts to reproduce Figure 10 of the paper 52 | └── setup # Instructions and scripts to install Orion's prerequisites 53 | ``` 54 | 55 | ## Hardware Requirements 56 | Orion currently supports NVIDIA GPUs. 57 | 58 | ## Hardware Configuration used in the paper 59 | For the experiments presented in the paper, we evaluated Orion on Google Cloud Platform VMs with the following configurations: 60 | * n1-standard-8 VM (8 vCPUs, 30GB of DRAM) with a V100-16GB GPU and CUDA 10.2 61 | * a2-highgpu-1g VM (12 vCPUs, 85GB of DRAM) with an A100-40GB GPU and CUDA 11.3 62 | 63 | In both cases, the machines run Ubuntu 18.04. 64 | 65 | ## Installation 66 | See [INSTALL](INSTALL.md). 67 | 68 | ## Debugging 69 | See [DEBUGGING](DEBUGGING.md). 70 | 71 | ## Paper 72 | If you use Orion, please cite our paper: 73 | ```bibtex 74 | @inproceedings{eurosys24orion, 75 | author = {Strati, Foteini and Ma, Xianzhe and Klimovic, Ana}, 76 | title = {Orion: Interference-aware, Fine-grained GPU Sharing for ML Applications}, 77 | year = {2024}, 78 | isbn = {9798400704376}, 79 | publisher = {Association for Computing Machinery}, 80 | address = {New York, NY, USA}, 81 | url = {https://doi.org/10.1145/3627703.3629578}, 82 | doi = {10.1145/3627703.3629578}, 83 | booktitle = {Proceedings of the Nineteenth European Conference on Computer Systems}, 84 | pages = {1075–1092}, 85 | numpages = {18}, 86 | keywords = {GPUs, Machine Learning}, 87 | location = {Athens, Greece}, 88 | series = {EuroSys '24} 89 | } 90 | ``` 91 | -------------------------------------------------------------------------------- /artifact_evaluation/example/README.md: -------------------------------------------------------------------------------- 1 | This is a simple example to check that Orion has been installed correctly and can run. 2 | 3 | Please follow the instructions in [INSTALL](INSTALL.md) to start a container with our image.
4 | Then start the Orion process (server and client) by running: 5 | * `cd /root/orion/benchmarking` 6 | * `LD_PRELOAD="/root/orion/src/cuda_capture/libinttemp.so" python launch_jobs.py /root/orion/artifact_evaluation/example/config.json 1 1 1` -------------------------------------------------------------------------------- /artifact_evaluation/example/config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 2000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 30, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | } 16 | ] 17 | -------------------------------------------------------------------------------- /artifact_evaluation/fig10/client_0.json: -------------------------------------------------------------------------------- 1 | {"p50_latency": 19.11342144012451, "p95_latency": 25.438904762268066, "p99_latency": 104.0643930435141, "throughput": 19.992602003050518} -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/bert_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_2_fwd", 5 | "num_kernels": 572, 6 | "num_iters": 2500000, 7 | "args": { 8 | "batchsize": 2, 9 | "rps": 8, 10 | "uniform": true, 11 | "dummy_data": true, 12 | "train": false 13 | } 14 | }, 15 | { 16 | "arch": "mobilenet_v2", 17 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 18 | "num_kernels": 152, 19 | "num_iters": 6240, 20 | "args": { 21 | "model_name": "mobilenet_v2", 22 | "batchsize": 4, 23 | "rps": 0, 24 | "uniform": true, 25 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | } 30 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/bert_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_2_fwd", 5 | "num_kernels": 572, 6 | "num_iters": 2500000, 7 | "args": { 8 | "batchsize": 2, 9 | "rps": 8, 10 | "uniform": true, 11 | "dummy_data": true, 12 | "train": false 13 | } 14 | }, 15 | { 16 | "arch": "resnet50", 17 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 18 | "num_kernels": 175, 19 | "num_iters": 6240, 20 | "args": { 21 | "model_name": "resnet50", 22 | "batchsize": 4, 23 | "rps": 0, 24 | "uniform": true, 25 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | } 30 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/ideal/mnet_inf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 5 | "num_kernels": 152, 6 | "num_iters": 6240, 7 | "args": { 8 | "model_name": "mobilenet_v2", 9 | "batchsize": 4, 10 | "rps": 0, 11 | "uniform": false, 12 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 13 | 
"dummy_data": true, 14 | "train": false 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/ideal/rnet_inf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 6240, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "dummy_data": true, 11 | "rps": 0, 12 | "uniform": false, 13 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 14 | "train": false 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/mnet_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 5 | "num_kernels": 152, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "mobilenet_v2", 9 | "batchsize": 4, 10 | "rps": 100, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "mobilenet_v2", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 19 | "num_kernels": 152, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "mobilenet_v2", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/mnet_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 5 | "num_kernels": 152, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "mobilenet_v2", 9 | "batchsize": 4, 10 | "rps": 100, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "resnet50", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 19 | "num_kernels": 175, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "resnet50", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/mps/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | policy: "MPS" # "MPS", "TickTock", "Streams", "Isolated", or "Sequential" 3 | models: 4 | model0: 5 | mode: eval # train or eval 6 | name: bert # these two names should strictly correspond to the model names below 7 | model1: 8 | mode: eval # train or eval 9 | name: mobilenet_v2 10 | shared_config: 11 | distribution: trace # poisson, uniform, or trace 12 | trace_path: '../../inter_arrival_times.json' # only used when distribution is trace 13 | pin_memory: true 14 | seed: 42 15 | 16 | # configuration for each model 17 | resnet50: 18 | arch: resnet50 19 | batch_size: 4 20 | num_iterations: 1000000 21 | request_rate: 80 # measured in 1/seconds. 
If 0 it means no sleep 22 | resnet101: 23 | arch: resnet101 24 | batch_size: 4 25 | num_iterations: 1000000 26 | request_rate: 40 # measured in 1/seconds. If 0 it means no sleep 27 | mobilenet_v2: 28 | arch: mobilenet_v2 29 | batch_size: 4 30 | num_iterations: 1000000 31 | request_rate: 100 # measured in 1/seconds. If 0 it means no sleep 32 | bert: 33 | batch_size: 2 34 | arch: large # either base or large 35 | num_iterations: 1000000 36 | request_rate: 8 # measured in 1/seconds. If 0 it means no sleep 37 | # large_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16' 38 | # base_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-12_H-768_A-12' 39 | transformer: 40 | arch: base # either base or large 41 | batch_size: 4 42 | num_iterations: 1000000 43 | request_rate: 20 # measured in 1/seconds. If 0 it means no sleep -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/mps/run.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import itertools 3 | import logging 4 | import os 5 | 6 | mnames = { 7 | 'resnet50': "ResNet50", 8 | 'mobilenet_v2': "MobileNetV2", 9 | 'resnet101': 'ResNet101', 10 | 'bert': 'BERT', 11 | 'transformer': 'Transformer' 12 | } 13 | 14 | def run(model0, model1, config, combination_name, times=1, start_id = 0): 15 | 16 | config_file_name = f'gen_conf_{combination_name}.yaml' 17 | 18 | logging.info(f'dump config to {config_file_name}') 19 | with open(f'./{config_file_name}', 'w') as file: 20 | yaml.dump(config, file) 21 | # run python main.py 22 | logging.info(f'training with this config {times} times') 23 | 24 | 25 | for i in range(start_id, start_id + times): 26 | log_file = f'log_{i}_{combination_name}.log' 27 | os.system(f"python3.8 {os.path.expanduser( '~' )}/orion/related/baselines/main.py --config ./{config_file_name}") 28 | print(f"{combination_name}.log.json") 29 | os.system(f"cp {combination_name}.log.json ../../results/mps/{mnames[model0]}_{mnames[model1]}_{i}.json") 30 | 31 | 32 | 33 | if __name__ == "__main__": 34 | logging.basicConfig( 35 | level=logging.INFO, 36 | format='%(asctime)s %(levelname)-8s: [%(filename)s:%(lineno)d] %(message)s', 37 | datefmt='%d/%m/%Y %H:%M:%S', 38 | handlers=[ 39 | # output to console 40 | logging.StreamHandler(), 41 | ] 42 | ) 43 | with open('./config.yaml', 'r') as file: 44 | default_full_config = yaml.load(file, Loader=yaml.FullLoader) 45 | 46 | # ----configuration region started---- 47 | model0_mode = 'eval' 48 | model1_mode = 'eval' 49 | 50 | policy = 'MPS' 51 | 52 | train_batch_sizes = { 53 | 'resnet50': 32, 54 | 'mobilenet_v2': 64, 55 | 'resnet101': 32, 56 | 'bert': 8, 57 | 'transformer': 8 58 | } 59 | 60 | eval_batch_sizes = { 61 | 'resnet50': 4, 62 | 'mobilenet_v2': 4, 63 | 'resnet101': 4, 64 | 'bert': 2, 65 | 'transformer': 4 66 | } 67 | 68 | 69 | models = ['resnet50', 'mobilenet_v2', 'resnet101', 'bert', 'transformer'] 70 | combinations = itertools.product(models[:2], models) 71 | times = 3 72 | start_id = 0 73 | distribution = 'trace' 74 | 75 | 76 | # ----configuration region ended---- 77 | 78 | default_full_config['shared_config']['distribution'] = distribution 79 | 80 | for model0, model1 in combinations: 81 | default_full_config['models']['model0']['name'] = model0 82 | default_full_config['models']['model0']['mode'] = model0_mode 83 | 
default_full_config['models']['model1']['name'] = model1 84 | default_full_config['models']['model1']['mode'] = model1_mode 85 | default_full_config['policy'] = policy 86 | 87 | combination_name = f'{model0_mode}-{model0}{model1_mode}-{model1}' 88 | run(model0, model1, default_full_config, combination_name, times=times, start_id=start_id) -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/rnet101_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_4_fwd", 5 | "num_kernels": 345, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "resnet101", 9 | "batchsize": 4, 10 | "rps": 40, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "mobilenet_v2", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 19 | "num_kernels": 152, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "mobilenet_v2", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/rnet101_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_4_fwd", 5 | "num_kernels": 345, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "resnet101", 9 | "batchsize": 4, 10 | "rps": 40, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "resnet50", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 19 | "num_kernels": 175, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "resnet50", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/rnet_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 80, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "mobilenet_v2", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 19 | "num_kernels": 152, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "mobilenet_v2", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/rnet_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": 
"/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 2500000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 80, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "resnet50", 18 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 19 | "num_kernels": 175, 20 | "num_iters": 6240, 21 | "args": { 22 | "model_name": "resnet50", 23 | "batchsize": 4, 24 | "rps": 0, 25 | "uniform": true, 26 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/trans_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_4_fwd", 5 | "num_kernels": 459, 6 | "num_iters": 2500000, 7 | "args": { 8 | "batchsize": 4, 9 | "rps": 20, 10 | "uniform": true, 11 | "dummy_data": true, 12 | "train": false 13 | } 14 | }, 15 | { 16 | "arch": "mobilenet_v2", 17 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 18 | "num_kernels": 152, 19 | "num_iters": 6240, 20 | "args": { 21 | "model_name": "mobilenet_v2", 22 | "batchsize": 4, 23 | "rps": 0, 24 | "uniform": true, 25 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | } 30 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/config_files/trans_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_4_fwd", 5 | "num_kernels": 459, 6 | "num_iters": 2500000, 7 | "args": { 8 | "batchsize": 4, 9 | "rps": 20, 10 | "uniform": true, 11 | "dummy_data": true, 12 | "train": false 13 | } 14 | }, 15 | { 16 | "arch": "resnet50", 17 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 18 | "num_kernels": 175, 19 | "num_iters": 6240, 20 | "args": { 21 | "model_name": "resnet50", 22 | "batchsize": 4, 23 | "rps": 0, 24 | "uniform": false, 25 | "input_file": "/root/orion/artifact_evaluation/fig10/inter_arrival_times.json", 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | } 30 | 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig10/gather_results.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import json 5 | import itertools 6 | 7 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 8 | baselines = ['reef', 'orion', 'mps', 'ideal'] 9 | 10 | hp_list = ['ResNet50', 'MobileNetV2'] 11 | be_list = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 12 | num_runs = 3 13 | 14 | df_ideal = pd.DataFrame("0", index=models, columns=models) 15 | for hp in hp_list: 16 | results = [] 17 | for run in range(num_runs): 18 | input_file = f"results/ideal/{hp}_{run}_hp.json" 19 | with open(input_file, 'r') as f: 20 | data = json.load(f) 21 | results.append(float(data['p95_latency'])) 22 | 23 | for be in be_list: 24 | df_ideal.at[be, hp] = 
f"{round(np.average(results),2)}/{round(np.std(results),2)}" 25 | df_ideal.to_csv(f'results/ideal_latency.csv') 26 | print("ideal") 27 | print(df_ideal) 28 | 29 | # mps 30 | df_mps = pd.DataFrame("0", index=models, columns=models) 31 | # gather MPS results for every (be, hp) pair 32 | for be, hp in itertools.product(be_list, hp_list): 33 | results = [] 34 | for run in range(num_runs): 35 | input_file = f"results/mps/{hp}_{be}_{run}.json" 36 | with open(input_file, 'r') as f: 37 | data = json.load(f) 38 | results.append(float(data['p95-latency-0'])) 39 | df_mps.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 40 | df_mps.to_csv(f'results/mps_latency.csv') 41 | print("mps") 42 | print(df_mps) 43 | 44 | for baseline in baselines[:-2]: 45 | df = pd.DataFrame("0", index=models, columns=models) 46 | for be,hp in itertools.product(be_list, hp_list): 47 | results = [] 48 | for run in range(num_runs): 49 | input_file = f"results/{baseline}/{be}_{hp}_{run}_hp.json" 50 | with open(input_file, 'r') as f: 51 | data = json.load(f) 52 | results.append(float(data['p95_latency'])) 53 | df.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 54 | df.to_csv(f'results/{baseline}_latency.csv') 55 | print(baseline) 56 | print(df) 57 | -------------------------------------------------------------------------------- /artifact_evaluation/fig10/plot_latency.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 6 | 7 | # %% 8 | 9 | def get_data(csv_file, error=False): 10 | df = pd.read_csv(csv_file) 11 | df = df.drop(df.columns[0], axis=1) 12 | df.index = models 13 | 14 | df = df.drop(df.columns[-3], axis=1) 15 | df = df.drop(df.columns[-2], axis=1) 16 | df = df.drop(df.columns[-1], axis=1) 17 | 18 | for model_row in models: 19 | for model_col in models[:2]: 20 | cell = df.at[model_row, model_col] 21 | df.at[model_row, model_col] = float(cell.split('/')[0]) #float(cell.split('/')[1]) if error else float(cell.split('/')[0]) 22 | if error: 23 | return df.std() 24 | else: 25 | return df.mean() 26 | 27 | # %% 28 | method2file = { 29 | 'MPS': 'results/mps_latency.csv', 30 | 'REEF policy': 'results/reef_latency.csv', 31 | 'Orion': 'results/orion_latency.csv', 32 | 'Ideal': 'results/ideal_latency.csv' 33 | } 34 | 35 | label_font_size = 22 36 | methods = list(method2file.keys()) 37 | 38 | method2data = {} 39 | method2err = {} 40 | 41 | for method, file in method2file.items(): 42 | method2data[method] = get_data(file) 43 | method2err[method] = get_data(file, error=True) 44 | 45 | width = 0.15 46 | fig, ax = plt.subplots(figsize=(14, 8)) 47 | x = np.arange(2) 48 | bars = [] 49 | for method_id, method in enumerate(methods): 50 | 51 | bar = ax.bar( 52 | x + width * method_id, method2data[method], width, 53 | label=method, yerr=method2err[method], 54 | align='edge' 55 | ) 56 | bars.append(bar) 57 | 58 | x_tick_positions = x + width * len(methods) / 2 59 | ax.set_xticks( 60 | ticks=x_tick_positions, 61 | labels=models[:2], fontsize=22 62 | ) 63 | plt.yticks(fontsize=22) 64 | ax.set_ylabel('Average p95 inference latency (ms)', fontsize=label_font_size) 65 | ax.set_xlabel('High-priority inference job', fontsize=label_font_size) 66 | 67 | plt.tight_layout() 68 | handles, labels = ax.get_legend_handles_labels() 69 | plt.legend(handles, labels, loc='upper left', ncol=1, fontsize=20) 70 | 71 | plt.savefig("fig10.png",
bbox_inches="tight") 72 | -------------------------------------------------------------------------------- /artifact_evaluation/fig10/prep_dirs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir results 4 | mkdir results/ideal 5 | mkdir results/reef 6 | mkdir results/orion 7 | mkdir results/mps -------------------------------------------------------------------------------- /artifact_evaluation/fig10/run_ideal.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files_hp = [ 6 | ("ResNet50", "rnet"), 7 | ("MobileNetV2", "mnet"), 8 | ] 9 | 10 | for (model, f) in trace_files_hp: 11 | for run in range(num_runs): 12 | print(model, run, flush=True) 13 | # run 14 | file_path = f"config_files/ideal/{f}_inf.json" 15 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path}") 16 | 17 | # copy results 18 | os.system(f"cp client_0.json results/ideal/{model}_{run}_hp.json") 19 | os.system("rm client_0.json") -------------------------------------------------------------------------------- /artifact_evaluation/fig10/run_orion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files = [ 6 | ("ResNet50", "ResNet50", "rnet_rnet", 160000), 7 | ("ResNet50", "MobileNetV2", "rnet_mnet", 100000), 8 | ("MobileNetV2", "ResNet50", "mnet_rnet", 160000), 9 | ("MobileNetV2", "MobileNetV2", "mnet_mnet", 100000), 10 | ("ResNet101", "ResNet50", "rnet101_rnet", 160000), 11 | ("ResNet101", "MobileNetV2", "rnet101_mnet", 100000), 12 | ("BERT", "ResNet50", "bert_rnet", 160000), 13 | ("BERT", "MobileNetV2", "bert_mnet", 100000), 14 | ("Transformer", "ResNet50", "trans_rnet", 160000), 15 | ("Transformer", "MobileNetV2", "trans_mnet", 100000), 16 | ] 17 | 18 | for (be, hp, f, max_be_duration) in trace_files: 19 | for run in range(num_runs): 20 | print(be, hp, run, flush=True) 21 | # run 22 | file_path = f"config_files/{f}.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path} --orion_max_be_duration {max_be_duration}") 24 | 25 | # copy results 26 | os.system(f"cp client_1.json results/orion/{be}_{hp}_{run}_hp.json") 27 | os.system("rm -rf client_1.json") 28 | -------------------------------------------------------------------------------- /artifact_evaluation/fig10/run_reef.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files = [ 6 | ("ResNet50", "ResNet50", "rnet_rnet"), 7 | ("ResNet50", "MobileNetV2", "rnet_mnet"), 8 | ("MobileNetV2", "ResNet50", "mnet_rnet"), 9 | ("MobileNetV2", "MobileNetV2", "mnet_mnet"), 10 | ("ResNet101", "ResNet50", "rnet101_rnet"), 11 | ("ResNet101", "MobileNetV2", "rnet101_mnet"), 12 | ("BERT", "ResNet50", "bert_rnet"), 13 | ("BERT", "MobileNetV2", "bert_mnet"), 14 | ("Transformer", "ResNet50", "trans_rnet"), 15 | ("Transformer", "MobileNetV2", "trans_mnet"), 16 | ] 17 | 18 | for (be, hp, f) in trace_files: 19 | for run in range(num_runs): 20 | print(be, hp, run, flush=True) 21 | # run 22 | file_path = f"config_files/{f}.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' 
)}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo reef --config_file {file_path} --reef_depth 12") 24 | 25 | # copy results 26 | os.system(f"cp client_1.json results/reef/{be}_{hp}_{run}_hp.json") 27 | os.system("rm client_1.json") 28 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/bert_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb1", 6 | "num_kernels": 4777, 7 | "additional_num_kernels": 4777, 8 | "num_iters": 1200000, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": false, 14 | "train": true 15 | } 16 | }, 17 | { 18 | "arch": "mobilenet_v2", 19 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 20 | "num_kernels": 152, 21 | "num_iters": 12000, 22 | "args": { 23 | "model_name": "mobilenet_v2", 24 | "batchsize": 4, 25 | "rps": 40, 26 | "uniform": false, 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/bert_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb1", 6 | "num_kernels": 4777, 7 | "additional_num_kernels": 4777, 8 | "num_iters": 550000, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": false, 14 | "train": true 15 | } 16 | }, 17 | { 18 | "arch": "resnet50", 19 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 20 | "num_kernels": 175, 21 | "num_iters": 9200, 22 | "args": { 23 | "model_name": "resnet50", 24 | "batchsize": 4, 25 | "rps": 15, 26 | "uniform": false, 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/bert_train.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "bert", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb1", 6 | "num_kernels": 4777, 7 | "additional_num_kernels": 4777, 8 | "num_iters": 200, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": false, 14 | "train": true 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/mnet_inf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 5 | "num_kernels": 152, 6 | "num_iters": 12000, 7 | "args": { 8 | "model_name": "mobilenet_v2", 9 | "batchsize": 4, 10 | "rps": 40, 11 | "uniform": false, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | } 16 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/mnet_train.json: 
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb1", 6 | "num_kernels": 574, 7 | "additional_num_kernels": 890, 8 | "num_iters": 200, 9 | "args": { 10 | "model_name": "mobilenet_v2", 11 | "batchsize": 64, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/rnet101_train.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb1", 6 | "num_kernels": 1219, 7 | "additional_num_kernels": 1847, 8 | "num_iters": 200, 9 | "args": { 10 | "model_name": "resnet101", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/rnet_inf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 9200, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 15, 11 | "uniform": false, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | } 16 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/rnet_train.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb1", 6 | "num_kernels": 624, 7 | "additional_num_kernels": 946, 8 | "num_iters": 200, 9 | "args": { 10 | "model_name": "resnet50", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/ideal/trans_train.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb1", 6 | "num_kernels": 4396, 7 | "additional_num_kernels": 4354, 8 | "num_iters": 200, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": true, 14 | "train": true 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mnet_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb1", 6 | 
"num_kernels": 574, 7 | "additional_num_kernels": 890, 8 | "num_iters": 1200000, 9 | "args": { 10 | "model_name": "mobilenet_v2", 11 | "batchsize": 64, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "mobilenet_v2", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 21 | "num_kernels": 152, 22 | "num_iters": 12000, 23 | "args": { 24 | "model_name": "mobilenet_v2", 25 | "batchsize": 4, 26 | "rps": 40, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mnet_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "mobilenet_v2", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_64_fb1", 6 | "num_kernels": 574, 7 | "additional_num_kernels": 890, 8 | "num_iters": 920000, 9 | "args": { 10 | "model_name": "mobilenet_v2", 11 | "batchsize": 64, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "resnet50", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 21 | "num_kernels": 175, 22 | "num_iters": 9200, 23 | "args": { 24 | "model_name": "resnet50", 25 | "batchsize": 4, 26 | "rps": 15, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mps/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | policy: "MPS" # "MPS", "TickTock", "Streams", "Isolated", or "Sequential" 3 | models: 4 | model0: 5 | mode: eval # train or eval 6 | name: bert # these two names should strictly correspond to the model names below 7 | model1: 8 | mode: eval # train or eval 9 | name: mobilenet_v2 10 | shared_config: 11 | distribution: poisson # poisson, uniform, or trace 12 | trace_path: './inter_arrival_times.json' # only used when distribution is trace 13 | pin_memory: true 14 | seed: 42 15 | 16 | # configuration for each model 17 | resnet50: 18 | arch: resnet50 19 | batch_size: 4 20 | num_iterations: 1000000 21 | request_rate: 15 # measured in 1/seconds. If 0 it means no sleep 22 | resnet101: 23 | arch: resnet101 24 | batch_size: 32 25 | num_iterations: 1000000 26 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 27 | mobilenet_v2: 28 | arch: mobilenet_v2 29 | batch_size: 4 30 | num_iterations: 1000000 31 | request_rate: 40 # measured in 1/seconds. If 0 it means no sleep 32 | bert: 33 | batch_size: 8 34 | arch: base # either base or large 35 | num_iterations: 1000000 36 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 37 | # large_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16' 38 | # base_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-12_H-768_A-12' 39 | transformer: 40 | arch: base # either base or large 41 | batch_size: 8 42 | num_iterations: 1000000 43 | request_rate: 0 # measured in 1/seconds. 
If 0 it means no sleep 44 | 45 | resnet50-1: 46 | arch: resnet50 47 | batch_size: 32 48 | num_iterations: 1000000 49 | request_rate: 80 # measured in 1/seconds. If 0 it means no sleep 50 | mobilenet_v2-1: 51 | arch: mobilenet_v2 52 | batch_size: 64 53 | num_iterations: 1000000 54 | request_rate: 100 # measured in 1/seconds. If 0 it means no sleep 55 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mps/eval-resnet50train-resnet50-1.log.json: -------------------------------------------------------------------------------- 1 | { 2 | "duration-1": 2.927621603012085, 3 | "iterations-1": 38, 4 | "throughput-1": 9.222503335889117, 5 | "duration": 2.9278297424316406, 6 | "p50-latency-0": 64.3700361251831, 7 | "throughput-0": 32.203886399001064, 8 | "p90-latency-0": 142.8278207778931, 9 | "p95-latency-0": 156.5918684005737, 10 | "p99-latency-0": 181.01235866546628 11 | } -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mps/gen_conf_eval-resnet50train-resnet50.yaml: -------------------------------------------------------------------------------- 1 | bert: 2 | arch: base 3 | batch_size: 8 4 | num_iterations: 10000 5 | request_rate: 0 6 | mobilenet_v2: 7 | arch: mobilenet_v2 8 | batch_size: 4 9 | num_iterations: 10000 10 | request_rate: 40 11 | mobilenet_v2-1: 12 | arch: mobilenet_v2 13 | batch_size: 64 14 | num_iterations: 10000 15 | request_rate: 100 16 | models: 17 | model0: 18 | mode: eval 19 | name: resnet50 20 | model1: 21 | mode: train 22 | name: resnet50-1 23 | policy: MPS 24 | resnet101: 25 | arch: resnet101 26 | batch_size: 32 27 | num_iterations: 10000 28 | request_rate: 0 29 | resnet50: 30 | arch: resnet50 31 | batch_size: 4 32 | num_iterations: 100 33 | request_rate: 30 34 | resnet50-1: 35 | arch: resnet50 36 | batch_size: 32 37 | num_iterations: 10000 38 | request_rate: 80 39 | shared_config: 40 | distribution: poisson 41 | pin_memory: true 42 | seed: 42 43 | trace_path: ./inter_arrival_times.json 44 | transformer: 45 | arch: base 46 | batch_size: 8 47 | num_iterations: 10000 48 | request_rate: 0 49 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/mps/run.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import itertools 3 | import logging 4 | import os 5 | 6 | mnames = { 7 | 'resnet50': "ResNet50", 8 | 'mobilenet_v2': "MobileNetV2", 9 | 'resnet101': 'ResNet101', 10 | 'bert': 'BERT', 11 | 'transformer': 'Transformer' 12 | } 13 | 14 | def run(model0, model1, config, combination_name, times=1, start_id = 0): 15 | 16 | config_file_name = f'gen_conf_{combination_name}.yaml' 17 | 18 | logging.info(f'dump config to {config_file_name}') 19 | with open(f'./{config_file_name}', 'w') as file: 20 | yaml.dump(config, file) 21 | # run python main.py 22 | logging.info(f'training with this config {times} times') 23 | 24 | 25 | for i in range(start_id, start_id + times): 26 | log_file = f'log_{i}_{combination_name}.log' 27 | os.system(f"python3.8 {os.path.expanduser( '~' )}/orion/related/baselines/main.py --config ./{config_file_name}") 28 | print(f"{combination_name}.log.json") 29 | os.system(f"cp {combination_name}.log.json ../../results/mps/{mnames[model0]}_{mnames[model1]}_{i}.json") 30 | 31 | 32 | 33 | if __name__ == "__main__": 34 | logging.basicConfig( 35 | level=logging.INFO, 36 | format='%(asctime)s %(levelname)-8s: 
[%(filename)s:%(lineno)d] %(message)s', 37 | datefmt='%d/%m/%Y %H:%M:%S', 38 | handlers=[ 39 | # output to console 40 | logging.StreamHandler(), 41 | ] 42 | ) 43 | with open('./config.yaml', 'r') as file: 44 | default_full_config = yaml.load(file, Loader=yaml.FullLoader) 45 | 46 | # ----configuration region started---- 47 | model0_mode = 'eval' 48 | model1_mode = 'train' 49 | 50 | policy = 'MPS' 51 | 52 | train_batch_sizes = { 53 | 'resnet50': 32, 54 | 'mobilenet_v2': 64, 55 | 'resnet101': 32, 56 | 'bert': 8, 57 | 'transformer': 8 58 | } 59 | 60 | eval_batch_sizes = { 61 | 'resnet50': 4, 62 | 'mobilenet_v2': 4, 63 | 'resnet101': 4, 64 | 'bert': 2, 65 | 'transformer': 4 66 | } 67 | 68 | request_rates = { 69 | 'resnet50': 15, 70 | 'mobilenet_v2': 40, 71 | } 72 | 73 | num_reqs = { 74 | 'resnet50': 9200, 75 | 'mobilenet_v2': 12000, 76 | } 77 | 78 | models = ['resnet50', 'mobilenet_v2', 'resnet101', 'bert', 'transformer'] 79 | combinations = itertools.product(models[:2], models) 80 | times = 3 81 | start_id = 0 82 | distribution = 'poisson' 83 | 84 | 85 | # ----configuration region ended---- 86 | 87 | default_full_config['shared_config']['distribution'] = distribution 88 | 89 | for model0, model1 in combinations: 90 | default_full_config['models']['model0']['name'] = model0 91 | default_full_config['models']['model0']['mode'] = model0_mode 92 | default_full_config['models']['model1']['name'] = model1 if model0 != model1 else model1 + '-1' 93 | default_full_config['models']['model1']['mode'] = model1_mode 94 | default_full_config['policy'] = policy 95 | 96 | if model0 != model1: 97 | 98 | default_full_config[model0]['request_rate'] = request_rates[model0] 99 | default_full_config[model0]['num_iterations'] = num_reqs[model0] 100 | 101 | default_full_config[model0]['batch_size'] = eval_batch_sizes[model0] 102 | default_full_config[model1]['batch_size'] = train_batch_sizes[model1] 103 | 104 | combination_name = f'{model0_mode}-{model0}{model1_mode}-{model1}' 105 | run(model0, model1, default_full_config, combination_name, times=times, start_id=start_id) 106 | else: 107 | model1_with_suffix = model1 + '-1' 108 | 109 | default_full_config[model0]['request_rate'] = request_rates[model0] 110 | default_full_config[model0]['num_iterations'] = num_reqs[model0] 111 | 112 | default_full_config[model0]['batch_size'] = eval_batch_sizes[model0] 113 | default_full_config[model1_with_suffix]['batch_size'] = train_batch_sizes[model1] 114 | 115 | combination_name = f'{model0_mode}-{model0}{model1_mode}-{model1_with_suffix}' 116 | run(model0, model1, default_full_config, combination_name, times=times, start_id=start_id) -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/rnet101_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb1", 6 | "num_kernels": 1219, 7 | "additional_num_kernels": 1847, 8 | "num_iters": 1200000, 9 | "args": { 10 | "model_name": "resnet101", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "mobilenet_v2", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 21 | "num_kernels": 152, 22 | "num_iters": 12000, 23 | "args": { 24 | "model_name": "mobilenet_v2", 25 | "batchsize": 4, 
26 | "rps": 40, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/rnet101_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet101", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet101_32_fb1", 6 | "num_kernels": 1219, 7 | "additional_num_kernels": 1847, 8 | "num_iters": 920000, 9 | "args": { 10 | "model_name": "resnet101", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": true, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "resnet50", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 21 | "num_kernels": 175, 22 | "num_iters": 9200, 23 | "args": { 24 | "model_name": "resnet50", 25 | "batchsize": 4, 26 | "rps": 15, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/rnet_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb1", 6 | "num_kernels": 624, 7 | "additional_num_kernels": 946, 8 | "num_iters": 1200000, 9 | "args": { 10 | "model_name": "resnet50", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "mobilenet_v2", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 21 | "num_kernels": 152, 22 | "num_iters": 12000, 23 | "args": { 24 | "model_name": "mobilenet_v2", 25 | "batchsize": 4, 26 | "rps": 40, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/rnet_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_32_fb1", 6 | "num_kernels": 624, 7 | "additional_num_kernels": 946, 8 | "num_iters": 550000, 9 | "args": { 10 | "model_name": "resnet50", 11 | "batchsize": 32, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": true, 15 | "train": true 16 | } 17 | }, 18 | { 19 | "arch": "resnet50", 20 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 21 | "num_kernels": 175, 22 | "num_iters": 9200, 23 | "args": { 24 | "model_name": "resnet50", 25 | "batchsize": 4, 26 | "rps": 15, 27 | "uniform": false, 28 | "dummy_data": true, 29 | "train": false 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/trans_mnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb1", 6 | 
"num_kernels": 4396, 7 | "additional_num_kernels": 4354, 8 | "num_iters": 1200000, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": true, 14 | "train": true 15 | } 16 | }, 17 | { 18 | "arch": "mobilenet_v2", 19 | "kernel_file": "/root/orion/benchmarking/model_kernels/mobilenetv2_4_fwd", 20 | "num_kernels": 152, 21 | "num_iters": 12000, 22 | "args": { 23 | "model_name": "mobilenet_v2", 24 | "batchsize": 4, 25 | "rps": 40, 26 | "uniform": false, 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/config_files/trans_rnet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "transformer", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb0", 5 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb1", 6 | "num_kernels": 4396, 7 | "additional_num_kernels": 4354, 8 | "num_iters": 550000, 9 | "args": { 10 | "batchsize": 8, 11 | "rps": 0, 12 | "uniform": false, 13 | "dummy_data": true, 14 | "train": true 15 | } 16 | }, 17 | { 18 | "arch": "resnet50", 19 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 20 | "num_kernels": 175, 21 | "num_iters": 9200, 22 | "args": { 23 | "model_name": "resnet50", 24 | "batchsize": 4, 25 | "rps": 15, 26 | "uniform": false, 27 | "dummy_data": true, 28 | "train": false 29 | } 30 | } 31 | ] -------------------------------------------------------------------------------- /artifact_evaluation/fig7/gather_latency.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import json 5 | import itertools 6 | 7 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 8 | baselines = ['reef', 'orion', 'mps', 'ideal'] 9 | 10 | hp_list = ['ResNet50', 'MobileNetV2'] 11 | be_list = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 12 | num_runs = 3 13 | 14 | # ideal 15 | df_ideal = pd.DataFrame(0.0, index=models, columns=models) 16 | for hp in hp_list: 17 | results = [] 18 | for run in range(num_runs): 19 | input_file = f"results/ideal/{hp}_{run}_hp.json" 20 | with open(input_file, 'r') as f: 21 | data = json.load(f) 22 | results.append(float(data['p95_latency'])) 23 | print(hp, results) 24 | for be in be_list: 25 | df_ideal.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 26 | df_ideal.to_csv(f'results/ideal_latency.csv') 27 | print(df_ideal) 28 | 29 | # mps 30 | df_mps = pd.DataFrame(0.0, index=models, columns=models) 31 | for be,hp in itertools.product(be_list, hp_list): 32 | results = [] 33 | for run in range(num_runs): 34 | input_file = f"results/mps/{hp}_{be}_{run}.json" 35 | with open(input_file, 'r') as f: 36 | data = json.load(f) 37 | results.append(float(data['p95-latency-0'])) 38 | df_mps.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 39 | df_mps.to_csv(f'results/mps_latency.csv') 40 | print(df_mps) 41 | 42 | # get rest baselines 43 | for baseline in baselines[:-2]: 44 | df = pd.DataFrame(0.0, index=models, columns=models) 45 | for be,hp in itertools.product(be_list, hp_list): 46 | results = [] 47 | for run in range(num_runs): 48 | input_file = f"results/{baseline}/{be}_{hp}_{run}_hp.json" 49 | with open(input_file, 'r') as f: 50 | data = json.load(f) 51 | 
results.append(float(data['p95_latency'])) 52 | df.at[be, hp] = f"{round(np.average(results),2)}/{round(np.std(results),2)}" 53 | df.to_csv(f'results/{baseline}_latency.csv') 54 | print(baseline, df) 55 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/gather_throughput.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import json 5 | import itertools 6 | 7 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 8 | baselines = ['reef', 'orion', 'mps', 'ideal'] 9 | 10 | hp_list = ['ResNet50', 'MobileNetV2'] 11 | be_list = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 12 | num_runs = 3 13 | 14 | df_hp_ideal_throughput = pd.DataFrame("0", index=models, columns=models) 15 | df_be_ideal_throughput = pd.DataFrame("0", index=models, columns=models) 16 | for hp in hp_list: 17 | res_hp = [] 18 | for run in range(num_runs): 19 | input_file_hp = f"results/ideal/{hp}_{run}_hp.json" 20 | with open(input_file_hp, 'r') as f: 21 | data = json.load(f) 22 | res_hp.append(float(data['throughput'])) 23 | for be in be_list: 24 | print(round(np.average(res_hp),2)) 25 | df_hp_ideal_throughput.at[be, hp] = f"{round(np.average(res_hp),2)}/{round(np.std(res_hp),2)}" 26 | 27 | for be in be_list: 28 | res_be = [] 29 | for run in range(num_runs): 30 | input_file_be = f"results/ideal/{be}_{run}_be.json" 31 | with open(input_file_be, 'r') as f: 32 | data = json.load(f) 33 | res_be.append(float(data['throughput'])) 34 | for hp in hp_list: 35 | df_be_ideal_throughput.at[be, hp] = f"{round(np.average(res_be),2)}/{round(np.std(res_be),2)}" 36 | 37 | df_hp_ideal_throughput.to_csv(f'results/inf_throughput_ideal.csv') 38 | df_be_ideal_throughput.to_csv(f'results/train_throughput_ideal.csv') 39 | print("ideal") 40 | print(df_hp_ideal_throughput) 41 | print(df_be_ideal_throughput) 42 | 43 | df_hp_mps_throughput = pd.DataFrame("0", index=models, columns=models) 44 | df_be_mps_throughput = pd.DataFrame("0", index=models, columns=models) 45 | for be,hp in itertools.product(be_list, hp_list): 46 | res_hp = [] 47 | res_be = [] 48 | for run in range(num_runs): 49 | input_file_hp = f"results/mps/{hp}_{be}_{run}.json" 50 | with open(input_file_hp, 'r') as f: 51 | data = json.load(f) 52 | res_be.append(float(data['throughput-1'])) 53 | res_hp.append(float(data['throughput-0'])) 54 | 55 | df_hp_mps_throughput.at[be, hp] = f"{round(np.average(res_hp),2)}/{round(np.std(res_hp),2)}" 56 | df_be_mps_throughput.at[be, hp] = f"{round(np.average(res_be),2)}/{round(np.std(res_be),2)}" 57 | 58 | df_hp_mps_throughput.to_csv(f'results/inf_throughput_mps.csv') 59 | df_be_mps_throughput.to_csv(f'results/train_throughput_mps.csv') 60 | print("mps") 61 | print(df_hp_mps_throughput) 62 | print(df_be_mps_throughput) 63 | 64 | for baseline in baselines[:-2]: 65 | df_hp_throughput = pd.DataFrame("0", index=models, columns=models) 66 | df_be_throughput = pd.DataFrame("0", index=models, columns=models) 67 | for be,hp in itertools.product(be_list, hp_list): 68 | res_hp = [] 69 | res_be = [] 70 | for run in range(num_runs): 71 | input_file_hp = f"results/{baseline}/{be}_{hp}_{run}_hp.json" 72 | with open(input_file_hp, 'r') as f: 73 | data = json.load(f) 74 | res_hp.append(float(data['throughput'])) 75 | 76 | input_file_be = f"results/{baseline}/{be}_{hp}_{run}_be.json" 77 | with open(input_file_be, 'r') as f: 78 | data = json.load(f) 79 | 
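# best-effort (training) clients only report a "throughput" field
# (cf. benchmarking/be.json), so no latency percentiles are read here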
res_be.append(float(data['throughput'])) 80 | 81 | df_hp_throughput.at[be, hp] = f"{round(np.average(res_hp),2)}/{round(np.std(res_hp),2)}" 82 | df_be_throughput.at[be, hp] = f"{round(np.average(res_be),2)}/{round(np.std(res_be),2)}" 83 | 84 | print(baseline) 85 | print(df_hp_throughput) 86 | print(df_be_throughput) 87 | 88 | df_hp_throughput.to_csv(f'results/inf_throughput_{baseline}.csv') 89 | df_be_throughput.to_csv(f'results/train_throughput_{baseline}.csv') 90 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/plot_latency.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 6 | model2id = { 7 | 'ResNet50': 0, 8 | 'MobileNetV2': 1, 9 | 'ResNet101': 2, 10 | 'BERT': 3, 11 | 'Transformer': 4 12 | } 13 | 14 | # %% 15 | 16 | def get_data(csv_file, error=False): 17 | df = pd.read_csv(csv_file) 18 | df = df.drop(df.columns[0], axis=1) 19 | df.index = models 20 | 21 | for model_row in models: 22 | for model_col in models[:2]: 23 | cell = df.at[model_row, model_col] 24 | df.at[model_row, model_col] = float(cell.split('/')[0]) #float(cell.split('/')[1]) if error else float(cell.split('/')[0]) 25 | if not error: 26 | return df.mean() 27 | 28 | df = df.std() 29 | return df 30 | 31 | 32 | 33 | # %% 34 | method2file = { 35 | #'Temporal Sharing': 'results/latency/sequential.csv', 36 | #'Streams': 'results/latency/streams.csv', 37 | 'MPS': 'results/mps_latency.csv', 38 | 'REEF policy': 'results/reef_latency.csv', 39 | 'Orion': 'results/orion_latency.csv', 40 | 'Ideal': 'results/ideal_latency.csv' 41 | } 42 | 43 | label_font_size = 22 44 | methods = list(method2file.keys()) 45 | 46 | method2data = {} 47 | method2err = {} 48 | 49 | for method, file in method2file.items(): 50 | method2data[method] = get_data(file) 51 | method2err[method] = get_data(file, error=True) 52 | 53 | width = 0.15 54 | fig, ax = plt.subplots(figsize=(14, 8)) 55 | x = np.arange(len(models[:2])) 56 | bars = [] 57 | for method_id, method in enumerate(methods): 58 | 59 | print(x,method2data[method]) 60 | bar = ax.bar( 61 | x + width * method_id, method2data[method][:2], width, 62 | label=method, yerr=method2err[method][:2], 63 | align='edge' 64 | ) 65 | bars.append(bar) 66 | 67 | #for i,r in enumerate(bars[0]): 68 | #plt.text(r.get_x() + r.get_width()/2.0, 300, f"{method2data['Temporal Sharing'][i]:.0f}", ha='center', va='bottom', fontsize=13) 69 | #print(r.get_height()) 70 | 71 | x_tick_positions = x + width * len(methods) / 2 72 | ax.set_xticks( 73 | ticks=x_tick_positions, 74 | labels=models[:2], fontsize=22 75 | ) 76 | plt.yticks(fontsize=22) 77 | #ax.set_ylim(0,300) 78 | ax.set_ylabel('Average p95 inference latency (ms)', fontsize=label_font_size) 79 | ax.set_xlabel('High-priority inference job', fontsize=label_font_size) 80 | 81 | plt.tight_layout() 82 | handles, labels = ax.get_legend_handles_labels() 83 | fig.legend(handles, labels, loc='upper right', bbox_to_anchor=(1.0, 1.08),ncols=6, fontsize=18) 84 | 85 | #plt.show() 86 | plt.savefig("fig7a.png", bbox_inches="tight") 87 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/plot_throughput.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 
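# fig7b is a stacked bar chart: per method, inference throughput (hatch "\\") forms
# the bottom segment and training throughput (hatch "/") is stacked on top of it;
# error bars come from the std half of each "mean/std" cell in the CSVs written by
# gather_throughput.py, and the figure is saved as fig7b.png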
5 | models = ['ResNet50', 'MobileNetV2', 'ResNet101', 'BERT', 'Transformer'] 6 | model2id = { 7 | 'ResNet50': 0, 8 | 'MobileNetV2': 1, 9 | 'ResNet101': 2, 10 | 'BERT': 3, 11 | 'Transformer': 4 12 | } 13 | 14 | # %% 15 | 16 | def get_data(csv_files, error=False): 17 | 18 | df_train_input = pd.read_csv(csv_files[0]) 19 | df_train_input = df_train_input.drop(df_train_input.columns[0], axis=1) 20 | df_train_input.index = models 21 | 22 | df_inf_input = pd.read_csv(csv_files[1]) 23 | df_inf_input = df_inf_input.drop(df_inf_input.columns[0], axis=1) 24 | df_inf_input.index = models 25 | 26 | df_train = pd.DataFrame() 27 | df_inf_new = pd.DataFrame() 28 | 29 | for model_row in models: 30 | for model_col in models[:2]: 31 | cell_train = df_train_input.at[model_row, model_col] 32 | cell_inf = df_inf_input.at[model_row, model_col] 33 | 34 | df_train.at[model_row, model_col] = float(cell_train.split('/')[0]) 35 | df_inf_new.at[model_row, model_col] = float(cell_inf.split('/')[0]) 36 | if error: 37 | return df_train.std(), df_inf_new.std() 38 | else: 39 | return df_train.mean(), df_inf_new.mean() 40 | 41 | # %% 42 | method2file = { 43 | 'MPS': ['results/train_throughput_mps.csv', 'results/inf_throughput_mps.csv'], 44 | 'REEF policy': ['results/train_throughput_reef.csv', 'results/inf_throughput_reef.csv'], 45 | 'Orion': ['results/train_throughput_orion.csv', 'results/inf_throughput_orion.csv'], 46 | 'Ideal': ['results/train_throughput_ideal.csv', 'results/inf_throughput_ideal.csv'], 47 | } 48 | 49 | label_font_size = 22 50 | methods = list(method2file.keys()) 51 | 52 | method2data = {} 53 | method2err = {} 54 | 55 | for method, file in method2file.items(): 56 | method2data[method] = get_data(file) 57 | method2err[method] = get_data(file, error=True) 58 | 59 | width = 0.15 60 | fig, ax = plt.subplots(figsize=(14, 8)) 61 | x = np.arange(len(models[:2])) 62 | colors = ["royalblue", "darkorange", "green", "red", "mediumpurple", "saddlebrown"] 63 | 64 | for method_id, method in enumerate(methods): 65 | 66 | ax.bar( 67 | x + width * method_id, method2data[method][1][:2], width, yerr=method2err[method][1][:2], 68 | align='edge', hatch="\\", color = colors[method_id], 69 | ) 70 | ax.bar( 71 | x + width * method_id, method2data[method][0][:2], width, 72 | label=method, yerr=method2err[method][0][:2], bottom=method2data[method][1][:2], 73 | align='edge', hatch="/", color = colors[method_id], alpha=0.6 74 | ) 75 | 76 | x_tick_positions = x + width * len(methods) / 2 77 | ax.set_xticks( 78 | ticks=x_tick_positions, 79 | labels=models[:2], fontsize=22 80 | ) 81 | plt.yticks(fontsize=22) 82 | ax.set_ylabel('Total Throughput (requests/sec)', fontsize=label_font_size) 83 | ax.set_xlabel('High-priority Inference job', fontsize=label_font_size) 84 | 85 | plt.tight_layout() 86 | handles, labels = ax.get_legend_handles_labels() 87 | fig.legend(handles, labels, loc='upper right', prop={'size': 20}, borderaxespad=2) 88 | 89 | #plt.show() 90 | plt.savefig("fig7b.png", bbox_inches="tight") 91 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/prep_dirs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir results 4 | mkdir results/ideal 5 | mkdir results/reef 6 | mkdir results/orion 7 | mkdir results/mps -------------------------------------------------------------------------------- /artifact_evaluation/fig7/run_ideal.py: -------------------------------------------------------------------------------- 1 | 
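# Ideal baseline for Figure 7: run each high-priority inference model and each
# best-effort training model alone on the GPU, num_runs times, and copy every
# run's client_0.json into results/ideal/ for gather_latency.py and
# gather_throughput.py to aggregate. Run from artifact_evaluation/fig7 after
# ./prep_dirs.sh (e.g. python3.8 run_ideal.py).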
import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files_hp = [ 6 | ("ResNet50", "rnet"), 7 | ("MobileNetV2", "mnet"), 8 | ] 9 | 10 | trace_files_be = [ 11 | ("ResNet50", "rnet"), 12 | ("MobileNetV2", "mnet"), 13 | ("ResNet101", "rnet101"), 14 | ("BERT", "bert"), 15 | ("Transformer", "trans") 16 | ] 17 | 18 | for (model, f) in trace_files_hp: 19 | for run in range(num_runs): 20 | print(model, run, flush=True) 21 | # run 22 | file_path = f"config_files/ideal/{f}_inf.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path}") 24 | 25 | # copy results 26 | os.system(f"cp client_0.json results/ideal/{model}_{run}_hp.json") 27 | os.system("rm client_0.json") 28 | 29 | for (model, f) in trace_files_be: 30 | for run in range(num_runs): 31 | print(model, run, flush=True) 32 | # run 33 | file_path = f"config_files/ideal/{f}_train.json" 34 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path}") 35 | 36 | # copy results 37 | os.system(f"cp client_0.json results/ideal/{model}_{run}_be.json") 38 | os.system("rm client_0.json") 39 | -------------------------------------------------------------------------------- /artifact_evaluation/fig7/run_orion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files = [ 6 | ("ResNet50", "ResNet50", "rnet_rnet", 160000), 7 | ("ResNet50", "MobileNetV2", "rnet_mnet", 100000), 8 | ("MobileNetV2", "ResNet50", "mnet_rnet", 160000), 9 | ("MobileNetV2", "MobileNetV2", "mnet_mnet", 100000), 10 | ("ResNet101", "ResNet50", "rnet101_rnet", 160000), 11 | ("ResNet101", "MobileNetV2", "rnet101_mnet", 100000), 12 | ("BERT", "ResNet50", "bert_rnet", 160000), 13 | ("BERT", "MobileNetV2", "bert_mnet", 100000), 14 | ("Transformer", "ResNet50", "trans_rnet", 160000), 15 | ("Transformer", "MobileNetV2", "trans_mnet", 100000), 16 | ] 17 | 18 | for (be, hp, f, max_be_duration) in trace_files: 19 | for run in range(num_runs): 20 | print(be, hp, run, flush=True) 21 | # run 22 | file_path = f"config_files/{f}.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo orion --config_file {file_path} --orion_max_be_duration {max_be_duration}") 24 | 25 | # copy results 26 | os.system(f"cp client_1.json results/orion/{be}_{hp}_{run}_hp.json") 27 | os.system(f"cp client_0.json results/orion/{be}_{hp}_{run}_be.json") 28 | 29 | os.system("rm client_1.json") 30 | os.system("rm client_0.json") -------------------------------------------------------------------------------- /artifact_evaluation/fig7/run_reef.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | num_runs = 3 5 | trace_files = [ 6 | ("ResNet50", "ResNet50", "rnet_rnet"), 7 | ("ResNet50", "MobileNetV2", "rnet_mnet"), 8 | ("MobileNetV2", "ResNet50", "mnet_rnet"), 9 | ("MobileNetV2", "MobileNetV2", "mnet_mnet"), 10 | ("ResNet101", "ResNet50", "rnet101_rnet"), 11 | ("ResNet101", "MobileNetV2", "rnet101_mnet"), 12 | ("BERT", "ResNet50", "bert_rnet"), 13 | ("BERT", "MobileNetV2", "bert_mnet"), 14 | ("Transformer", "ResNet50", "trans_rnet"), 15 | ("Transformer", "MobileNetV2", "trans_mnet"), 16 | ] 17 | 18 | for (be, hp, f) in trace_files: 19 | 
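# each (best-effort, high-priority) pair is replayed num_runs times under the
# REEF scheduling policy (--reef_depth 12 is the depth setting the artifact
# fixes for this figure); launch_jobs.py emits client_1.json (high-priority)
# and client_0.json (best-effort), which are copied into results/reef/ below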
for run in range(num_runs): 20 | print(be, hp, run, flush=True) 21 | # run 22 | file_path = f"config_files/{f}.json" 23 | os.system(f"LD_PRELOAD='{os.path.expanduser( '~' )}/orion/src/cuda_capture/libinttemp.so' python3.8 ../../benchmarking/launch_jobs.py --algo reef --config_file {file_path} --reef_depth 12") 24 | 25 | # copy results 26 | os.system(f"cp client_1.json results/reef/{be}_{hp}_{run}_hp.json") 27 | os.system(f"cp client_0.json results/reef/{be}_{hp}_{run}_be.json") 28 | 29 | os.system("rm client_1.json") 30 | os.system("rm client_0.json") 31 | -------------------------------------------------------------------------------- /benchmarking/be.json: -------------------------------------------------------------------------------- 1 | {"throughput": 4.434374429312142} -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/compute_optimal.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | ifile = sys.argv[1] 4 | dur_total = 0 5 | 6 | with open(ifile, 'r') as f: 7 | lines = f.readlines() 8 | for l in lines[1:]: 9 | tokens = l.split(",") 10 | sms_used = int(tokens[-2]) 11 | dur = float(tokens[-1])/1000 12 | if (sms_used>80): 13 | dur_total += dur 14 | 15 | dur_total_ms = dur_total/1000 16 | print(dur_total_ms*2) 17 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/conv_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.conv = torch.nn.Conv2d(3,64,kernel_size=7, stride=2, padding=3, bias=False) 24 | 25 | def forward(self, x): 26 | for i in range(25): 27 | y = self.conv(x) 28 | 29 | 30 | def conv_loop(batchsize, train, local_rank, barriers, tid): 31 | 32 | print(batchsize, local_rank, barriers, tid) 33 | barriers[0].wait() 34 | 35 | data = torch.rand([batchsize, 3, 224, 224]).to(local_rank).contiguous() 36 | model = Model() 37 | model = model.to(0) 38 | 39 | if train: 40 | model.train() 41 | else: 42 | model.eval() 43 | 44 | for i in range(10): 45 | print("Start epoch: ", i) 46 | 47 | start = time.time() 48 | start_iter = time.time() 49 | 50 | batch_idx = 0 51 | 52 | while batch_idx < 1: 53 | 54 | print(f"submit!, batch_idx is {batch_idx}") 55 | 56 | if train: 57 | output = model(data) 58 | else: 59 | with torch.no_grad(): 60 | output = model(data) 61 | 62 | 63 | batch_idx += 1 64 | 65 | start_iter = time.time() 66 | 67 | #barriers[0].wait() 68 | if i < 9: 69 | barriers[0].wait() 70 | print(f"{tid}, Epoch done!") 71 | 72 | print("Finished! 
Ready to join!") 73 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/examples/basic_config_bert.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | { 4 | "arch": "bert", 5 | "kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb0", 6 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/bert_8_fb1", 7 | "num_kernels": 4777, 8 | "additional_num_kernels": 4777, 9 | "num_iters": 100, 10 | "args": { 11 | "batchsize": 8, 12 | "rps": 0, 13 | "uniform": false, 14 | "dummy_data": false, 15 | "train": true 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/examples/basic_config_transformer.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | { 4 | "arch": "transformer", 5 | "kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb0", 6 | "additional_kernel_file": "/root/orion/benchmarking/model_kernels/transformer_xl_8_fb1", 7 | "num_kernels": 4396, 8 | "additional_num_kernels": 4354, 9 | "num_iters": 100, 10 | "args": { 11 | "batchsize": 8, 12 | "rps": 0, 13 | "dummy_data": true, 14 | "uniform": false, 15 | "train": true 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/examples/basic_config_vision.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/benchmarking/model_kernels/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 2000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 30, 11 | "uniform": true, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | } 16 | 17 | ] 18 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/extract_meas.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | input_file = sys.argv[1] 4 | 5 | i=0 6 | ar = [0, 0] 7 | ar_str = [] 8 | 9 | 10 | # with open(input_file, 'r') as f: 11 | # while (1): 12 | # l = f.readline() 13 | # i+=1 14 | # print(i, l) 15 | 16 | 17 | tt = [] 18 | iters = [] 19 | with open(input_file, 'r') as f: 20 | lines = f.readlines() 21 | for i,l in enumerate(lines): 22 | 23 | # if 'p50' in l: 24 | # tokens = l.split(",") 25 | # #print(l) 26 | # if "Client 0" in tokens[0]: 27 | # ar[0] = round(float(tokens[2].split(" ")[-2])*1000, 2) 28 | # else: 29 | # ar[1] = round(float(tokens[2].split(" ")[-2])*1000, 2) 30 | # i += 1 31 | # if (i%1==0): 32 | # #s = f"{ar[0]}/{ar[1]}" 33 | # s = f"{ar[1]}" 34 | # ar_str.append(s) 35 | # if (i==10): 36 | # i=0 37 | 38 | if 'Total loop' in l and 'Client' not in l: 39 | tokens = l.split(" ") 40 | tt.append(round(float(tokens[-2]),2)) 41 | if '=======' in l: 42 | tokens = l.split(" ") 43 | iters.append(int(tokens[-2])) 44 | print(len(tt)) 45 | 46 | #for i in range(5): 47 | # print(f"{ar_str[5*i]},{ar_str[5*i+1]},{ar_str[5*i+2]},{ar_str[5*i+3]},{ar_str[5*i+4]}") 48 | 49 | # for i in range(5): 50 | # print(f"{tt[5*i]},{tt[5*i+1]},{tt[5*i+2]},{tt[5*i+3]},{tt[5*i+4]}") 51 | 52 | # for i in range(5): 53 | # print(f"{iters[5*i]},{iters[5*i+1]},{iters[5*i+2]},{iters[5*i+3]},{iters[5*i+4]}") 54 | 55 | inf_requests = [9200, 12000, 5500, 1200, 3400] 56 | 57 | #print("--------- High Priority Throughput:") 58 | # 
hp_th = [] 59 | # for i in range(len(tt)): 60 | # hp_th.append(round(inf_requests[i%5]/tt[i],2)) 61 | # for i in range(5): 62 | # print(f"{hp_th[5*i]},{hp_th[5*i+1]},{hp_th[5*i+2]},{hp_th[5*i+3]},{hp_th[5*i+4]}") 63 | 64 | # #print("--------- Low Priority Throughput:") 65 | 66 | hp_th = [] 67 | for i in range(len(tt)): 68 | hp_th.append(round(iters[i]/tt[i],2)) 69 | for i in range(5): 70 | print(f"{iters[5*i]},{iters[5*i+1]},{iters[5*i+2]},{iters[5*i+3]},{iters[5*i+4]}") 71 | 72 | 73 | # print("--------- High Priority Throughput:") 74 | # hp_th = [] 75 | # for i in range(len(tt)): 76 | # hp_th.append(round(1000/tt[i],2)) 77 | # for i in range(5): 78 | # print(f"{hp_th[5*i]},{hp_th[5*i+1]},{hp_th[5*i+2]},{hp_th[5*i+3]},{hp_th[5*i+4]}") 79 | 80 | # print("--------- Low Priority Throughput:") 81 | # hp_th = [] 82 | # for i in range(len(tt)): 83 | # hp_th.append(round(iters[i]/tt[i],2)) 84 | # for i in range(5): 85 | # print(f"{hp_th[5*i]},{hp_th[5*i+1]},{hp_th[5*i+2]},{hp_th[5*i+3]},{hp_th[5*i+4]}") 86 | 87 | 88 | print("--------- Total Throughput:") 89 | hp_th = [] 90 | for i in range(len(tt)): 91 | hp_th.append(round((iters[i]+1000)/tt[i],2)) 92 | for i in range(5): 93 | print(f"{hp_th[5*i]},{hp_th[5*i+1]},{hp_th[5*i+2]},{hp_th[5*i+3]},{hp_th[5*i+4]}") 94 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/toy_models/bnorm_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.bn = torch.nn.BatchNorm2d(256) 24 | 25 | def forward(self, x): 26 | for i in range(25): 27 | y = self.bn(x) 28 | 29 | 30 | def bnorm_loop(batchsize, train, local_rank, barriers, tid): 31 | 32 | print(batchsize, local_rank, barriers, tid) 33 | barriers[0].wait() 34 | 35 | data = torch.rand([batchsize, 256, 112, 112]).to(local_rank).contiguous() 36 | model = Model() 37 | model = model.to(0) 38 | 39 | if train: 40 | model.train() 41 | else: 42 | model.eval() 43 | 44 | for i in range(10): 45 | print("Start epoch: ", i) 46 | 47 | start = time.time() 48 | start_iter = time.time() 49 | 50 | batch_idx = 0 51 | 52 | while batch_idx < 1: 53 | 54 | print(f"submit!, batch_idx is {batch_idx}") 55 | 56 | if train: 57 | output = model(data) 58 | else: 59 | with torch.no_grad(): 60 | output = model(data) 61 | 62 | 63 | batch_idx += 1 64 | 65 | start_iter = time.time() 66 | 67 | # barriers[0].wait() 68 | if i < 9: 69 | barriers[0].wait() 70 | print(f"{tid}, Epoch done!") 71 | 72 | print("Finished! 
Ready to join!") 73 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/toy_models/conv_bn_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.conv = torch.nn.Conv2d(3,64,kernel_size=7, stride=2, padding=3, bias=False) 24 | self.bn = torch.nn.BatchNorm2d(64) 25 | self.x = torch.rand([32, 64, 112, 112]).to(0) 26 | 27 | def forward(self, x): 28 | for i in range(25): 29 | y = self.conv(x) 30 | z = self.bn(y) 31 | 32 | 33 | def conv_bn_loop(batchsize, train, local_rank, barriers, tid): 34 | 35 | print(batchsize, local_rank, barriers, tid) 36 | barriers[0].wait() 37 | 38 | data = torch.rand([batchsize, 3, 224, 224]).to(local_rank) 39 | model = Model() 40 | model = model.to(0) 41 | 42 | if train: 43 | model.train() 44 | else: 45 | model.eval() 46 | 47 | for i in range(10): 48 | print("Start epoch: ", i) 49 | 50 | start = time.time() 51 | start_iter = time.time() 52 | 53 | batch_idx = 0 54 | 55 | while batch_idx < 1: 56 | 57 | print(f"submit!, batch_idx is {batch_idx}") 58 | 59 | if train: 60 | output = model(data) 61 | else: 62 | with torch.no_grad(): 63 | output = model(data) 64 | 65 | 66 | batch_idx += 1 67 | 68 | start_iter = time.time() 69 | 70 | #barriers[0].wait() 71 | if i < 9: 72 | barriers[0].wait() 73 | print(f"{tid}, Epoch done!") 74 | 75 | print("Finished! 
Ready to join!") 76 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/transformer_trainer_torch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | 5 | from mem_transformer import MemTransformerLM 6 | import lamb 7 | import numpy as np 8 | 9 | class DummyDataLoader(): 10 | def __init__(self, batchsize): 11 | self.batchsize = batchsize 12 | self.data = torch.ones((192, self.batchsize), pin_memory=True).to(torch.int64) 13 | self.target = torch.ones((192, self.batchsize), pin_memory=True).to(torch.int64) 14 | 15 | def __iter__(self): 16 | return self 17 | 18 | def __next__(self): 19 | return self.data, self.target 20 | 21 | def transformer_loop(batchsize, train, default, num_iters, rps, uniform, dummy_data, local_rank, start_barriers, end_barriers, tid): 22 | 23 | start_barriers[0].wait() 24 | 25 | if rps > 0: 26 | if uniform: 27 | sleep_times = [1/rps]*num_iters 28 | else: 29 | sleep_times = np.random.exponential(scale=1/rps, size=num_iters) 30 | else: 31 | sleep_times = [0]*num_iters 32 | 33 | if default: 34 | s = torch.cuda.default_stream() 35 | else: 36 | s = torch.cuda.Stream() 37 | timings = [] 38 | 39 | model_config = { 40 | 'n_token': 267735, 41 | 'n_layer': 16, 42 | 'n_head': 8, 43 | 'd_model': 512, 44 | 'd_head': 64, 45 | 'd_inner': 2048, 46 | 'dropout': 0.1, 47 | 'dropatt': 0.0, 48 | 'dtype': None, 49 | 'tie_weight': True, 50 | 'd_embed': 512, 51 | 'div_val': 1, 52 | 'tie_projs': [False, True, True, True], 53 | 'pre_lnorm': False, 54 | 'tgt_len': 192, 55 | 'ext_len': 0, 56 | 'mem_len': 192, 57 | 'cutoffs': [19997, 39997, 199997], 58 | 'same_length': False, 59 | 'attn_type': 0, 60 | 'clamp_len': -1, 61 | 'sample_softmax': -1 62 | } 63 | 64 | train_loader = DummyDataLoader(batchsize) 65 | train_iter = enumerate(train_loader) 66 | batch_idx, batch = next(train_iter) 67 | 68 | model = MemTransformerLM(**model_config).to(0) 69 | 70 | if train: 71 | model.train() 72 | optimizer = lamb.Lamb(model.parameters(), lr=0.1) 73 | else: 74 | model.eval() 75 | 76 | next_startup = time.time() 77 | open_loop = False 78 | timings = [0 for _ in range(num_iters)] 79 | 80 | mems = None 81 | print("before while") 82 | with torch.cuda.stream(s): 83 | for i in range(1): 84 | print("Start epoch: ", i) 85 | 86 | while batch_idx < num_iters: 87 | start = time.time() 88 | 89 | if train: 90 | optimizer.zero_grad() 91 | start_iter = time.time() 92 | data, target = batch[0].to(local_rank), batch[1].to(local_rank) 93 | loss, mems = model(data, target, mems) 94 | loss = loss.float().mean().type_as(loss) 95 | loss.backward() 96 | optimizer.step() 97 | #s.synchronize() 98 | print(f"Client {tid}, iter {batch_idx} took {time.time()-start_iter} sec") 99 | batch_idx,batch = next(train_iter) 100 | if (batch_idx==10): 101 | starttime = time.time() 102 | else: 103 | with torch.no_grad(): 104 | cur_time = time.time() 105 | ###### OPEN LOOP ##### 106 | if (cur_time >= next_startup): 107 | print(f"Client {tid} submit!, batch_idx is {batch_idx}") 108 | data, target = batch[0].to(local_rank), batch[1].to(local_rank) 109 | output, mems = model(data, target, mems) 110 | s.synchronize() 111 | timings[batch_idx] = time.time()-next_startup 112 | print(f"It took {timings[batch_idx]} sec") 113 | next_startup += sleep_times[batch_idx] 114 | batch_idx,batch = next(train_iter) 115 | if (batch_idx==10): 116 | starttime = time.time() 117 | 118 | print(f"FINISHED! 
It took {time.time()-starttime} sec") 119 | end_barriers[0].wait() 120 | 121 | #print(f"Time is {time.time()-starttime} sec") 122 | if not train: 123 | timings = timings[2:] 124 | p50 = np.percentile(timings, 50) 125 | p95 = np.percentile(timings, 95) 126 | p99 = np.percentile(timings, 99) 127 | 128 | print(f"Client {tid} finished! p50: {p50} sec, p95: {p95} sec, p99: {p99} sec") 129 | print(f"Total time is {time.time()-starttime} sec") 130 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/utility_scripts/check_unknown.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | ifile = sys.argv[1] 5 | 6 | sms_used = [] 7 | durations = [] 8 | durations_all = [] 9 | with open(ifile, 'r') as f: 10 | lines = f.readlines() 11 | for l in lines[1:]: 12 | tokens = l.split(",") 13 | profile = int(tokens[1]) 14 | if profile==-1: 15 | sms_used.append(int(tokens[-2])) 16 | durations.append(float(tokens[-1])/1000) 17 | durations_all.append(float(tokens[-1])/1000) 18 | 19 | np.set_printoptions(threshold=np.inf) 20 | print(np.sort(sms_used)) 21 | print(np.sort(durations)) 22 | print(len(sms_used)/len(durations_all)) 23 | #print(f"average: {np.average(np.asarray(durations))}") 24 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/utility_scripts/compute_average.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | inf_durations={ 5 | #"resnet50": 6280, 6 | # "mobilenet": 4940, 7 | # "resnet101": 10500, 8 | # "bert": 49020, 9 | "transformer": 17000 10 | } 11 | 12 | ifile = sys.argv[1] 13 | 14 | durations = [] 15 | sms_used = [] 16 | 17 | in_between = [] 18 | 19 | with open(ifile, 'r') as f: 20 | lines = f.readlines() 21 | for l in lines[1:]: 22 | tokens = l.split(",") 23 | sms_used.append(int(tokens[-2])) 24 | dur = float(tokens[-1])/1000 25 | durations.append(dur) 26 | if (dur>=320 and dur<=350): 27 | print(l) 28 | in_between.append(dur) 29 | 30 | avg_duration = np.average(np.asarray(durations)) 31 | max_duration = max(durations) 32 | # print(np.sort(durations)) 33 | # print(np.sort(sms_used)) 34 | 35 | p50 = np.percentile(durations, 50) 36 | p75 = np.percentile(durations, 75) 37 | p95 = np.percentile(durations, 95) 38 | p99 = np.percentile(durations, 99) 39 | 40 | print(len(in_between)) 41 | 42 | for hp_inference in inf_durations: 43 | D = (0.025 * inf_durations[hp_inference])/avg_duration 44 | print(f"{hp_inference}, Average duration: {avg_duration} us, max duration is {max_duration} us, hp duration is {inf_durations[hp_inference]} us, D is {D}") -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/utility_scripts/download_imagenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # a script to download the image net dataset 3 | set -x -e 4 | DATA_DIR=${1:-/cluster/scratch/xianma/vision} 5 | aria2c -c -x 10 -s 10 -d "$DATA_DIR" --download-result=full https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar 6 | # now the tar lives in DATA_DIR 7 | cd "$DATA_DIR" 8 | mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train 9 | tar -xvf ILSVRC2012_img_train.tar 10 | # move the entire compressed file out of the train folder as the folder should contain actual data 11 | mv ILSVRC2012_img_train.tar ../ 12 | 13 | # the 
last line should be executed non-interactively (e.g. as a sbatch job) because it is really time-consuming, to unzip each tar 14 | # find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done 15 | -------------------------------------------------------------------------------- /benchmarking/benchmark_suite/utility_scripts/get_avg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | l1 = [[0,0,0,0,0] for _ in range(5)] 4 | l2 = [[0,0,0,0,0] for _ in range(5)] 5 | 6 | with open('case1', 'r') as f1: 7 | f1.readline() 8 | #f1.readline() 9 | l = f1.readlines() 10 | for i in range(5): 11 | t = l[i] 12 | s = t.split(",") 13 | for j in range(5): 14 | l1[i][j] = float(s[j]) 15 | 16 | with open('case2', 'r') as f2: 17 | f2.readline() 18 | #f2.readline() 19 | l = f2.readlines() 20 | 21 | for i in range(5): 22 | t = l[i] 23 | s = t.split(",") 24 | for j in range(5): 25 | l2[i][j] = float(s[j]) 26 | print(l1, l2) 27 | 28 | l_total = [[0,0,0,0,0] for _ in range(5)] 29 | l_std = [[0,0,0,0,0] for _ in range(5)] 30 | l_str = [[0,0,0,0,0] for _ in range(5)] 31 | 32 | for i in range(5): 33 | for j in range(5): 34 | l_total[i][j] = round(np.average([l1[i][j],l2[i][j]]),2) 35 | l_std[i][j] = round(np.std([l1[i][j],l2[i][j]]),2) 36 | l_str[i][j] = str(l_total[i][j]) + "/" + str(l_std[i][j]) 37 | for i in range(5): 38 | print(f"{l_total[i][0]}/{l_std[i][0]},{l_total[i][1]}/{l_std[i][1]},{l_total[i][2]}/{l_std[i][2]},{l_total[i][3]}/{l_std[i][3]},{l_total[i][4]}/{l_std[i][4]}") 39 | -------------------------------------------------------------------------------- /benchmarking/hp.json: -------------------------------------------------------------------------------- 1 | {"p50_latency": 12.213349342346191, "p95_latency": 21.53183221817016, "p99_latency": 24.332609176635742, "throughput": 20.809929229394967} -------------------------------------------------------------------------------- /benchmarking/multi_client_example.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arch": "resnet50", 4 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 5 | "num_kernels": 175, 6 | "num_iters": 20000, 7 | "args": { 8 | "model_name": "resnet50", 9 | "batchsize": 4, 10 | "rps": 30, 11 | "uniform": false, 12 | "dummy_data": true, 13 | "train": false 14 | } 15 | }, 16 | { 17 | "arch": "resnet50", 18 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 19 | "num_kernels": 175, 20 | "num_iters": 20000, 21 | "args": { 22 | "model_name": "resnet50", 23 | "batchsize": 4, 24 | "rps": 30, 25 | "uniform": false, 26 | "dummy_data": true, 27 | "train": false 28 | } 29 | }, 30 | { 31 | "arch": "resnet50", 32 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 33 | "num_kernels": 175, 34 | "num_iters": 20000, 35 | "args": { 36 | "model_name": "resnet50", 37 | "batchsize": 4, 38 | "rps": 30, 39 | "uniform": false, 40 | "dummy_data": true, 41 | "train": false 42 | } 43 | }, 44 | { 45 | "arch": "resnet50", 46 | "kernel_file": "/root/orion/artifact_evaluation/example/resnet50_4_fwd", 47 | "num_kernels": 175, 48 | "num_iters": 20000, 49 | "args": { 50 | "model_name": "resnet50", 51 | "batchsize": 4, 52 | "rps": 30, 53 | "uniform": false, 54 | "dummy_data": true, 55 | "train": false 56 | } 57 | }, 58 | { 59 | "arch": "resnet50", 60 | "kernel_file": 
"/root/orion/artifact_evaluation/example/resnet50_4_fwd", 61 | "num_kernels": 175, 62 | "num_iters": 2000, 63 | "args": { 64 | "model_name": "resnet50", 65 | "batchsize": 4, 66 | "rps": 30, 67 | "uniform": false, 68 | "dummy_data": true, 69 | "train": false 70 | } 71 | } 72 | ] 73 | -------------------------------------------------------------------------------- /benchmarking/scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run_traces.py > results/inf_inf_res/reef_updated_poisson_1 4 | python run_traces.py > results/inf_inf_res/reef_updated_poisson_2 5 | -------------------------------------------------------------------------------- /benchmarking/scripts/run_traces.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | trace_files = [ 5 | "rnet_rnet_ti", 6 | "rnet_mnet_ti", 7 | "mnet_rnet_ti", 8 | "mnet_mnet_ti", 9 | "rnet101_rnet_ti", 10 | "rnet101_mnet_ti", 11 | "bert_rnet_ti", 12 | "bert_mnet_ti", 13 | "trans_rnet_ti", 14 | "trans_mnet_ti", 15 | ] 16 | 17 | 18 | 19 | #orion, hp is inference - uniform 20 | # depths = [110000, 100000, 150000, 1250000, 400000, 21 | # 110000, 100000, 150000, 1250000, 400000, 22 | # 110000, 100000, 150000, 1250000, 400000, 23 | # 110000, 100000, 150000, 1250000, 400000, 24 | # 110000, 100000, 150000, 1250000, 400000] 25 | 26 | # orion, hp is inference, threshold is 0.05 27 | # depths = [320000, 300000, 400000, 2500000, 800000, 28 | # 320000, 300000, 400000, 2500000, 800000, 29 | # 320000, 300000, 400000, 2500000, 800000, 30 | # 320000, 300000, 400000, 2500000, 800000, 31 | # 320000, 300000, 400000, 2500000, 800000] 32 | 33 | depths = [ 34 | 6,5,10,48,16, 35 | 8,6,13,58,21, 36 | 8,6,13,60,21, 37 | 16,13,27,123,43, 38 | 22,17,36,170,59, 39 | ] 40 | 41 | limits = [1,1,1,1,1, 42 | 1,1,1,1,1, 43 | 1,1,1,1,1, 44 | 1,1,1,1,1, 45 | 1,1,1,1,1] 46 | updates = [1,1,1,1,1, 47 | 1,1,1,1,1, 48 | 1,1,1,1,1, 49 | 1,1,1,1,1, 50 | 1,1,1,1,1] 51 | 52 | 53 | # # orion, hp is training 54 | # depths = [ 55 | # 1000000, 1000000, 1000000, 40000000, 32000000, 56 | # 1000000, 1000000, 1000000, 40000000, 32000000, 57 | # 1000000, 1000000, 1000000, 40000000, 32000000, 58 | # 1000000, 1000000, 1000000, 40000000, 32000000, 59 | # 1000000, 1000000, 1000000, 40000000, 32000000 60 | # ] 61 | # limits = [ 62 | # 135, 120, 235, 250, 250, 63 | # 135, 120, 235, 250, 250, 64 | # 135, 120, 235, 250, 250, 65 | # 135, 120, 235, 250, 250, 66 | # 135, 120, 235, 250, 250 67 | # ] 68 | # updates = [ 69 | # 768, 733, 1534, 2669, 1622, 70 | # 768, 733, 1534, 2669, 1622, 71 | # 768, 733, 1534, 2669, 1622, 72 | # 768, 733, 1534, 2669, 1622, 73 | # 768, 733, 1534, 2669, 1622 74 | # ] 75 | 76 | 77 | 78 | print(len(trace_files), len(depths)) 79 | assert len(trace_files) == len(depths) 80 | for f,d,l,u in zip(trace_files, depths, limits, updates): 81 | print(f,d, flush=True) 82 | file_path = f"eval/inf_inf/poisson/{f}.json" 83 | os.system(f"python launch_jobs.py {file_path} {d} {l} {u}") 84 | time.sleep(10) 85 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd src/cuda_capture && make libinttemp.so && cd ../../ 4 | cd src/scheduler && make scheduler_eval.so && cd ../../ 5 | -------------------------------------------------------------------------------- /orion_architecture.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/orion_architecture.png -------------------------------------------------------------------------------- /profiling/benchmarks/bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | import modeling 5 | 6 | from optimization import BertAdam 7 | 8 | def bert(batchsize, local_rank, do_eval=True, profile=True): 9 | 10 | model_config = { 11 | "attention_probs_dropout_prob": 0.1, 12 | "hidden_act": "gelu", 13 | "hidden_dropout_prob": 0.1, 14 | "hidden_size": 768, 15 | "initializer_range": 0.02, 16 | "intermediate_size": 3072, 17 | "max_position_embeddings": 512, 18 | "num_attention_heads": 12, 19 | "num_hidden_layers": 12, 20 | "type_vocab_size": 2, 21 | "vocab_size": 30522 22 | } 23 | 24 | config = modeling.BertConfig.from_dict(model_config) 25 | # Padding for divisibility by 8 26 | if config.vocab_size % 8 != 0: 27 | config.vocab_size += 8 - (config.vocab_size % 8) 28 | 29 | 30 | input_ids = torch.ones((batchsize, 384)).to(torch.int64).to(0) 31 | segment_ids = torch.ones((batchsize, 384)).to(torch.int64).to(0) 32 | input_mask = torch.ones((batchsize, 384)).to(torch.int64).to(0) 33 | start_positions = torch.zeros((batchsize)).to(torch.int64).to(0) 34 | end_positions = torch.ones((batchsize)).to(torch.int64).to(0) 35 | 36 | 37 | model = modeling.BertForQuestionAnswering(config).to(0) 38 | 39 | if do_eval: 40 | model.eval() 41 | else: 42 | model.train() 43 | param_optimizer = list(model.named_parameters()) 44 | 45 | param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] 46 | 47 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 48 | optimizer_grouped_parameters = [ 49 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 50 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 51 | ] 52 | optimizer = BertAdam(optimizer_grouped_parameters, lr=5e-5, warmup=0.1, t_total=100) 53 | 54 | batch_idx = 0 55 | torch.cuda.synchronize() 56 | 57 | while batch_idx < 1: 58 | 59 | if batch_idx == 0: 60 | if profile == 'ncu': 61 | torch.cuda.nvtx.range_push("start") 62 | elif profile == 'nsys': 63 | torch.cuda.profiler.cudart().cudaProfilerStart() 64 | 65 | if do_eval: 66 | with torch.no_grad(): 67 | output = model(input_ids, segment_ids, input_mask) 68 | else: 69 | optimizer.zero_grad() 70 | start_logits, end_logits = model(input_ids, segment_ids, input_mask) 71 | ignored_index = start_logits.size(1) 72 | loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index) 73 | start_loss = loss_fct(start_logits, start_positions) 74 | end_loss = loss_fct(end_logits, end_positions) 75 | loss = (start_loss + end_loss) / 2 76 | loss.backward() 77 | optimizer.step() 78 | 79 | if batch_idx == 0: 80 | if profile == 'ncu': 81 | torch.cuda.nvtx.range_pop() 82 | elif profile == 'nsys': 83 | torch.cuda.profiler.cudart().cudaProfilerStop() 84 | 85 | batch_idx += 1 86 | 87 | print("Done!") 88 | 89 | if __name__ == "__main__": 90 | bert(8, 0,False, 'nsys') 91 | -------------------------------------------------------------------------------- /profiling/benchmarks/bnorm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | 
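# bnorm.py: toy BatchNorm2d microbenchmark (each forward runs 25 back-to-back
# bn calls); only iteration 9 is wrapped in NVTX / CUDA-profiler ranges, so
# ncu/nsys capture a single warmed-up iteration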
import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | 21 | class Model(torch.nn.Module): 22 | def __init__(self): 23 | super().__init__() 24 | self.bn = torch.nn.BatchNorm2d(64) 25 | 26 | def forward(self, x): 27 | for i in range(25): 28 | x = self.bn(x) 29 | 30 | 31 | def bnorm_loop(batchsize, local_rank, do_eval=True, profile=None): 32 | 33 | print("-------------- thread id: ", threading.get_native_id()) 34 | 35 | data = torch.rand([batchsize, 64, 112, 112]).to(local_rank).contiguous() 36 | model = Model() 37 | model = model.to(0) 38 | 39 | if do_eval: 40 | model.eval() 41 | else: 42 | model.train() 43 | 44 | print("Enter loop!") 45 | 46 | batch_idx = 0 47 | torch.cuda.synchronize() 48 | 49 | while batch_idx < 10: 50 | 51 | if batch_idx == 9: 52 | if profile == 'ncu': 53 | torch.cuda.nvtx.range_push("start") 54 | elif profile == 'nsys': 55 | torch.cuda.profiler.cudart().cudaProfilerStart() 56 | 57 | if do_eval: 58 | with torch.no_grad(): 59 | output = model(data) 60 | else: 61 | output = model(data) 62 | 63 | if batch_idx == 9: 64 | if profile == 'ncu': 65 | torch.cuda.nvtx.range_pop() 66 | elif profile == 'nsys': 67 | torch.cuda.profiler.cudart().cudaProfilerStop() 68 | 69 | batch_idx += 1 70 | 71 | print("Done!") 72 | 73 | if __name__ == "__main__": 74 | bnorm_loop(32, 0, False, 'nsys') -------------------------------------------------------------------------------- /profiling/benchmarks/conv.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.conv = torch.nn.Conv2d(3,64,kernel_size=7, stride=2, padding=3, bias=False) 24 | 25 | def forward(self, x): 26 | for i in range(25): 27 | y = self.conv(x) 28 | 29 | 30 | 31 | def conv_loop(batchsize, local_rank, do_eval=True, profile=None): 32 | 33 | print("-------------- thread id: ", threading.get_native_id()) 34 | 35 | data = torch.rand([batchsize, 3, 224, 224]).to(local_rank).contiguous() 36 | model = Model() 37 | model = model.to(0) 38 | 39 | if do_eval: 40 | model.eval() 41 | else: 42 | model.train() 43 | 44 | print("Enter loop!") 45 | 46 | batch_idx = 0 47 | torch.cuda.synchronize() 48 | 49 | while batch_idx < 10: 50 | 51 | if batch_idx == 9: 52 | if profile == 'ncu': 53 | torch.cuda.nvtx.range_push("start") 54 | elif profile == 'nsys': 55 | torch.cuda.profiler.cudart().cudaProfilerStart() 56 | 57 | if do_eval: 58 | with torch.no_grad(): 59 | output = model(data) 60 | else: 61 | output = model(data) 62 | 63 | if 
batch_idx == 9: 63 | if profile == 'ncu': 64 | torch.cuda.nvtx.range_pop() 65 | elif profile == 'nsys': 66 | torch.cuda.profiler.cudart().cudaProfilerStop() 67 | 68 | batch_idx += 1 69 | 70 | print("Done!") 71 | 72 | if __name__ == "__main__": 73 | conv_loop(32, 0, False, 'nsys') -------------------------------------------------------------------------------- /profiling/benchmarks/conv_bnorm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | from torchvision import models, datasets, transforms 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import torch.nn.functional as F 11 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 12 | from datetime import timedelta 13 | import random 14 | import numpy as np 15 | import time 16 | import os 17 | import argparse 18 | import threading 19 | 20 | class Model(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.conv = torch.nn.Conv2d(3,64,kernel_size=7, stride=2, padding=3, bias=False) 24 | self.bn = torch.nn.BatchNorm2d(64) 25 | 26 | def forward(self, x): 27 | for i in range(25): 28 | y = self.conv(x) 29 | z = self.bn(y) 30 | 31 | 32 | def conv_bnorm_loop(batchsize, local_rank, do_eval=True, profile=None): 33 | 34 | print("-------------- thread id: ", threading.get_native_id()) 35 | 36 | data = torch.rand([batchsize, 3, 224, 224]).to(local_rank).contiguous() 37 | model = Model() 38 | model = model.to(0) 39 | 40 | if do_eval: 41 | model.eval() 42 | else: 43 | model.train() 44 | 45 | print("Enter loop!") 46 | 47 | batch_idx = 0 48 | torch.cuda.synchronize() 49 | 50 | while batch_idx < 10: 51 | 52 | if batch_idx == 9: 53 | if profile == 'ncu': 54 | torch.cuda.nvtx.range_push("start") 55 | elif profile == 'nsys': 56 | torch.cuda.profiler.cudart().cudaProfilerStart() 57 | 58 | if do_eval: 59 | with torch.no_grad(): 60 | output = model(data) 61 | else: 62 | output = model(data) 63 | 64 | if batch_idx == 9: 65 | if profile == 'ncu': 66 | torch.cuda.nvtx.range_pop() 67 | elif profile == 'nsys': 68 | torch.cuda.profiler.cudart().cudaProfilerStop() 69 | 70 | batch_idx += 1 71 | 72 | print("Done!") 73 | 74 | if __name__ == "__main__": 75 | conv_bnorm_loop(32, 0, False, 'nsys') -------------------------------------------------------------------------------- /profiling/benchmarks/gnmt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | import sys 5 | import os 6 | sys.path.insert(0, f"{os.path.expanduser( '~' )}/DeepLearningExamples/PyTorch/Translation/GNMT") 7 | 8 | 9 | from seq2seq.models.gnmt import GNMT 10 | 11 | def gnmt(batchsize, local_rank, do_eval=True, profile=None): 12 | 13 | model_config = { 14 | 15 | "hidden_size": 1024, 16 | "vocab_size": 32320, 17 | "num_layers": 4, 18 | "dropout": 0.2, 19 | "batch_first": False, 20 | "share_embedding": True 21 | } 22 | 23 | input0 = torch.ones([50, batchsize]).to(torch.int64).to(0) 24 | input1 = torch.ones([batchsize]).to(torch.int64).to(0) 25 | input2 = torch.ones([50, batchsize]).to(torch.int64).to(0) 26 | labels = input2 27 | 28 | model = GNMT(**model_config).to(local_rank) 29 | 30 | if do_eval: 31 | model.eval() 32 | else: 33 | model.train() 34 | #criterion = LabelSmoothing(0.1, 0).to(local_rank) 35 | #optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
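# Shape note: GNMT is constructed with batch_first=False, so the dummy inputs
# above are time-major. input0 and input2 are (seq_len=50, batch) int64 token
# tensors and input1 carries one source length per sentence, matching
# GNMT.forward(input_encoder, input_enc_len, input_decoder). An equivalent way
# to build them (values assumed; any int64 tensors of these shapes work):
#
#   src = torch.full((50, batchsize), 4, dtype=torch.int64, device=0)
#   src_len = torch.full((batchsize,), 50, dtype=torch.int64, device=0)
#   out = model(src, src_len, input2)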
36 | 37 | batch_idx = 0 38 | torch.cuda.synchronize() 39 | 40 | while batch_idx < 10: 41 | 42 | #if not do_eval: 43 | # optimizer.zero_grad() 44 | 45 | if batch_idx == 0: 46 | if profile == 'ncu': 47 | torch.cuda.nvtx.range_push("start") 48 | elif profile == 'nsys': 49 | torch.cuda.profiler.cudart().cudaProfilerStart() 50 | 51 | if do_eval: 52 | with torch.no_grad(): 53 | output = model(input0, input1, input2) 54 | else: 55 | output = model(input0, input1, input2) 56 | #T, B = output.size(0), output.size(1) 57 | #loss = criterion(output.view(T * B, -1), labels.contiguous().view(-1)) 58 | #loss.backward() 59 | #optimizer.step() 60 | 61 | 62 | if batch_idx == 9: 63 | if profile == 'ncu': 64 | torch.cuda.nvtx.range_pop() 65 | elif profile == 'nsys': 66 | torch.cuda.profiler.cudart().cudaProfilerStop() 67 | 68 | batch_idx += 1 69 | 70 | print("Done!") 71 | 72 | if __name__ == "__main__": 73 | gnmt(128, 0, False, 'nsys') 74 | -------------------------------------------------------------------------------- /profiling/benchmarks/retinanet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | import sys 5 | import os 6 | sys.path.append(f"{os.path.expanduser( '~' )}/mlcommons/single_stage_detector/ssd") 7 | from model.retinanet import retinanet_from_backbone 8 | 9 | def retinanet(batchsize, local_rank, do_eval=True, profile=None): 10 | 11 | model = retinanet_from_backbone( 12 | backbone="resnext50_32x4d", 13 | num_classes=264, 14 | image_size=[800, 800], 15 | data_layout='channels_last', 16 | pretrained=False, 17 | trainable_backbone_layers=3).cuda() 18 | images = [torch.ones((3,768,1024)).to(torch.float32).cuda() for _ in range(batchsize)] 19 | # just a dummy example 20 | targets = [ 21 | { 22 | 'boxes': torch.tensor([[ 3.8400, 42.2873, 597.1200, 660.5751], 23 | [ 367.3600, 2.5626, 1008.6400, 682.3594]]).cuda(), 24 | 'labels': torch.tensor([148, 257]).cuda(), 25 | 'image_id': torch.tensor([299630]).cuda(), 26 | 'area': torch.tensor([366817.7812, 435940.0625]).cuda(), 27 | 'iscrowd': torch.tensor([0, 0]).cuda(), 28 | } 29 | for _ in range(batchsize) 30 | ] 31 | 32 | if do_eval: 33 | model.eval() 34 | else: 35 | model.train() 36 | params = [p for p in model.parameters() if p.requires_grad] 37 | optimizer = torch.optim.Adam(params, lr=0.1) 38 | 39 | batch_idx = 0 40 | 41 | while batch_idx < 10: 42 | 43 | print(f"run {batch_idx}") 44 | 45 | if batch_idx == 9: 46 | if profile == 'ncu': 47 | torch.cuda.nvtx.range_push("start") 48 | elif profile == 'nsys': 49 | torch.cuda.profiler.cudart().cudaProfilerStart() 50 | 51 | if do_eval: 52 | with torch.no_grad(): 53 | output = model(images) 54 | else: 55 | optimizer.zero_grad() 56 | loss_dict = model(images, targets) 57 | losses = sum(loss for loss in loss_dict.values()) 58 | losses.backward() 59 | optimizer.step() 60 | 61 | if batch_idx == 9: 62 | if profile == 'ncu': 63 | torch.cuda.nvtx.range_pop() 64 | elif profile == 'nsys': 65 | torch.cuda.profiler.cudart().cudaProfilerStop() 66 | 67 | batch_idx += 1 68 | 69 | print("Done!") 70 | 71 | if __name__ == "__main__": 72 | retinanet(4, 0, True, 'nsys') 73 | -------------------------------------------------------------------------------- /profiling/benchmarks/transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import threading 3 | import time 4 | import sys 5 | import os 6 | sys.path.append(f"{os.path.expanduser( '~' 
)}/DeepLearningExamples/PyTorch/LanguageModeling/Transformer-XL/pytorch") 7 | 8 | from mem_transformer import MemTransformerLM 9 | import lamb 10 | 11 | def transformer(batchsize, local_rank, do_eval=True, profile=None): 12 | 13 | model_config = { 14 | 'n_token': 267735, 15 | 'n_layer': 16, 16 | 'n_head': 8, 17 | 'd_model': 512, 18 | 'd_head': 64, 19 | 'd_inner': 2048, 20 | 'dropout': 0.1, 21 | 'dropatt': 0.0, 22 | 'dtype': None, 23 | 'tie_weight': True, 24 | 'd_embed': 512, 25 | 'div_val': 1, 26 | 'tie_projs': [False, True, True, True], 27 | 'pre_lnorm': False, 28 | 'tgt_len': 192, 29 | 'ext_len': 0, 30 | 'mem_len': 192, 31 | 'cutoffs': [19997, 39997, 199997], 32 | 'same_length': False, 33 | 'attn_type': 0, 34 | 'clamp_len': -1, 35 | 'sample_softmax': -1 36 | } 37 | 38 | data = torch.ones((192, batchsize)).to(torch.int64).cuda() 39 | target = torch.ones((192, batchsize)).to(torch.int64).cuda() 40 | 41 | model = MemTransformerLM(**model_config).to(0) 42 | 43 | if do_eval: 44 | model.eval() 45 | else: 46 | model.train() 47 | optimizer = lamb.Lamb(model.parameters(), lr=0.1) 48 | 49 | torch.cuda.synchronize() 50 | batch_idx = 0 51 | mems = None 52 | 53 | while batch_idx < 10: 54 | 55 | start_iter = time.time() 56 | if batch_idx == 0: 57 | if profile == 'ncu': 58 | torch.cuda.nvtx.range_push("start") 59 | elif profile == 'nsys': 60 | torch.cuda.profiler.cudart().cudaProfilerStart() 61 | 62 | if do_eval: 63 | with torch.no_grad(): 64 | output = model(data, target, mems) 65 | else: 66 | optimizer.zero_grad() 67 | loss, mems = model(data, target, mems) 68 | loss = loss.float().mean().type_as(loss) 69 | loss.backward() 70 | optimizer.step() 71 | 72 | if batch_idx == 9: 73 | if profile == 'ncu': 74 | torch.cuda.nvtx.range_pop() 75 | elif profile == 'nsys': 76 | torch.cuda.profiler.cudart().cudaProfilerStop() 77 | 78 | batch_idx += 1 79 | print(f"It took {time.time()-start_iter} sec") 80 | 81 | print("Done!") 82 | 83 | if __name__ == "__main__": 84 | transformer(4, 0, True, 'ncu') 85 | -------------------------------------------------------------------------------- /profiling/benchmarks/vision_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from platform import node 3 | import sched 4 | import sys 5 | import torch 6 | import torch.distributed as dist 7 | import torch.multiprocessing as mp 8 | import torchvision 9 | from torchvision import models, datasets, transforms 10 | from torch.nn.parallel import DistributedDataParallel as DDP 11 | import torch.nn.functional as F 12 | from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock 13 | from datetime import timedelta 14 | import random 15 | import numpy as np 16 | import time 17 | import os 18 | import argparse 19 | import threading 20 | 21 | print(torchvision.__file__) 22 | 23 | 24 | def vision(model_name, batchsize, local_rank, do_eval=True, profile=None): 25 | 26 | data = torch.ones([batchsize, 3, 224, 224], pin_memory=True).to(local_rank) 27 | target = torch.ones([batchsize], pin_memory=True).to(torch.long).to(local_rank) 28 | #data = torch.rand([batchsize, 2048]).to(local_rank) 29 | model = models.__dict__[model_name](num_classes=1000) 30 | model = model.to(local_rank) 31 | 32 | ''' 33 | train_dir = "/mnt/data/home/fot/imagenet/imagenet-raw-euwest4/" 34 | train_transform = transforms.Compose([ 35 | transforms.RandomResizedCrop(224), 36 | transforms.RandomHorizontalFlip(), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406),(0.229, 
0.224, 0.225))]) 39 | train_dataset = \ 40 | datasets.ImageFolder(train_dir,transform=train_transform) 41 | 42 | train_loader = torch.utils.data.DataLoader( 43 | train_dataset, batch_size=batchsize, num_workers=8) 44 | 45 | train_iter = enumerate(train_loader) 46 | ''' 47 | 48 | if do_eval: 49 | model.eval() 50 | else: 51 | model.train() 52 | optimizer = torch.optim.SGD(model.parameters(), lr=0.1) 53 | criterion = torch.nn.CrossEntropyLoss().to(local_rank) 54 | 55 | batch_idx = 0 56 | torch.cuda.synchronize() 57 | start_all = time.time() 58 | 59 | 60 | for batch_idx in range(1000): #batch in train_iter: 61 | 62 | #data, target = batch[0].to(local_rank), batch[1].to(local_rank) 63 | start = time.time() 64 | if batch_idx == 9: 65 | if profile == 'ncu': 66 | torch.cuda.nvtx.range_push("start") 67 | elif profile == 'nsys': 68 | torch.cuda.profiler.cudart().cudaProfilerStart() 69 | if do_eval: 70 | with torch.no_grad(): 71 | output = model(data) 72 | else: 73 | optimizer.zero_grad() 74 | output = model(data) 75 | loss = criterion(output, target) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | torch.cuda.synchronize() 80 | if batch_idx == 9: 81 | if profile == 'ncu': 82 | torch.cuda.nvtx.range_pop() 83 | elif profile == 'nsys': 84 | torch.cuda.profiler.cudart().cudaProfilerStop() 85 | #batch_idx += 1 86 | 87 | print(f"Iteration took {time.time()-start} sec") 88 | 89 | print(f"Done! It took {time.time()-start_all} sec") 90 | 91 | if __name__ == "__main__": 92 | vision('mobilenet_v2', 4, 0, True, 'ncu') 93 | -------------------------------------------------------------------------------- /profiling/postprocessing/get_num_blocks.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from math import ceil, floor 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--results_dir', type=str, required=True, 7 | help='path to directory containing the profiling files') 8 | parser.add_argument('--max_threads_sm', type=int, default=2048, 9 | help='maximum number of threads that can be active in an SM') 10 | parser.add_argument('--max_blocks_sm', type=int, default=80, 11 | help='maximum number of blocks that can be active in an SM') 12 | parser.add_argument('--max_shmem_sm', type=int, default=65536, 13 | help='maximum amount of shared memory (in bytes) per SM') 14 | parser.add_argument('--max_regs_sm', type=int, default=65536, 15 | help='maximum number of registers per SM') 16 | args = parser.parse_args() 17 | 18 | df = pd.read_csv(f'{args.results_dir}/output_ncu_processed.csv', index_col=0) 19 | 20 | max_threads_sm = args.max_threads_sm 21 | max_blocks_sm = args.max_blocks_sm 22 | max_shmem_sm = args.max_shmem_sm 23 | max_regs_sm = args.max_regs_sm 24 | 25 | sm_needed = [] 26 | 27 | for index, row in df.iterrows(): 28 | num_blocks = row['Grid'] 29 | num_threads = row['Number_of_threads'] 30 | threads_per_block = row['Block'] 31 | shmem_per_block = row['Static_shmem_per_block'] 32 | regs_per_thread = row['Registers_Per_Thread'] 33 | 34 | # from threads (floor: a partially resident block is not possible) 35 | blocks_per_sm_threads = floor(max_threads_sm/threads_per_block) 36 | 37 | # from shmem 38 | if shmem_per_block > 0: 39 | blocks_per_sm_shmem = floor(max_shmem_sm/shmem_per_block) 40 | else: 41 | blocks_per_sm_shmem = blocks_per_sm_threads 42 | 43 | # from registers (allocated per warp, rounded up to 256-register granularity) 44 | regs_per_warp = ceil(32*regs_per_thread/256) * 256 45 | warps_per_sm = floor((max_regs_sm/regs_per_warp)/4) * 4 46 | warps_per_block = ceil(threads_per_block/32) 47 | blocks_per_sm_regs = int(warps_per_sm/warps_per_block)
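    # Worked example for the three per-resource limits above (kernel values
    # assumed: 256 threads/block, no static shared memory, 32 registers/thread,
    # grid of 1000 blocks, default limits from the command-line arguments):
    #   threads:   floor(2048 / 256) = 8 blocks/SM
    #   shmem:     no static shared memory, falls back to the thread limit = 8
    #   registers: regs_per_warp = ceil(32 * 32 / 256) * 256 = 1024
    #              warps_per_sm = floor((65536 / 1024) / 4) * 4 = 64
    #              warps_per_block = ceil(256 / 32) = 8, so int(64 / 8) = 8
    # giving blocks_per_sm = min(8, 8, 8) = 8 and
    # sm_needed_kernel = ceil(1000 / 8) = 125 SMs' worth of work for this kernel.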
48 | 49 | blocks_per_sm = min(blocks_per_sm_threads, blocks_per_sm_shmem, blocks_per_sm_regs) 50 | sm_needed_kernel = ceil(num_blocks/blocks_per_sm) 51 | 52 | #print(blocks_per_sm, sm_needed_kernel) 53 | sm_needed.append(sm_needed_kernel) 54 | 55 | 56 | less = [x for x in sm_needed if x < 108] 57 | print(len(less), len(sm_needed)) 58 | 59 | df['SM_needed'] = sm_needed 60 | #print(df) 61 | df.to_csv(f'{args.results_dir}/output_ncu_sms.csv', index=0) 62 | -------------------------------------------------------------------------------- /profiling/postprocessing/process_ncu.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--results_dir', type=str, required=True, 6 | help='path to directory containing the profiling files') 7 | args = parser.parse_args() 8 | 9 | df = pd.read_csv(f'{args.results_dir}/output_ncu.csv', index_col=0) 10 | kernels = [] 11 | metrics_to_get = ['Duration', 'Block Size', 'Grid Size', 'Compute (SM) [%]', 'DRAM Throughput', 'Registers Per Thread', 'Static Shared Memory Per Block'] 12 | 13 | unique_kernel_names = set() 14 | 15 | for index, row in df.iterrows(): 16 | kernel = row['Kernel Name'] 17 | metric_name = row['Metric Name'] 18 | 19 | if metric_name == 'DRAM Frequency': 20 | kernels.append([kernel]) 21 | unique_kernel_names.add(kernel) 22 | elif metric_name in metrics_to_get: 23 | kernels[-1].append(row['Metric Value']) 24 | 25 | for x in unique_kernel_names: 26 | print(x) 27 | print("------------------------------------") 28 | 29 | 30 | for kernel in kernels: 31 | num_threads = int(kernel[-2]) * int(kernel[-3]) 32 | num_registers = num_threads * int(kernel[-1]) 33 | kernel += [num_threads, num_registers] 34 | 35 | 36 | print(len(kernels)) 37 | #print(kernels[0]) 38 | labels = ['Kernel_Name', 'DRAM_Throughput(%)', 'Duration(ns)', 'Compute(SM)(%)', 'Block', 'Grid', 'Registers_Per_Thread', 'Static_shmem_per_block', 'Number_of_threads', 'Number_of_registers'] 39 | 40 | 41 | 42 | df_new = pd.DataFrame(kernels, columns=labels) 43 | print(df_new) 44 | df_new.to_csv(f'{args.results_dir}/output_ncu_processed.csv') 45 | -------------------------------------------------------------------------------- /profiling/postprocessing/roofline_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--results_dir', type=str, required=True, 6 | help='path to directory containing the profiling files') 7 | parser.add_argument('--ai_threshold', type=float, default=9.72, 8 | help='arithmetic intensity that separates compute from memory bound kernels') 9 | args = parser.parse_args() 10 | 11 | df_raw = pd.read_csv(f'{args.results_dir}/raw_ncu.csv') 12 | 13 | startp = 0 14 | df_raw = df_raw.iloc[startp:] 15 | 16 | l = list(df_raw.iloc[0]) 17 | print(l) 18 | df_basic = pd.read_csv(f'{args.results_dir}/output_ncu_sms.csv', index_col=0) 19 | 20 | 21 | dram_throughput = df_basic['DRAM_Throughput(%)'] 22 | comp_throughput = df_basic['Compute(SM)(%)'] 23 | 24 | fadd = 'smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.per_cycle_elapsed [inst/cycle]' 25 | fmul = 'smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.per_cycle_elapsed [inst/cycle]' 26 | ffma = 'smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed [inst/cycle]' 27 | cycles_sec = 'smsp__cycles_elapsed.avg.per_second [cycle/nsecond]' 28 | bytes_sec = 'dram__bytes.sum.per_second [Gbyte/second]'
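# How the classification below works: arithmetic intensity is estimated from
# the per-cycle FP instruction counters as
#     AI = flops_per_sec / dram_bytes_per_sec
#        = (fadd + fmul + 2 * ffma) * cycles_per_sec / bytes_per_sec
# and a kernel is labeled compute-bound when AI exceeds --ai_threshold, the
# ridge point of the roofline (the 9.72 default presumably encodes the
# peak-FLOPs to peak-DRAM-bandwidth ratio of the GPU that was profiled).
# Worked example with assumed counter values: fadd = 1, fmul = 1, ffma = 4
# inst/cycle at 1.4 cycle/ns and 500 Gbyte/s of DRAM traffic gives
#     flops/cycle = 1 + 1 + 2 * 4 = 10
#     AI = 10 * 1.4 / 500 = 0.028 flops/byte, i.e. memory-bound (0.028 < 9.72).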
29 | 30 | ai_list = [] 31 | roofline_prof = [] # 1: comp, 0: mem, -1: invalid 32 | 33 | comp_bound = 0 34 | mem_bound = 0 35 | rest = 0 36 | 37 | for index, row in df_raw.iterrows(): 38 | add = str(row[fadd]) 39 | mul = str(row[fmul]) 40 | fma = row[ffma] 41 | cycles = row[cycles_sec] 42 | bytes = row[bytes_sec] 43 | #print(add, mul, fma, cycles, bytes) 44 | 45 | if not isinstance(fma, float): 46 | fma = float(fma.replace("'", '')) 47 | add = float(add.replace("'", '')) 48 | mul = float(mul.replace("'", '')) 49 | 50 | 51 | if add or mul or fma: 52 | flops_cycle = add+mul+fma*2 53 | flops_sec = flops_cycle * cycles 54 | ai = flops_sec/bytes 55 | ai_list.append(ai) 56 | print(index, ai) 57 | if ai > args.ai_threshold: 58 | roofline_prof.append(1) 59 | comp_bound += 1 60 | else: 61 | roofline_prof.append(0) 62 | mem_bound += 1 63 | else: 64 | ai_list.append(0.0) 65 | if comp_throughput[index-startp] >= 60.0: 66 | roofline_prof.append(1) 67 | elif dram_throughput[index-startp] >= 60.0: 68 | roofline_prof.append(0) 69 | else: 70 | roofline_prof.append(-1) 71 | rest += 1 72 | 73 | 74 | print(df_basic) 75 | df_basic['AI(flops/bytes)'] = ai_list 76 | df_basic['Roofline_prof'] = roofline_prof 77 | df_basic.to_csv(f'{args.results_dir}/output_ncu_sms_roofline.csv') 78 | 79 | print(f"comp bound: {comp_bound}, mem bound: {mem_bound}, rest: {rest}, total: {comp_bound+mem_bound+rest}") 80 | -------------------------------------------------------------------------------- /related/Tick-Tock/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "duration0": 96.88879942893982, 3 | "duration": 96.88879942893982 4 | } -------------------------------------------------------------------------------- /related/baselines/README.md: -------------------------------------------------------------------------------- 1 | # GPU Sharing Baselines 2 | This directory contains evaluations of GPU sharing techniques between two workloads. 3 | Supported baselines are `MPS`, `TickTock`, `Streams`, `Isolated`, and `Sequential`. 4 | 5 | [main.py](./main.py) is the entry point of the evaluation, and all configurations live in [config.yaml](./config.yaml). 6 | 7 | To evaluate a baseline, change the `policy` field in `config.yaml` to the baseline name. 8 | Then, run `python main.py --config config.yaml`. 9 | 10 | If no `--config` argument is provided, [config.yaml](./config.yaml) is used by default. 11 | 12 | 13 | ## Supported Baselines 14 | ### MPS 15 | [Multi-Process Service (MPS)](https://docs.nvidia.com/deploy/mps/index.html) is a feature of NVIDIA GPUs that allows multiple processes to share a single GPU. 16 | 17 | **Caveat!** There are a few extra steps to complete before executing the Python program: 18 | 1. Execute `./start_MPS_control_daemon.sh` to start the MPS server. 19 | 2. Export these two environment variables: 20 | ```shell 21 | export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps 22 | export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log 23 | ``` 24 | 3. Within the same shell session where you exported the environment variables, execute the Python program normally. 25 | 26 | ### TICK-TOCK scheduling 27 | 28 | This directory contains a basic implementation of TICK-TOCK scheduling using Python threads together with torch.cuda streams and events; the alternation idea is sketched below.
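A minimal sketch of the alternation idea, with two threads handing the GPU back and forth via `threading.Event`s (hypothetical code, for illustration only; see [utils/sync_control.py](./utils/sync_control.py) for the synchronization primitives the trainers actually use):

```python
import threading
import torch

def run_job(fwd, bwd, my_turn: threading.Event, peer_turn: threading.Event, iters: int):
    # One of two threads sharing the GPU tick-tock style: run our forward pass
    # ("tick"), hand over to the peer, then run our backward pass ("tock")
    # while the peer's forward pass executes on its own CUDA stream.
    stream = torch.cuda.Stream()
    for _ in range(iters):
        my_turn.wait()
        my_turn.clear()
        with torch.cuda.stream(stream):
            loss = fwd()
        stream.synchronize()
        peer_turn.set()  # the peer's "tick" now overlaps our "tock" below
        with torch.cuda.stream(stream):
            bwd(loss)

# Setting turn0 before starting both threads makes job 0 tick first:
#   turn0, turn1 = threading.Event(), threading.Event(); turn0.set()
#   threading.Thread(target=run_job, args=(fwd0, bwd0, turn0, turn1, 100)).start()
#   threading.Thread(target=run_job, args=(fwd1, bwd1, turn1, turn0, 100)).start()
```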
29 | The implementation is based on the description provided in [WAVELET: EFFICIENT DNN TRAINING WITH TICK-TOCK SCHEDULING (MLSys'21)](https://proceedings.mlsys.org/paper/2021/file/c81e728d9d4c2f636f067f89cc14862c-Paper.pdf). 30 | 31 | An interesting next step would be to implement the memory management support described in [Zico: Efficient GPU Memory Sharing for 32 | Concurrent DNN Training (ATC'21)](https://www.usenix.org/system/files/atc21-lim.pdf). 33 | 34 | ### Streams 35 | GPU streams provide a way to execute workloads concurrently on a single GPU. 36 | One stream captures a linear sequence of operations to be executed, and multiple streams can execute concurrently. 37 | 38 | ### Sequential 39 | `Sequential` represents the temporal sharing baseline where the GPU is time-sliced between the two workloads. 40 | 41 | ### Isolated 42 | To analyze the overhead of GPU sharing, we compare the performance of GPU sharing with the performance of executing 43 | the workload on a single GPU without sharing. For `Isolated`, we first execute workload A and start workload B only after A has finished. 44 | -------------------------------------------------------------------------------- /related/baselines/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/bert/__init__.py -------------------------------------------------------------------------------- /related/baselines/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | policy: "MPS" # "MPS", "TickTock", "Streams", "Isolated", or "Sequential" 3 | models: 4 | model0: 5 | mode: eval # train or eval 6 | name: resnet50 # these two names should strictly correspond to the model names below 7 | model1: 8 | mode: train # train or eval 9 | name: mobilenet_v2 10 | shared_config: 11 | distribution: poisson # poisson, uniform, or trace 12 | trace_path: './inter_arrival_times.json' # only used when distribution is trace 13 | pin_memory: true 14 | seed: 42 15 | 16 | # configuration for each model 17 | resnet50: 18 | arch: resnet50 19 | batch_size: 4 20 | num_iterations: 100 21 | request_rate: 15 # measured in 1/seconds. If 0 it means no sleep 22 | resnet101: 23 | arch: resnet101 24 | batch_size: 32 25 | num_iterations: 500 26 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 27 | mobilenet_v2: 28 | arch: mobilenet_v2 29 | batch_size: 64 30 | num_iterations: 10000 31 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 32 | bert: 33 | batch_size: 8 34 | arch: base # either base or large 35 | num_iterations: 500 36 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 37 | # large_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16' 38 | # base_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-12_H-768_A-12' 39 | transformer: 40 | arch: base # either base or large 41 | batch_size: 8 42 | num_iterations: 500 43 | request_rate: 0 # measured in 1/seconds. If 0 it means no sleep 44 | 45 | 46 | 47 | 48 | resnet50-1: 49 | arch: resnet50 50 | batch_size: 32 51 | num_iterations: 1000 52 | request_rate: 80 # measured in 1/seconds. If 0 it means no sleep 53 | resnet101-1: 54 | arch: resnet101 55 | batch_size: 32 56 | num_iterations: 1000 57 | request_rate: 40 # measured in 1/seconds. 
If 0 it means no sleep 58 | mobilenet_v2-1: 59 | arch: mobilenet_v2 60 | batch_size: 64 61 | num_iterations: 1000 62 | request_rate: 100 # measured in 1/seconds. If 0 it means no sleep 63 | bert-1: 64 | batch_size: 8 65 | arch: base # either base or large 66 | num_iterations: 1000 67 | request_rate: 8 # measured in 1/seconds. If 0 it means no sleep 68 | # large_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16' 69 | # base_model_dir: '/mnt/disks/disk-imagenet-gpu-share/home/fot/bert/download/google_pretrained_weights/uncased_L-12_H-768_A-12' 70 | transformer-1: 71 | arch: base # either base or large 72 | batch_size: 8 73 | num_iterations: 1000 74 | request_rate: 20 # measured in 1/seconds. If 0 it means no sleep 75 | -------------------------------------------------------------------------------- /related/baselines/dcgan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/dcgan/__init__.py -------------------------------------------------------------------------------- /related/baselines/dcgan/dcgan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | # code from https://github.com/pytorch/examples/blob/main/dcgan/main.py 5 | 6 | def weights_init(m): 7 | classname = m.__class__.__name__ 8 | if classname.find('Conv') != -1: 9 | torch.nn.init.normal_(m.weight, 0.0, 0.02) 10 | elif classname.find('BatchNorm') != -1: 11 | torch.nn.init.normal_(m.weight, 1.0, 0.02) 12 | torch.nn.init.zeros_(m.bias) 13 | 14 | # only training on one gpu 15 | # ngf = number of filters in the generator 16 | # nz = size of the latent z vector 17 | # nc = number of channels 18 | class Generator(nn.Module): 19 | def __init__(self, ngf: int, nc: int, nz: int): 20 | super(Generator, self).__init__() 21 | self.main = nn.Sequential( 22 | # input is Z, going into a convolution 23 | nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False), 24 | nn.BatchNorm2d(ngf * 8), 25 | nn.ReLU(True), 26 | # state size. (ngf*8) x 4 x 4 27 | nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False), 28 | nn.BatchNorm2d(ngf * 4), 29 | nn.ReLU(True), 30 | # state size. (ngf*4) x 8 x 8 31 | nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False), 32 | nn.BatchNorm2d(ngf * 2), 33 | nn.ReLU(True), 34 | # state size. (ngf*2) x 16 x 16 35 | nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False), 36 | nn.BatchNorm2d(ngf), 37 | nn.ReLU(True), 38 | # state size. (ngf) x 32 x 32 39 | nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False), 40 | nn.Tanh() 41 | # state size. (nc) x 64 x 64 42 | ) 43 | 44 | def forward(self, input): 45 | output = self.main(input) 46 | return output 47 | 48 | # ndf = number of filters in the discriminator 49 | # nc = number of channels 50 | class Discriminator(nn.Module): 51 | def __init__(self, ndf, nc): 52 | super(Discriminator, self).__init__() 53 | self.main = nn.Sequential( 54 | # input is (nc) x 64 x 64 55 | nn.Conv2d(nc, ndf, 4, 2, 1, bias=False), 56 | nn.LeakyReLU(0.2, inplace=True), 57 | # state size. (ndf) x 32 x 32 58 | nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False), 59 | nn.BatchNorm2d(ndf * 2), 60 | nn.LeakyReLU(0.2, inplace=True), 61 | # state size. (ndf*2) x 16 x 16 62 | nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False), 63 | nn.BatchNorm2d(ndf * 4), 64 | nn.LeakyReLU(0.2, inplace=True), 65 | # state size. 
(ndf*4) x 8 x 8 66 | nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False), 67 | nn.BatchNorm2d(ndf * 8), 68 | nn.LeakyReLU(0.2, inplace=True), 69 | # state size. (ndf*8) x 4 x 4 70 | nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False), 71 | nn.Sigmoid() 72 | ) 73 | 74 | def forward(self, input): 75 | output = self.main(input) 76 | return output.view(-1, 1).squeeze(1) 77 | 78 | -------------------------------------------------------------------------------- /related/baselines/gnmt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/gnmt/__init__.py -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/data/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Elad Hoffer 2 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | # SOFTWARE. 
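# The token strings and index constants below are kept in sync by convention:
# a vocabulary reserves its first four slots for the special tokens. Layout
# sketch (corpus words are made up):
#
#   vocab = [PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN, 'hello', 'world']
#   assert vocab[PAD] == PAD_TOKEN and vocab[EOS] == EOS_TOKEN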
21 | 22 | PAD_TOKEN = '<pad>' 23 | UNK_TOKEN = '<unk>' 24 | BOS_TOKEN = '<s>' 25 | EOS_TOKEN = '<\s>' 26 | 27 | # special PAD, UNKNOWN, BEGIN-OF-STRING, END-OF-STRING tokens 28 | PAD, UNK, BOS, EOS = [0, 1, 2, 3] 29 | 30 | # path to the moses detokenizer, relative to the data directory 31 | DETOKENIZER = 'mosesdecoder/scripts/tokenizer/detokenizer.perl' 32 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/gpu_affinity.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import math 3 | import os 4 | import pathlib 5 | import re 6 | 7 | import pynvml 8 | 9 | pynvml.nvmlInit() 10 | 11 | 12 | def systemGetDriverVersion(): 13 | return pynvml.nvmlSystemGetDriverVersion() 14 | 15 | 16 | def deviceGetCount(): 17 | return pynvml.nvmlDeviceGetCount() 18 | 19 | 20 | class device: 21 | # assume nvml returns list of 64 bit ints 22 | _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) 23 | 24 | def __init__(self, device_idx): 25 | super().__init__() 26 | self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) 27 | 28 | def getName(self): 29 | return pynvml.nvmlDeviceGetName(self.handle) 30 | 31 | def getCpuAffinity(self): 32 | affinity_string = '' 33 | for j in pynvml.nvmlDeviceGetCpuAffinity( 34 | self.handle, device._nvml_affinity_elements 35 | ): 36 | # assume nvml returns list of 64 bit ints 37 | affinity_string = '{:064b}'.format(j) + affinity_string 38 | affinity_list = [int(x) for x in affinity_string] 39 | affinity_list.reverse() # so core 0 is in 0th element of list 40 | 41 | ret = [i for i, e in enumerate(affinity_list) if e != 0] 42 | return ret 43 | 44 | 45 | def set_socket_affinity(gpu_id): 46 | dev = device(gpu_id) 47 | affinity = dev.getCpuAffinity() 48 | os.sched_setaffinity(0, affinity) 49 | 50 | 51 | def set_single_affinity(gpu_id): 52 | dev = device(gpu_id) 53 | affinity = dev.getCpuAffinity() 54 | os.sched_setaffinity(0, affinity[:1]) 55 | 56 | 57 | def set_single_unique_affinity(gpu_id, nproc_per_node): 58 | devices = [device(i) for i in range(nproc_per_node)] 59 | socket_affinities = [dev.getCpuAffinity() for dev in devices] 60 | 61 | siblings_list = get_thread_siblings_list() 62 | siblings_dict = dict(siblings_list) 63 | 64 | # remove siblings 65 | for idx, socket_affinity in enumerate(socket_affinities): 66 | socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) 67 | 68 | affinities = [] 69 | assigned = [] 70 | 71 | for socket_affinity in socket_affinities: 72 | for core in socket_affinity: 73 | if core not in assigned: 74 | affinities.append([core]) 75 | assigned.append(core) 76 | break 77 | os.sched_setaffinity(0, affinities[gpu_id]) 78 | 79 | 80 | def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): 81 | device_ids = [device(i) for i in range(nproc_per_node)] 82 | socket_affinities = [dev.getCpuAffinity() for dev in device_ids] 83 | 84 | siblings_list = get_thread_siblings_list() 85 | siblings_dict = dict(siblings_list) 86 | 87 | # remove siblings 88 | for idx, socket_affinity in enumerate(socket_affinities): 89 | socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) 90 | 91 | socket_affinities_to_device_ids = collections.defaultdict(list) 92 | 93 | for idx, socket_affinity in enumerate(socket_affinities): 94 | socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) 95 | 96 | for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): 97 | devices_per_group = 
len(device_ids) 98 | cores_per_device = len(socket_affinity) // devices_per_group 99 | for group_id, device_id in enumerate(device_ids): 100 | if device_id == gpu_id: 101 | if mode == 'interleaved': 102 | affinity = list(socket_affinity[group_id::devices_per_group]) 103 | elif mode == 'continuous': 104 | affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) 105 | else: 106 | raise RuntimeError('Unknown set_socket_unique_affinity mode') 107 | 108 | # reintroduce siblings 109 | affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] 110 | os.sched_setaffinity(0, affinity) 111 | 112 | 113 | def get_thread_siblings_list(): 114 | path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' 115 | thread_siblings_list = [] 116 | pattern = re.compile(r'(\d+)\D(\d+)') 117 | for fname in pathlib.Path(path[0]).glob(path[1:]): 118 | with open(fname) as f: 119 | content = f.read().strip() 120 | res = pattern.findall(content) 121 | if res: 122 | pair = tuple(map(int, res[0])) 123 | thread_siblings_list.append(pair) 124 | return thread_siblings_list 125 | 126 | 127 | def set_affinity(gpu_id, nproc_per_node, mode='socket'): 128 | if mode == 'socket': 129 | set_socket_affinity(gpu_id) 130 | elif mode == 'single': 131 | set_single_affinity(gpu_id) 132 | elif mode == 'single_unique': 133 | set_single_unique_affinity(gpu_id, nproc_per_node) 134 | elif mode == 'socket_unique_interleaved': 135 | set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') 136 | elif mode == 'socket_unique_continuous': 137 | set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') 138 | else: 139 | raise RuntimeError('Unknown affinity mode') 140 | 141 | affinity = os.sched_getaffinity(0) 142 | return affinity 143 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/inference/tables.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
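# Usage sketch for the table helpers defined below (values are made up; each
# write() call prints a Markdown table through pytablewriter):
#
#   acc = AccuracyTable(unit='BLEU')
#   acc.add((128, 5), {'fp32': 24.3, 'fp16': 24.1})   # key = (batch size, beam size)
#   acc.write('GNMT accuracy', write_math=['fp32', 'fp16'])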
20 | 21 | import collections 22 | import itertools 23 | 24 | import numpy as np 25 | from pytablewriter import MarkdownTableWriter 26 | 27 | 28 | def interleave(*args): 29 | return list(itertools.chain(*zip(*args))) 30 | 31 | 32 | class AccuracyTable: 33 | def __init__(self, unit): 34 | self.data = collections.defaultdict(dict) 35 | self.unit = unit 36 | 37 | def add(self, key, data): 38 | self.data[key].update(data) 39 | 40 | def write(self, title, write_math): 41 | writer = MarkdownTableWriter() 42 | writer.table_name = f'{title}' 43 | main_header = ['**Batch Size**', '**Beam Size**'] 44 | data_header = [] 45 | if 'fp32' in write_math: 46 | data_header += [f'**Accuracy - FP32 ({self.unit})**'] 47 | if 'tf32' in write_math: 48 | data_header += [f'**Accuracy - TF32 ({self.unit})**'] 49 | if 'fp16' in write_math: 50 | data_header += [f'**Accuracy - FP16 ({self.unit})**'] 51 | writer.headers = main_header + data_header 52 | 53 | writer.value_matrix = [] 54 | for k, v in self.data.items(): 55 | batch_size, beam_size = k 56 | row = [batch_size, beam_size] 57 | if 'fp32' in write_math: 58 | row.append(v['fp32']) 59 | if 'tf32' in write_math: 60 | row.append(v['tf32']) 61 | if 'fp16' in write_math: 62 | row.append(v['fp16']) 63 | writer.value_matrix.append(row) 64 | writer.write_table() 65 | 66 | 67 | class PerformanceTable: 68 | def __init__(self, percentiles, unit, reverse_percentiles=False): 69 | self.percentiles = percentiles 70 | self.data = collections.defaultdict(dict) 71 | self.unit = unit 72 | self.reverse_percentiles = reverse_percentiles 73 | 74 | def add(self, key, value): 75 | math, value = next(iter(value.items())) 76 | value = np.array(value) 77 | 78 | if self.reverse_percentiles: 79 | percentiles = [100 - p for p in self.percentiles] 80 | else: 81 | percentiles = self.percentiles 82 | 83 | stats = [] 84 | for p in percentiles: 85 | val = np.percentile(value, p) 86 | stats.append(val * self.unit_convert[self.unit]) 87 | 88 | avg = value.mean() * self.unit_convert[self.unit] 89 | 90 | self.data[key].update({math: (avg, stats)}) 91 | 92 | def write(self, title, math, relative=None, reverse_speedup=False): 93 | writer = MarkdownTableWriter() 94 | writer.table_name = f'{title} - {math.upper()}' 95 | main_header = ['**Batch Size**', '**Beam Size**'] 96 | data_header = [f'**Avg ({self.unit})**'] 97 | data_header += [f'**{p}% ({self.unit})**' for p in self.percentiles] 98 | 99 | if relative: 100 | speedup_header = ['**Speedup**'] * len(data_header) 101 | data_header = interleave(data_header, speedup_header) 102 | 103 | writer.headers = main_header + data_header 104 | 105 | writer.value_matrix = [] 106 | for k, v in self.data.items(): 107 | batch_size, beam_size = k 108 | avg, res_percentiles = v[math] 109 | main = [batch_size, beam_size] 110 | data = [avg, *res_percentiles] 111 | 112 | if relative: 113 | rel = self.data[k][relative] 114 | rel_avg, rel_res_percentiles = rel 115 | rel = [rel_avg, *rel_res_percentiles] 116 | speedup = [d / r for (r, d) in zip(rel, data)] 117 | if reverse_speedup: 118 | speedup = [1 / s for s in speedup] 119 | data = interleave(data, speedup) 120 | 121 | writer.value_matrix.append(main + data) 122 | writer.write_table() 123 | 124 | 125 | class LatencyTable(PerformanceTable): 126 | def __init__(self, percentiles, unit='ms'): 127 | super().__init__(percentiles, unit) 128 | self.unit_convert = {'s': 1, 'ms': 1e3, 'us': 1e6} 129 | 130 | 131 | class ThroughputTable(PerformanceTable): 132 | def __init__(self, percentiles, unit='tok/s', reverse_percentiles=True): 
133 | super().__init__(percentiles, unit, reverse_percentiles) 134 | self.unit_convert = {'tok/s': 1} 135 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/models/encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Elad Hoffer 2 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | # SOFTWARE. 21 | 22 | import torch.nn as nn 23 | from torch.nn.utils.rnn import pack_padded_sequence 24 | from torch.nn.utils.rnn import pad_packed_sequence 25 | 26 | import gnmt.seq2seq.data.config as config 27 | from gnmt.seq2seq.utils import init_lstm_ 28 | 29 | 30 | class ResidualRecurrentEncoder(nn.Module): 31 | """ 32 | Encoder with Embedding, LSTM layers, residual connections and optional 33 | dropout. 34 | 35 | The first LSTM layer is bidirectional and uses variable sequence length 36 | API, the remaining (num_layers-1) layers are unidirectional. Residual 37 | connections are enabled after third LSTM layer, dropout is applied on 38 | inputs to LSTM layers. 39 | """ 40 | def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2, 41 | batch_first=False, embedder=None, init_weight=0.1): 42 | """ 43 | Constructor for the ResidualRecurrentEncoder. 
44 | 45 | :param vocab_size: size of vocabulary 46 | :param hidden_size: hidden size for LSTM layers 47 | :param num_layers: number of LSTM layers, 1st layer is bidirectional 48 | :param dropout: probability of dropout (on input to LSTM layers) 49 | :param batch_first: if True the model uses (batch,seq,feature) tensors, 50 | if false the model uses (seq, batch, feature) 51 | :param embedder: instance of nn.Embedding, if None constructor will 52 | create new embedding layer 53 | :param init_weight: range for the uniform initializer 54 | """ 55 | super(ResidualRecurrentEncoder, self).__init__() 56 | self.batch_first = batch_first 57 | self.rnn_layers = nn.ModuleList() 58 | # 1st LSTM layer, bidirectional 59 | self.rnn_layers.append( 60 | nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True, 61 | batch_first=batch_first, bidirectional=True)) 62 | 63 | # 2nd LSTM layer, with 2x larger input_size 64 | self.rnn_layers.append( 65 | nn.LSTM((2 * hidden_size), hidden_size, num_layers=1, bias=True, 66 | batch_first=batch_first)) 67 | 68 | # Remaining LSTM layers 69 | for _ in range(num_layers - 2): 70 | self.rnn_layers.append( 71 | nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True, 72 | batch_first=batch_first)) 73 | 74 | for lstm in self.rnn_layers: 75 | init_lstm_(lstm, init_weight) 76 | 77 | self.dropout = nn.Dropout(p=dropout) 78 | 79 | if embedder is not None: 80 | self.embedder = embedder 81 | else: 82 | self.embedder = nn.Embedding(vocab_size, hidden_size, 83 | padding_idx=config.PAD) 84 | nn.init.uniform_(self.embedder.weight.data, -init_weight, 85 | init_weight) 86 | 87 | def forward(self, inputs, lengths): 88 | """ 89 | Execute the encoder. 90 | 91 | :param inputs: tensor with indices from the vocabulary 92 | :param lengths: vector with sequence lengths (excluding padding) 93 | 94 | returns: tensor with encoded sequences 95 | """ 96 | x = self.embedder(inputs) 97 | 98 | # bidirectional layer 99 | x = self.dropout(x) 100 | x = pack_padded_sequence(x, lengths.cpu(), 101 | batch_first=self.batch_first) 102 | x, _ = self.rnn_layers[0](x) 103 | x, _ = pad_packed_sequence(x, batch_first=self.batch_first) 104 | 105 | # 1st unidirectional layer 106 | x = self.dropout(x) 107 | x, _ = self.rnn_layers[1](x) 108 | 109 | # the rest of unidirectional layers, 110 | # with residual connections starting from 3rd layer 111 | for i in range(2, len(self.rnn_layers)): 112 | residual = x 113 | x = self.dropout(x) 114 | x, _ = self.rnn_layers[i](x) 115 | x = x + residual 116 | 117 | return x 118 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/models/gnmt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Elad Hoffer 2 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software. 
13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | # SOFTWARE. 21 | 22 | import torch.nn as nn 23 | 24 | import gnmt.seq2seq.data.config as config 25 | from gnmt.seq2seq.models.decoder import ResidualRecurrentDecoder 26 | from gnmt.seq2seq.models.encoder import ResidualRecurrentEncoder 27 | from gnmt.seq2seq.models.seq2seq_base import Seq2Seq 28 | 29 | 30 | class GNMT(Seq2Seq): 31 | """ 32 | GNMT v2 model 33 | """ 34 | def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2, 35 | batch_first=False, share_embedding=True): 36 | """ 37 | Constructor for the GNMT v2 model. 38 | 39 | :param vocab_size: size of vocabulary (number of tokens) 40 | :param hidden_size: internal hidden size of the model 41 | :param num_layers: number of layers, applies to both encoder and 42 | decoder 43 | :param dropout: probability of dropout (in encoder and decoder) 44 | :param batch_first: if True the model uses (batch,seq,feature) tensors, 45 | if false the model uses (seq, batch, feature) 46 | :param share_embedding: if True embeddings are shared between encoder 47 | and decoder 48 | """ 49 | 50 | super(GNMT, self).__init__(batch_first=batch_first) 51 | 52 | if share_embedding: 53 | embedder = nn.Embedding(vocab_size, hidden_size, 54 | padding_idx=config.PAD) 55 | nn.init.uniform_(embedder.weight.data, -0.1, 0.1) 56 | else: 57 | embedder = None 58 | 59 | self.encoder = ResidualRecurrentEncoder(vocab_size, hidden_size, 60 | num_layers, dropout, 61 | batch_first, embedder) 62 | 63 | self.decoder = ResidualRecurrentDecoder(vocab_size, hidden_size, 64 | num_layers, dropout, 65 | batch_first, embedder) 66 | 67 | def forward(self, input_encoder, input_enc_len, input_decoder): 68 | context = self.encode(input_encoder, input_enc_len) 69 | context = (context, input_enc_len, None) 70 | output, _, _ = self.decode(input_decoder, context) 71 | 72 | return output 73 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/models/seq2seq_base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Elad Hoffer 2 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | # SOFTWARE. 21 | 22 | import torch.nn as nn 23 | from torch.nn.functional import log_softmax 24 | 25 | 26 | class Seq2Seq(nn.Module): 27 | """ 28 | Generic Seq2Seq module, with an encoder and a decoder. 29 | """ 30 | def __init__(self, encoder=None, decoder=None, batch_first=False): 31 | """ 32 | Constructor for the Seq2Seq module. 33 | 34 | :param encoder: encoder module 35 | :param decoder: decoder module 36 | :param batch_first: if True the model uses (batch, seq, feature) 37 | tensors, if false the model uses (seq, batch, feature) tensors 38 | """ 39 | super(Seq2Seq, self).__init__() 40 | self.encoder = encoder 41 | self.decoder = decoder 42 | self.batch_first = batch_first 43 | 44 | def encode(self, inputs, lengths): 45 | """ 46 | Applies the encoder to inputs with a given input sequence lengths. 47 | 48 | :param inputs: tensor with inputs (batch, seq_len) if 'batch_first' 49 | else (seq_len, batch) 50 | :param lengths: vector with sequence lengths (excluding padding) 51 | """ 52 | return self.encoder(inputs, lengths) 53 | 54 | def decode(self, inputs, context, inference=False): 55 | """ 56 | Applies the decoder to inputs, given the context from the encoder. 57 | 58 | :param inputs: tensor with inputs (batch, seq_len) if 'batch_first' 59 | else (seq_len, batch) 60 | :param context: context from the encoder 61 | :param inference: if True inference mode, if False training mode 62 | """ 63 | return self.decoder(inputs, context, inference) 64 | 65 | def generate(self, inputs, context, beam_size): 66 | """ 67 | Autoregressive generator, works with SequenceGenerator class. 68 | Executes decoder (in inference mode), applies log_softmax and topK for 69 | inference with beam search decoding. 70 | 71 | :param inputs: tensor with inputs to the decoder 72 | :param context: context from the encoder 73 | :param beam_size: beam size for the generator 74 | 75 | returns: (words, logprobs, scores, new_context) 76 | words: indices of topK tokens 77 | logprobs: log probabilities of topK tokens 78 | scores: scores from the attention module (for coverage penalty) 79 | new_context: new decoder context, includes new hidden states for 80 | decoder RNN cells 81 | """ 82 | logits, scores, new_context = self.decode(inputs, context, True) 83 | logprobs = log_softmax(logits, dim=-1) 84 | logprobs, words = logprobs.topk(beam_size, dim=-1) 85 | return words, logprobs, scores, new_context 86 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/train/smoothing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | import torch 22 | import torch.nn as nn 23 | 24 | 25 | class LabelSmoothing(nn.Module): 26 | """ 27 | NLL loss with label smoothing. 28 | """ 29 | def __init__(self, padding_idx, smoothing=0.0): 30 | """ 31 | Constructor for the LabelSmoothing module. 32 | 33 | :param padding_idx: index of the PAD token 34 | :param smoothing: label smoothing factor 35 | """ 36 | super(LabelSmoothing, self).__init__() 37 | self.padding_idx = padding_idx 38 | self.confidence = 1.0 - smoothing 39 | self.smoothing = smoothing 40 | 41 | def forward(self, x, target): 42 | logprobs = torch.nn.functional.log_softmax(x, dim=-1, 43 | dtype=torch.float32) 44 | 45 | non_pad_mask = (target != self.padding_idx) 46 | nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)) 47 | nll_loss = nll_loss.squeeze(1)[non_pad_mask] 48 | smooth_loss = -logprobs.mean(dim=-1)[non_pad_mask] 49 | loss = self.confidence * nll_loss + self.smoothing * smooth_loss 50 | return loss.sum() 51 | -------------------------------------------------------------------------------- /related/baselines/gnmt/seq2seq/train/table.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
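# On LabelSmoothing (smoothing.py above): for a target class t and smoothing
# factor eps, each non-padding token contributes
#     loss = (1 - eps) * nll(t) + eps * mean_over_vocab(-log p)
# and forward() returns the sum over tokens. Numeric sketch with assumed
# values eps = 0.1, nll(t) = 0.5 and mean negative log-probability 2.0:
#     loss = 0.9 * 0.5 + 0.1 * 2.0 = 0.65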
20 | 21 | from pytablewriter import MarkdownTableWriter 22 | 23 | 24 | class TrainingTable: 25 | def __init__(self, acc_unit='BLEU', time_unit='min', perf_unit='tok/s'): 26 | self.data = [] 27 | self.acc_unit = acc_unit 28 | self.time_unit = time_unit 29 | self.perf_unit = perf_unit 30 | self.time_unit_convert = {'s': 1, 'min': 1/60, 'h': 1/3600} 31 | 32 | def add(self, gpus, batch_size, accuracy, perf, time_to_train): 33 | time_to_train *= self.time_unit_convert[self.time_unit] 34 | if not accuracy: 35 | accuracy = 0.0 36 | accuracy = round(accuracy, 2) 37 | self.data.append([gpus, batch_size, accuracy, perf, time_to_train]) 38 | 39 | def write(self, title, math): 40 | writer = MarkdownTableWriter() 41 | writer.table_name = f'{title}' 42 | 43 | header = [f'**GPUs**', 44 | f'**Batch Size / GPU**', 45 | f'**Accuracy - {math.upper()} ({self.acc_unit})**', 46 | f'**Throughput - {math.upper()} ({self.perf_unit})**', 47 | f'**Time to Train - {math.upper()} ({self.time_unit})**', 48 | ] 49 | writer.headers = header 50 | 51 | writer.value_matrix = self.data 52 | writer.write_table() 53 | -------------------------------------------------------------------------------- /related/baselines/nasnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/nasnet/__init__.py -------------------------------------------------------------------------------- /related/baselines/nasnet/train_nasnet.py: -------------------------------------------------------------------------------- 1 | import time 2 | from torchvision import models, datasets, transforms 3 | import torch 4 | import torch.nn.functional as F 5 | from nasnet.nasnet import NASNetALarge 6 | from nasnet.nasnet_mobile import NASNetAMobile 7 | from utils.sync_control import * 8 | 9 | 10 | def train_wrapper(sync_info, tid: int, model_config, shared_config): 11 | device = torch.device("cuda:0") 12 | my_stream = torch.cuda.Stream(device=device) 13 | arc = model_config['arc'] 14 | model = NASNetALarge(num_classes=1000) if arc == 'large' else NASNetAMobile(num_classes=1000) 15 | model = model.to(device) 16 | model.train() 17 | 18 | train_transform = transforms.Compose([ 19 | transforms.RandomResizedCrop(331 if arc == 'large' else 224), 20 | transforms.RandomHorizontalFlip(), 21 | transforms.ToTensor(), 22 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) 23 | 24 | train_dataset = \ 25 | datasets.ImageFolder(shared_config['imagenet_root'], transform=train_transform) 26 | 27 | train_loader = torch.utils.data.DataLoader( 28 | train_dataset, batch_size=model_config['batch_size'], shuffle=True, num_workers=model_config['num_workers']) 29 | metric_fn = F.cross_entropy 30 | optimizer_func = getattr(torch.optim, model_config['optimizer']) 31 | optimizer = optimizer_func(model.parameters(), lr=0.001) 32 | 33 | for batch_idx, batch in enumerate(train_loader): 34 | data, target = batch[0].to(device), batch[1].to(device) 35 | with ForwardControl(thread_id=tid, batch_idx=batch_idx, sync_info=sync_info, stream=my_stream): 36 | with torch.cuda.stream(my_stream): 37 | output = model(data) 38 | loss = metric_fn(output, target) 39 | 40 | with BackwardControl(thread_id=tid, batch_idx=batch_idx, sync_info=sync_info, stream=my_stream): 41 | with torch.cuda.stream(my_stream): 42 | loss.backward() 43 | optimizer.step() 44 | optimizer.zero_grad() 45 | 
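# The baseline trainers above (train_nasnet.py, and the other train_* files in
# this directory) all follow the same pattern: each phase of an iteration runs
# inside a ForwardControl / BackwardControl context manager from
# utils/sync_control.py, with work submitted on a private torch.cuda.Stream.
# A plausible minimal shape of such a guard is sketched below; it is
# illustrative only, not the repository's actual implementation.

from contextlib import contextmanager

@contextmanager
def phase_control(turn_event, done_event, stream):
    turn_event.wait()         # block until the scheduler grants this phase
    turn_event.clear()
    try:
        yield                 # the caller launches its kernels on `stream`
    finally:
        stream.synchronize()  # make sure this phase's kernels have finished
        done_event.set()      # signal the peer/scheduler that the phase ended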
-------------------------------------------------------------------------------- /related/baselines/requirements.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | wget 3 | -------------------------------------------------------------------------------- /related/baselines/retinanet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/retinanet/__init__.py -------------------------------------------------------------------------------- /related/baselines/retinanet/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/retinanet/model/__init__.py -------------------------------------------------------------------------------- /related/baselines/retinanet/model/focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def sigmoid_focal_loss( 6 | inputs: torch.Tensor, 7 | targets: torch.Tensor, 8 | alpha: float = 0.25, 9 | gamma: float = 2, 10 | reduction: str = "none", 11 | ): 12 | """ 13 | Original implementation from https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py . 14 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 15 | 16 | Args: 17 | inputs: A float tensor of arbitrary shape. 18 | The predictions for each example. 19 | targets: A float tensor with the same shape as inputs. Stores the binary 20 | classification label for each element in inputs 21 | (0 for the negative class and 1 for the positive class). 22 | alpha: (optional) Weighting factor in range (0,1) to balance 23 | positive vs negative examples or -1 for ignore. Default = 0.25 24 | gamma: Exponent of the modulating factor (1 - p_t) to 25 | balance easy vs hard examples. 26 | reduction: 'none' | 'mean' | 'sum' 27 | 'none': No reduction will be applied to the output. 28 | 'mean': The output will be averaged. 29 | 'sum': The output will be summed. 30 | Returns: 31 | Loss tensor with the reduction option applied. 32 | """ 33 | p = torch.sigmoid(inputs) 34 | ce_loss = F.binary_cross_entropy_with_logits( 35 | inputs, targets, reduction="none" 36 | ) 37 | p_t = p * targets + (1 - p) * (1 - targets) 38 | loss = ce_loss * ((1 - p_t) ** gamma) 39 | 40 | if alpha >= 0: 41 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 42 | loss = alpha_t * loss 43 | 44 | if reduction == "mean": 45 | loss = loss.mean() 46 | elif reduction == "sum": 47 | loss = loss.sum() 48 | 49 | return loss 50 | -------------------------------------------------------------------------------- /related/baselines/retinanet/model/image_list.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from typing import List, Tuple 4 | 5 | 6 | class ImageList(object): 7 | """ 8 | Structure that holds a list of images (of possibly 9 | varying sizes) as a single tensor. 
10 | This works by padding the images to the same size, 11 | and storing in a field the original sizes of each image 12 | """ 13 | 14 | def __init__(self, tensors: Tensor, image_sizes: List[Tuple[int, int]]): 15 | """ 16 | Args: 17 | tensors (tensor) 18 | image_sizes (list[tuple[int, int]]) 19 | """ 20 | self.tensors = tensors 21 | self.image_sizes = image_sizes 22 | 23 | def to(self, device: torch.device) -> 'ImageList': 24 | cast_tensor = self.tensors.to(device) 25 | return ImageList(cast_tensor, self.image_sizes) 26 | -------------------------------------------------------------------------------- /related/baselines/retinanet/model/roi_heads.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | 4 | import torch.nn.functional as F 5 | from torch import nn, Tensor 6 | 7 | from torchvision.ops import boxes as box_ops 8 | from torchvision.ops import roi_align 9 | 10 | from typing import Optional, List, Dict, Tuple 11 | 12 | from retinanet.model.utils import BoxCoder, Matcher 13 | 14 | 15 | def expand_boxes(boxes, scale): 16 | # type: (Tensor, float) -> Tensor 17 | w_half = (boxes[:, 2] - boxes[:, 0]) * .5 18 | h_half = (boxes[:, 3] - boxes[:, 1]) * .5 19 | x_c = (boxes[:, 2] + boxes[:, 0]) * .5 20 | y_c = (boxes[:, 3] + boxes[:, 1]) * .5 21 | 22 | w_half *= scale 23 | h_half *= scale 24 | 25 | boxes_exp = torch.zeros_like(boxes) 26 | boxes_exp[:, 0] = x_c - w_half 27 | boxes_exp[:, 2] = x_c + w_half 28 | boxes_exp[:, 1] = y_c - h_half 29 | boxes_exp[:, 3] = y_c + h_half 30 | return boxes_exp 31 | 32 | 33 | def expand_masks(mask, padding): 34 | # type: (Tensor, int) -> Tuple[Tensor, float] 35 | M = mask.shape[-1] 36 | scale = float(M + 2 * padding) / M 37 | padded_mask = F.pad(mask, (padding,) * 4) 38 | return padded_mask, scale 39 | 40 | 41 | def paste_mask_in_image(mask, box, im_h, im_w): 42 | # type: (Tensor, Tensor, int, int) -> Tensor 43 | TO_REMOVE = 1 44 | w = int(box[2] - box[0] + TO_REMOVE) 45 | h = int(box[3] - box[1] + TO_REMOVE) 46 | w = max(w, 1) 47 | h = max(h, 1) 48 | 49 | # Set shape to [batchxCxHxW] 50 | mask = mask.expand((1, 1, -1, -1)) 51 | 52 | # Resize mask 53 | mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False) 54 | mask = mask[0][0] 55 | 56 | im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device) 57 | x_0 = max(box[0], 0) 58 | x_1 = min(box[2] + 1, im_w) 59 | y_0 = max(box[1], 0) 60 | y_1 = min(box[3] + 1, im_h) 61 | 62 | im_mask[y_0:y_1, x_0:x_1] = mask[ 63 | (y_0 - box[1]):(y_1 - box[1]), (x_0 - box[0]):(x_1 - box[0]) 64 | ] 65 | return im_mask 66 | 67 | 68 | def paste_masks_in_image(masks, boxes, img_shape, padding=1): 69 | # type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor 70 | masks, scale = expand_masks(masks, padding=padding) 71 | boxes = expand_boxes(boxes, scale).to(dtype=torch.int64) 72 | im_h, im_w = img_shape 73 | res = [ 74 | paste_mask_in_image(m[0], b, im_h, im_w) 75 | for m, b in zip(masks, boxes) 76 | ] 77 | if len(res) > 0: 78 | ret = torch.stack(res, dim=0)[:, None] 79 | else: 80 | ret = masks.new_empty((0, 1, im_h, im_w)) 81 | return ret 82 | -------------------------------------------------------------------------------- /related/baselines/retinanet/presets.py: -------------------------------------------------------------------------------- 1 | import retinanet.transforms as T 2 | 3 | 4 | class DetectionPresetTrain: 5 | def __init__(self, data_augmentation, hflip_prob=0.5, mean=(123., 117., 104.)): 6 | if data_augmentation 
== 'hflip': 7 | self.transforms = T.Compose([ 8 | T.RandomHorizontalFlip(p=hflip_prob), 9 | T.ToTensor(), 10 | ]) 11 | elif data_augmentation == 'ssd': 12 | self.transforms = T.Compose([ 13 | T.RandomPhotometricDistort(), 14 | T.RandomZoomOut(fill=list(mean)), 15 | T.RandomIoUCrop(), 16 | T.RandomHorizontalFlip(p=hflip_prob), 17 | T.ToTensor(), 18 | ]) 19 | elif data_augmentation == 'ssdlite': 20 | self.transforms = T.Compose([ 21 | T.RandomIoUCrop(), 22 | T.RandomHorizontalFlip(p=hflip_prob), 23 | T.ToTensor(), 24 | ]) 25 | else: 26 | raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"') 27 | 28 | def __call__(self, img, target): 29 | return self.transforms(img, target) 30 | 31 | 32 | class DetectionPresetEval: 33 | def __init__(self): 34 | self.transforms = T.ToTensor() 35 | 36 | def __call__(self, img, target): 37 | return self.transforms(img, target) 38 | 39 | -------------------------------------------------------------------------------- /related/baselines/retinanet/train_retinanet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | import numpy as np 4 | import retinanet.presets as presets 5 | import time 6 | from utils.sync_control import * 7 | from retinanet.model.retinanet import retinanet_from_backbone 8 | from retinanet.coco_utils import get_openimages, get_coco 9 | import utils 10 | 11 | def get_dataset_fn(name, shared_config): 12 | paths = { 13 | "coco": (get_coco, 91, shared_config['coco_root']), 14 | "openimages": (get_openimages, 601, None), # Full openimages dataset 15 | "openimages-mlperf": (get_openimages, None, None), # L0 classes with more than 1000 samples; num_classes and root unset here 16 | } 17 | return paths[name] 18 | 19 | 20 | def get_transform(train, data_augmentation): 21 | return presets.DetectionPresetTrain(data_augmentation) if train else presets.DetectionPresetEval() 22 | 23 | 24 | def collate_fn(batch): 25 | return tuple(zip(*batch)) 26 | 27 | 28 | def train_wrapper(sync_info, tid: int, model_config, shared_config): 29 | device = torch.device("cuda:0") 30 | my_stream = torch.cuda.Stream(device=device) 31 | seed = int(time.time()) 32 | torch.manual_seed(seed) 33 | np.random.seed(seed=seed) 34 | 35 | dataset_fn, num_classes, data_path = get_dataset_fn(model_config['dataset_name'], shared_config) 36 | data_layout = "channels_last" 37 | batch_size = model_config['batch_size'] 38 | model = retinanet_from_backbone(backbone='resnext50_32x4d', 39 | num_classes=num_classes, 40 | image_size=[800, 800], 41 | data_layout=data_layout, 42 | pretrained=False, 43 | trainable_backbone_layers=3) 44 | model.to(device) 45 | if data_layout == 'channels_last': 46 | model = model.to(memory_format=torch.channels_last) 47 | 48 | params = [p for p in model.parameters() if p.requires_grad] 49 | optimizer = torch.optim.Adam(params, lr=0.0001) 50 | 51 | # GradScaler for AMP 52 | scaler = torch.cuda.amp.GradScaler(enabled=model_config['use_amp']) 53 | 54 | dataset = dataset_fn(name=model_config['dataset_name'], 55 | root=data_path, 56 | image_set="train", 57 | transforms=get_transform(True, 'hflip')) 58 | train_sampler = torch.utils.data.RandomSampler(dataset) 59 | train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, batch_size, drop_last=True) 60 | data_loader = torch.utils.data.DataLoader( 61 | dataset, batch_sampler=train_batch_sampler, num_workers=model_config['num_workers'], 62 | pin_memory=False, collate_fn=collate_fn) 63 | 64 | model.train() 65 | 66 | num_iterations = model_config['num_iterations']
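# num_iterations bounds the whole loop below; the first warm_up_iters batches are untimed warm-up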
67 | warm_up_iters = model_config['warm_up_iters'] 68 | if shared_config['use_dummy_data']: 69 | train_dataloader_iter = iter(data_loader) 70 | images, targets = next(train_dataloader_iter) 71 | images = list(image.to(device) for image in images) 72 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 73 | virtual_loader = utils.DummyDataLoader(batch=(images, targets)) 74 | else: 75 | virtual_loader = data_loader 76 | 77 | logging.info(f'retinanet is set up with {num_iterations} iterations') 78 | 79 | for batch_idx, (images, targets) in enumerate(virtual_loader): 80 | if batch_idx == warm_up_iters: 81 | # finish previous work 82 | torch.cuda.synchronize(device) 83 | if not sync_info.no_sync_control: 84 | sync_info.barrier.wait() 85 | # start timer 86 | start_time = time.time() 87 | 88 | images = list(image.to(device) for image in images) 89 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 90 | with ForwardControl(thread_id=tid, batch_idx=batch_idx, sync_info=sync_info, stream=my_stream): 91 | with torch.cuda.stream(my_stream): 92 | loss_dict = model(images, targets) 93 | losses = sum(loss for loss in loss_dict.values()) 94 | 95 | with BackwardControl(thread_id=tid, batch_idx=batch_idx, sync_info=sync_info, stream=my_stream): 96 | with torch.cuda.stream(my_stream): 97 | scaler.scale(losses).backward() 98 | scaler.step(optimizer) 99 | scaler.update() 100 | optimizer.zero_grad() 101 | 102 | if batch_idx == num_iterations - 1: 103 | # reached the last iteration 104 | break 105 | 106 | sync_info.no_sync_control = True 107 | torch.cuda.synchronize(device) 108 | 109 | duration = time.time() - start_time 110 | logging.info(f'tid {tid}: it took {duration} seconds to train retinanet') 111 | return duration 112 | -------------------------------------------------------------------------------- /related/baselines/run_wrapper.sh: -------------------------------------------------------------------------------- 1 | datestr=$(date '+%H-%M-%S-%Y-%m-%d') 2 | 3 | python run.py > ${datestr}_output.log 2>&1 & 4 | disown 5 | -------------------------------------------------------------------------------- /related/baselines/start_MPS_control_daemon.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 # Select GPU 0. 2 | 3 | export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps # Select a location that’s accessible to the given $UID 4 | 5 | export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log # Select a location that’s accessible to the given $UID 6 | 7 | nvidia-cuda-mps-control -d # Start the daemon.
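# Clients started after this point are funneled through MPS as long as they
# see the same pipe directory, e.g. (hypothetical invocation, not part of this script):
#   CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps python3 run.py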
8 | -------------------------------------------------------------------------------- /related/baselines/stop_MPS_control_daemon.sh: -------------------------------------------------------------------------------- 1 | echo quit | nvidia-cuda-mps-control 2 | -------------------------------------------------------------------------------- /related/baselines/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/transformer/__init__.py -------------------------------------------------------------------------------- /related/baselines/transformer/transformer_consts.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # these configs serve as constants and are not supposed to be tuned; not all fields are used 3 | base: 4 | cuda: true 5 | n_layer: 16 6 | d_model: 512 7 | n_head: 8 8 | d_head: 64 9 | d_inner: 2048 10 | dropout: 0.1 11 | dropatt: 0.0 12 | optim: jitlamb 13 | lr: 0.01 14 | eta_min: 0.001 15 | roll: true 16 | warmup_step: 1000 17 | max_step: 40000 18 | tgt_len: 192 19 | mem_len: 192 20 | init_std: 0.02 21 | eval_tgt_len: 192 22 | log_interval: 10 23 | eval_interval: 5000 24 | vocab: word 25 | adaptive: true 26 | div_val: 1 27 | 28 | large: 29 | cuda: true 30 | n_layer: 18 31 | d_model: 1024 32 | n_head: 16 33 | d_head: 64 34 | d_inner: 4096 35 | dropout: 0.2 36 | dropatt: 0.2 37 | optim: jitlamb 38 | lr: 0.01 39 | eta_min: 0.0001 40 | roll: true 41 | warmup_step: 16000 42 | max_step: 100000 43 | tgt_len: 384 44 | mem_len: 384 45 | init_std: 0.005 46 | eval_tgt_len: 128 47 | log_interval: 100 48 | eval_interval: 5000 49 | vocab: word 50 | adaptive: true 51 | div_val: 4 52 | -------------------------------------------------------------------------------- /related/baselines/transformer/transformer_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/transformer/transformer_utils/__init__.py -------------------------------------------------------------------------------- /related/baselines/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | import json 4 | import itertools 5 | import torch 6 | from numpy import random 7 | import numpy as np 8 | import time 9 | from statistics import mean 10 | from utils.sync_info import BasicSyncInfo 11 | def pretty_time(): 12 | return datetime.now().strftime('%d-%m-%Y-%H-%M-%S') 13 | 14 | 15 | def dict2pretty_str(dict_data): 16 | return json.dumps(dict_data, indent=4) 17 | 18 | 19 | class DummyDataLoader: 20 | def __init__(self, batch): 21 | self.batch = batch 22 | 23 | def __iter__(self): 24 | return itertools.repeat(self.batch) 25 | 26 | 27 | percentile_positions = [50, 90, 95, 99] 28 | def measure(func, num_requests, num_warm_up_reqs, request_rate, tid, shared_config, stream, sync_info: BasicSyncInfo): 29 | """ 30 | Invoke the func {num_requests} times with first {num_warm_up_reqs} iterations as warm up. 31 | Measure how long each invocation takes and calculate statistics (average and percentiles) over them, 32 | and finally write all data via {sync_info}. 
33 | """ 34 | distribution = shared_config['distribution'] 35 | if distribution=='trace' and tid==1: 36 | # uniform distribution for tid 1 37 | distribution = 'uniform' 38 | 39 | if request_rate == 0: 40 | intervals = [0] * num_requests 41 | else: 42 | scale = 1 / request_rate 43 | if distribution == 'trace': 44 | with open(shared_config['trace_path']) as f: 45 | intervals = json.load(f) 46 | num_requests = len(intervals) 47 | elif distribution == 'poisson': 48 | intervals = random.exponential(scale=scale, size=(num_requests,)) 49 | elif distribution == 'uniform': 50 | intervals = [scale] * num_requests 51 | else: 52 | raise NotImplementedError(f'unsupported distribution {distribution}') 53 | 54 | 55 | latency_history = [] 56 | 57 | with torch.no_grad(): 58 | next_startup = time.time() 59 | iteration = 0 60 | while True: 61 | if time.time() >= next_startup: 62 | if iteration == num_warm_up_reqs: 63 | sync_info.pre_measurement_prep(tid) 64 | entire_inference_start_time = time.time() 65 | # reset next_startup to have clear setup 66 | next_startup = entire_inference_start_time 67 | 68 | with torch.cuda.stream(stream): 69 | func() 70 | stream.synchronize() 71 | latency_history.append(1000 * (time.time() - next_startup)) 72 | 73 | if not sync_info.should_continue_loop(tid, iteration, num_requests): 74 | break 75 | 76 | next_startup += intervals[iteration] 77 | 78 | duration = next_startup - time.time() 79 | 80 | if duration > 0: 81 | time.sleep(duration) 82 | iteration += 1 83 | 84 | inference_duration = time.time() - entire_inference_start_time 85 | sync_info.post_measurement_prep(tid) 86 | # discard the first {num_warm_up_reqs} latencies 87 | latency_history = latency_history[num_warm_up_reqs:] 88 | mean_latency = mean(latency_history) 89 | percentiles = np.percentile(latency_history, percentile_positions) 90 | 91 | # data_to_record = { 92 | # f'latencies{tid}': latency_history, 93 | # f'mean_latency{tid}': mean_latency, 94 | # f'duration{tid}': inference_duration, 95 | # f'iterations{tid}': iteration + 1, 96 | # } 97 | # record percentiles 98 | data_to_record = {} 99 | for idx, percentile_pos in enumerate(percentile_positions): 100 | data_to_record[f'p{percentile_pos}-latency-{tid}'] = percentiles[idx] 101 | data_to_record[f'throughput-{tid}'] = (iteration-num_warm_up_reqs)/inference_duration 102 | # write all data to the data file 103 | sync_info.write_kvs(data_to_record) 104 | 105 | 106 | 107 | def seed_everything(seed: int): 108 | import random, os 109 | import numpy as np 110 | 111 | random.seed(seed) 112 | os.environ['PYTHONHASHSEED'] = str(seed) 113 | np.random.seed(seed) 114 | torch.manual_seed(seed) 115 | torch.cuda.manual_seed(seed) 116 | -------------------------------------------------------------------------------- /related/baselines/utils/data_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class DataManager: 5 | """ 6 | A class to encapsulate all the logic regarding writing the structured experiment results to a json file. 7 | """ 8 | 9 | def __init__(self, experiment_data_json_file): 10 | self.experiment_data_json_file = experiment_data_json_file 11 | # init the file 12 | self._dump_dict({}) 13 | 14 | def write_kv(self, key, value): 15 | """ 16 | Write the key-value pair to the json data file. 17 | 18 | This method is NOT thread/process-safe, the caller needs a 19 | synchronization mechanism, e.g. a lock, to ensure at most one writer exists at any time. 
20 | """ 21 | with open(self.experiment_data_json_file, 'r') as f: 22 | dict_data = json.load(f) 23 | 24 | dict_data[key] = value 25 | self._dump_dict(dict_data) 26 | 27 | def write_kvs(self, kv_pairs): 28 | """ 29 | Write many key-value pairs to the json data file. 30 | 31 | This method is NOT thread/process-safe, the caller needs a 32 | synchronization mechanism, e.g. a lock, to eusure at most one writer exists at any time. 33 | """ 34 | dict_data = self.read_dict() 35 | 36 | dict_data.update(kv_pairs) 37 | self._dump_dict(dict_data) 38 | 39 | def _dump_dict(self, dict_data): 40 | with open(self.experiment_data_json_file, 'w') as f: 41 | json.dump(dict_data, f, indent=4) 42 | 43 | def read_dict(self): 44 | with open(self.experiment_data_json_file, 'r') as f: 45 | dict_data = json.load(f) 46 | 47 | return dict_data 48 | -------------------------------------------------------------------------------- /related/baselines/utils/sync_control.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from utils.sync_info import BasicSyncInfo 4 | import torch 5 | import logging 6 | 7 | 8 | # These classes make use of the `with` pattern in Python 9 | # to centralize tick-tock synchronization logic 10 | 11 | class ForwardControl: 12 | 13 | def __init__(self, thread_id: int, batch_idx: int, sync_info: BasicSyncInfo, stream: torch.cuda.Stream) -> None: 14 | # we assume thread 0 starts first 15 | if thread_id not in {0, 1}: 16 | raise ValueError("thread_id can be either zero or one") 17 | 18 | self.sync_info = sync_info 19 | self.thread_id = thread_id 20 | self.batch_idx = batch_idx 21 | self.stream = stream 22 | 23 | def __enter__(self) -> None: 24 | if self.sync_info.no_sync_control: 25 | return 26 | logging.debug(f'thread {self.thread_id} starts FORWARD {self.batch_idx}') 27 | if self.thread_id == 0: 28 | self.sync_info.eventf1.wait() 29 | self.sync_info.event_cudaf1.wait(self.stream) 30 | self.sync_info.eventf1.clear() 31 | else: 32 | self.sync_info.eventf0.wait() 33 | self.sync_info.event_cudaf0.wait(self.stream) 34 | self.sync_info.eventf0.clear() 35 | 36 | 37 | def __exit__(self, exc_type, exc_val, exc_tb) -> bool: 38 | if self.sync_info.no_sync_control: 39 | return exc_type is None 40 | logging.debug(f'thread {self.thread_id} ends FORWARD {self.batch_idx}') 41 | if self.thread_id == 0: 42 | self.sync_info.event_cudaf0.record(self.stream) 43 | self.sync_info.eventf0.set() 44 | else: 45 | self.sync_info.event_cudaf1.record(self.stream) 46 | self.sync_info.eventf1.set() 47 | # raise the exception as is if there is any 48 | return exc_type is None 49 | 50 | 51 | class BackwardControl: 52 | 53 | def __init__(self, thread_id: int, batch_idx: int, sync_info: BasicSyncInfo, stream: torch.cuda.Stream) -> None: 54 | # we assume thread 0 starts first 55 | if thread_id not in {0, 1}: 56 | raise ValueError("thread_id can be either zero or one") 57 | 58 | self.sync_info = sync_info 59 | self.thread_id = thread_id 60 | self.batch_idx = batch_idx 61 | self.stream = stream 62 | 63 | def __enter__(self) -> None: 64 | if self.sync_info.no_sync_control: 65 | return 66 | logging.debug(f'thread {self.thread_id} starts BACKWARD {self.batch_idx}') 67 | if self.thread_id == 0: 68 | self.sync_info.eventb1.wait() 69 | self.sync_info.event_cudab1.wait(self.stream) 70 | self.sync_info.eventb1.clear() 71 | else: 72 | self.sync_info.eventb0.wait() 73 | self.sync_info.event_cudab0.wait(self.stream) 74 | self.sync_info.eventb0.clear() 75 | 76 | def __exit__(self, exc_type, 
exc_val, exc_tb) -> bool: 77 | if self.sync_info.no_sync_control: 78 | return exc_type is None 79 | logging.debug(f'thread {self.thread_id} ends BACKWARD {self.batch_idx}') 80 | if self.thread_id == 0: 81 | self.sync_info.event_cudab0.record(self.stream) 82 | self.sync_info.eventb0.set() 83 | else: 84 | self.sync_info.event_cudab1.record(self.stream) 85 | self.sync_info.eventb1.set() 86 | 87 | # raise the exception as is if there is any 88 | return exc_type is None 89 | 90 | -------------------------------------------------------------------------------- /related/baselines/utils/sync_info.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import torch 3 | import multiprocessing 4 | import time 5 | from utils.data_manager import DataManager 6 | 7 | class BasicSyncInfo: 8 | def __init__(self, data_manager: DataManager, no_sync_control: bool): 9 | self.no_sync_control = no_sync_control 10 | self.data_manager = data_manager 11 | 12 | def pre_measurement_prep(self, tid): 13 | return 14 | 15 | def post_measurement_prep(self, tid): 16 | return 17 | 18 | def write_kv(self, key, value): 19 | self.data_manager.write_kv(key, value) 20 | 21 | def write_kvs(self, kv_pairs): 22 | self.data_manager.write_kvs(kv_pairs) 23 | 24 | def should_continue_loop(self, tid: int, current_iteration: int, total_iterations: int): 25 | return current_iteration < total_iterations - 1 26 | 27 | 28 | class TickTockSyncInfo(BasicSyncInfo): 29 | 30 | def __init__(self, data_manager: DataManager) -> None: 31 | super().__init__(data_manager, no_sync_control=False) 32 | self.barrier = threading.Barrier(2) 33 | self.lock = threading.Lock() 34 | # thread events - for thread synchronization 35 | eventf0 = threading.Event() 36 | eventb0 = threading.Event() 37 | 38 | eventf1 = threading.Event() 39 | eventb1 = threading.Event() 40 | 41 | event_cudaf0 = torch.cuda.Event() 42 | event_cudab0 = torch.cuda.Event() 43 | 44 | event_cudaf1 = torch.cuda.Event() 45 | event_cudab1 = torch.cuda.Event() 46 | 47 | eventf1.set() # t0 starts 48 | eventb1.set() 49 | 50 | self.eventf0 = eventf0 51 | self.eventf1 = eventf1 52 | self.eventb0 = eventb0 53 | self.eventb1 = eventb1 54 | self.event_cudaf0 = event_cudaf0 55 | self.event_cudab0 = event_cudab0 56 | self.event_cudaf1 = event_cudaf1 57 | self.event_cudab1 = event_cudab1 58 | self.start_time = None 59 | 60 | def pre_measurement_prep(self, tid): 61 | self.barrier.wait() 62 | 63 | if tid == 0: 64 | self.start_time = time.time() 65 | 66 | def post_measurement_prep(self, tid): 67 | self.no_sync_control = True 68 | # the other thread might already enter next forward control 69 | # before setting `no_sync_control`; set the flags to make it continue 70 | self.eventf0.set() 71 | self.eventf1.set() 72 | self.barrier.wait() 73 | if tid == 0: 74 | duration = time.time() - self.start_time 75 | self.write_kv('duration', duration) 76 | 77 | def write_kv(self, key, value): 78 | with self.lock: 79 | super().write_kv(key, value) 80 | 81 | def write_kvs(self, kv_pairs): 82 | with self.lock: 83 | super().write_kvs(kv_pairs) 84 | 85 | 86 | class ConcurrentSyncInfo(BasicSyncInfo): 87 | def __init__(self, data_manager: DataManager, num_clients, isolation_level): 88 | super().__init__(data_manager, no_sync_control=True) 89 | self.isolation_level = isolation_level 90 | assert isolation_level in ['thread', 'process'] 91 | if isolation_level == 'thread': 92 | self.barrier = threading.Barrier(num_clients) 93 | self.lock = threading.Lock() 94 | self.stop_signal =
threading.Event() 95 | else: 96 | self.barrier = multiprocessing.Barrier(num_clients) 97 | self.lock = multiprocessing.Lock() 98 | self.stop_signal = multiprocessing.Event() 99 | self.start_time = None 100 | 101 | def pre_measurement_prep(self, tid): 102 | self.barrier.wait() 103 | if tid == 0: 104 | self.start_time = time.time() 105 | 106 | def post_measurement_prep(self, tid): 107 | # let the other part break out of the loop 108 | self.stop_signal.set() 109 | self.barrier.wait() 110 | if tid == 0: 111 | duration = time.time() - self.start_time 112 | self.write_kv("duration", duration) 113 | 114 | def write_kv(self, key, value): 115 | with self.lock: 116 | super().write_kv(key, value) 117 | 118 | def write_kvs(self, kv_pairs): 119 | with self.lock: 120 | super().write_kvs(kv_pairs) 121 | 122 | def should_continue_loop(self, tid: int, current_iteration: int, total_iterations: int): 123 | if tid == 0: 124 | return super().should_continue_loop(tid, current_iteration, total_iterations) 125 | else: 126 | return not self.stop_signal.is_set() 127 | -------------------------------------------------------------------------------- /related/baselines/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/orion/aded65dda4b5e6104133bec31f54f170315df217/related/baselines/vision/__init__.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='orion', 5 | packages=find_packages(), 6 | version='0.1.0', 7 | description='Orion library', 8 | author='EASL', 9 | ) 10 | -------------------------------------------------------------------------------- /setup/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | WORKDIR /root 3 | 4 | RUN rm /etc/apt/sources.list.d/cuda.list 5 | RUN rm /etc/apt/sources.list.d/nvidia-ml.list 6 | RUN apt-get -y update 7 | RUN apt install -y software-properties-common 8 | RUN apt-get install -y vim wget git 9 | RUN apt install -y libjpeg-dev zlib1g-dev 10 | 11 | 12 | RUN apt -y install build-essential libssl-dev 13 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.19.6/cmake-3.19.6.tar.gz 14 | RUN tar -zxvf cmake-3.19.6.tar.gz 15 | RUN cd cmake-3.19.6 && ./bootstrap && make && make install && cp bin/cmake /bin/ && cd .. 16 | 17 | RUN apt update -y 18 | RUN apt install software-properties-common -y 19 | RUN add-apt-repository ppa:deadsnakes/ppa 20 | RUN apt install python3.8-dev -y 21 | 22 | RUN apt-get -y install python3-pip 23 | RUN python3.8 -m pip install --upgrade pip 24 | RUN python3.8 -m pip install pyyaml typing_extensions 25 | RUN python3.8 -m pip install Pillow 26 | RUN python3.8 -m pip install numpy 27 | 28 | RUN git clone --recursive https://github.com/pytorch/pytorch 29 | COPY orion-torch-changes.patch /root/pytorch/ 30 | RUN cd pytorch && git reset --hard 67ece03c8cd632cce9523cd96efde6f2d1cc8121 && git apply orion-torch-changes.patch && git submodule sync && git submodule update --init --recursive --jobs 0 && python3.8 setup.py develop && cd .. 31 | 32 | RUN git clone https://github.com/pytorch/vision.git 33 | RUN cd vision && git reset --hard da3794e90c7cf69348f5446471926729c55f243e && python3.8 setup.py develop && cd .. 
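# Optional sanity check: uncomment to verify that the patched torch and torchvision builds import cleanly.
# RUN python3.8 -c "import torch, torchvision; print(torch.__version__)"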
34 | 35 | RUN echo "alias python=python3.8" >> /root/.bashrc 36 | SHELL ["source" , "/root/.bashrc"] 37 | SHELL ["/bin/sh", "-c"] 38 | 39 | 40 | RUN git clone https://github.com/NVIDIA/DeepLearningExamples.git 41 | COPY nvidia_deeplearning_changes.patch /root/DeepLearningExamples/ 42 | 43 | RUN cd DeepLearningExamples/ && git reset --hard 6610c05c330b887744993fca30532cbb9561cbde && git apply nvidia_deeplearning_changes.patch 44 | RUN cd /root/DeepLearningExamples/PyTorch/LanguageModeling/BERT && export BERT_PREP_WORKING_DIR=/root/DeepLearningExamples/PyTorch/LanguageModeling/BERT && python3.8 -m pip install -r requirements.txt && python3.8 -m pip install wget && bash data/create_datasets_from_start.sh 45 | 46 | RUN cd /root/DeepLearningExamples/PyTorch/LanguageModeling/Transformer-XL && pip install -r requirements.txt && bash getdata.sh 47 | -------------------------------------------------------------------------------- /setup/README.md: -------------------------------------------------------------------------------- 1 | We have set up a docker image: [fotstrt/orion-ae](https://hub.docker.com/repository/docker/fotstrt/orion-ae/general) with all packages pre-installed. 2 | This directory contains the Dockerfile used to create the image. 3 | 4 | If the user does not want to use this image, then please follow these steps: 5 | 6 | * Install CUDA 10.2 and CUDNN 7.6.5 (or use a base image containing both, such as: `nvcr.io/nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04` ) 7 | * Run `install.sh` 8 | * Install PyTorch from source: 9 | * `git clone --recursive https://github.com/pytorch/pytorch` 10 | * `cd pytorch` 11 | * `git reset --hard 67ece03c8cd632cce9523cd96efde6f2d1cc8121` 12 | * Apply a patch of changes for Orion: `git apply orion-torch-changes.patch` 13 | * `git submodule sync` 14 | * `git submodule update --init --recursive --jobs 0` 15 | * `python3.8 setup.py develop` 16 | 17 | * Install Torchvision from source: 18 | * `git clone https://github.com/pytorch/vision.git` 19 | * `cd vision` 20 | * `git reset --hard da3794e90c7cf69348f5446471926729c55f243e` 21 | * `python3.8 setup.py develop` 22 | -------------------------------------------------------------------------------- /setup/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get update 4 | sudo apt install software-properties-common 5 | sudo apt-get install vim wget git 6 | sudo apt install libjpeg-dev zlib1g-dev 7 | 8 | # cmake 9 | 10 | sudo apt install build-essential libssl-dev 11 | wget https://github.com/Kitware/CMake/releases/download/v3.19.6/cmake-3.19.6.tar.gz 12 | tar -zxvf cmake-3.19.6.tar.gz 13 | cd cmake-3.19.6 14 | ./bootstrap 15 | make 16 | sudo make install 17 | cp bin/cmake /bin/ 18 | cd .. 
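# optional check: the freshly built cmake should now be on PATH
# cmake --version   # expect 3.19.6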
19 | 20 | # python 21 | 22 | sudo apt update 23 | sudo apt install software-properties-common 24 | sudo add-apt-repository ppa:deadsnakes/ppa 25 | sudo apt install python3.8-dev 26 | 27 | # pip 28 | 29 | sudo apt-get -y install python3-pip 30 | python3.8 -m pip install --upgrade pip 31 | python3.8 -m pip install pyyaml typing_extensions 32 | python3.8 -m pip install Pillow 33 | python3.8 -m pip install numpy -------------------------------------------------------------------------------- /src/cuda_capture/Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | CUDAINCLUDE=/usr/local/cuda-10.2/include/ 3 | CUDALIB=/usr/local/cuda-10.2/lib64 4 | 5 | libinttemp.so: utils_interc.cpp intercept_cudnn.cpp intercept_cublas.cpp intercept_temp.cpp 6 | $(CC) -O3 -fPIC -shared utils_interc.cpp intercept_cudnn.cpp intercept_cublas.cpp intercept_temp.cpp -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread -o libinttemp.so 7 | 8 | all: 9 | make libinttemp.so 10 | 11 | clean: 12 | rm -rf *.o libinttemp.so 13 | -------------------------------------------------------------------------------- /src/cuda_capture/README.md: -------------------------------------------------------------------------------- 1 | ### Basic library to capture CUDA calls 2 | 3 | This captures only cudaLaunchKernel and cudaMalloc for now. It is also applicable for PyTorch programs. 4 | 5 | ### Compile 6 | 7 | make all 8 | 9 | ### Run 10 | 11 | LD_PRELOAD="./libinttemp.so" python3 your_program.py 12 | 13 | -------------------------------------------------------------------------------- /src/cuda_capture/intercept_cublas.cpp: -------------------------------------------------------------------------------- 1 | /* Intercepts and overwrites CUBLAS calls */ 2 | 3 | #include "intercept_temp.h" 4 | 5 | cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc) { 6 | 7 | int idx = get_idx(); 8 | assert (idx >= 0); 9 | cublasStatus_t status = CUBLAS_STATUS_SUCCESS; 10 | 11 | cublasSgemm_record blassgemm_record = { 12 | handle, 13 | transa, 14 | transb, 15 | m, 16 | n, 17 | k, 18 | alpha, 19 | A, 20 | lda, 21 | B, 22 | ldb, 23 | beta, 24 | C, 25 | ldc 26 | }; 27 | 28 | union func_data new_func_data; 29 | new_func_data.cublasSgemmRecord = blassgemm_record; 30 | func_record new_record = {CUBLAS_SGEMM_RECORD, new_func_data}; 31 | 32 | if (idx < *num_total_clients) { 33 | 34 | pthread_mutex_lock(mutexes[idx]); 35 | DEBUG_PRINT("[INTERCEPTER-CATCH]-[%d] Caught cublasSgemm_v2, handle is %p, index %d, m is %d, n is %d, k is %d\n", func_indexes[idx], handle, idx, m, n, k); 36 | kqueues[idx]->push(new_record); 37 | func_indexes[idx] += 1; 38 | pthread_mutex_unlock(mutexes[idx]); 39 | 40 | block(idx, mutexes, kqueues); 41 | } 42 | else { 43 | 44 | if (cublas_sgemm_func==NULL) { 45 | *(void **)(&cublas_sgemm_func) = dlsym(RTLD_NEXT, "cublasSgemm_v2"); 46 | assert(cublas_sgemm_func != NULL); 47 | } 48 | status = (*cublas_sgemm_func)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); 49 | assert (status == CUBLAS_STATUS_SUCCESS); 50 | DEBUG_PRINT("CUBLAS status is %d\n", status); 51 | 52 | } 53 | 54 | return status; 55 | 56 | } 57 | 58 | 59 | 60 | cublasStatus_t cublasSgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float
*C, int ldc) { 61 | 62 | int idx = get_idx(); 63 | assert (idx >= 0); 64 | cublasStatus_t status = CUBLAS_STATUS_SUCCESS; 65 | 66 | cublasSgemm_record blassgemm_record = { 67 | handle, 68 | transa, 69 | transb, 70 | m, 71 | n, 72 | k, 73 | alpha, 74 | A, 75 | lda, 76 | B, 77 | ldb, 78 | beta, 79 | C, 80 | ldc 81 | }; 82 | 83 | union func_data new_func_data; 84 | new_func_data.cublasSgemmRecord = blassgemm_record; 85 | func_record new_record = {CUBLAS_SGEMM_RECORD, new_func_data}; 86 | 87 | if (idx < *num_total_clients) { 88 | 89 | pthread_mutex_lock(mutexes[idx]); 90 | DEBUG_PRINT("[INTERCEPTER-CATCH]-[%d] Caught cublasSgemm, handle is %p, index %d, m is %d, n is %d, k is %d\n", func_indexes[idx], handle, idx, m, n, k); 91 | kqueues[idx]->push(new_record); 92 | func_indexes[idx] += 1; 93 | pthread_mutex_unlock(mutexes[idx]); 94 | 95 | block(idx, mutexes, kqueues); 96 | } 97 | else { 98 | 99 | if (cublas_sgemm_func==NULL) { 100 | *(void **)(&cublas_sgemm_func) = dlsym(RTLD_NEXT, "cublasSgemm_v2"); 101 | assert(cublas_sgemm_func != NULL); 102 | } 103 | status = (*cublas_sgemm_func)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); 104 | assert (status == CUBLAS_STATUS_SUCCESS); 105 | DEBUG_PRINT("CUBLAS status is %d\n", status); 106 | 107 | } 108 | 109 | return status; 110 | 111 | } 112 | 113 | 114 | cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, long long int strideA, const float *B, int ldb, long long int strideB, const float *beta, float *C, int ldc, long long int strideC, int batchCount) { 115 | 116 | int idx = get_idx(); 117 | assert (idx >= 0); 118 | cublasStatus_t status = CUBLAS_STATUS_SUCCESS; 119 | 120 | cublasSgemmStridedBatched_record record = { 121 | handle, 122 | transa, 123 | transb, 124 | m, 125 | n, 126 | k, 127 | alpha, 128 | A, 129 | lda, 130 | strideA, 131 | B, 132 | ldb, 133 | strideB, 134 | beta, 135 | C, 136 | ldc, 137 | strideC, 138 | batchCount 139 | }; 140 | 141 | union func_data new_func_data; 142 | new_func_data.cublasSgemmStridedRecord = record; 143 | func_record new_record = {CUBLAS_SGEMM_STRIDED_RECORD, new_func_data}; 144 | 145 | if (idx < *num_total_clients) { 146 | 147 | pthread_mutex_lock(mutexes[idx]); 148 | DEBUG_PRINT("[INTERCEPTER-CATCH]-[%d] Caught cublasSgemmStridedBatched, handle is %p\n", func_indexes[idx], handle); 149 | kqueues[idx]->push(new_record); 150 | func_indexes[idx] += 1; 151 | pthread_mutex_unlock(mutexes[idx]); 152 | 153 | block(idx, mutexes, kqueues); 154 | 155 | } 156 | else { 157 | 158 | if (cublas_sgemm_strided_func==NULL) { 159 | *(void **)(&cublas_sgemm_strided_func) = dlsym(RTLD_NEXT, "cublasSgemmStridedBatched"); 160 | assert(cublas_sgemm_strided_func != NULL); 161 | } 162 | 163 | status = (*cublas_sgemm_strided_func)(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); 164 | assert (status == CUBLAS_STATUS_SUCCESS); 165 | DEBUG_PRINT("CUBLAS status is %d\n", status); 166 | 167 | } 168 | 169 | return status; 170 | } 171 | 172 | cublasStatus_t cublasDestroy(cublasHandle_t handle) { 173 | 174 | DEBUG_PRINT("Caught a cublasDestroy! 
Do nothing!\n"); 175 | return CUBLAS_STATUS_SUCCESS; 176 | } 177 | -------------------------------------------------------------------------------- /src/cuda_capture/utils_interc.cpp: -------------------------------------------------------------------------------- 1 | #include "intercept_temp.h" 2 | 3 | int get_idx() { 4 | 5 | // Each client thread has a unique ID in the scheduler. 6 | // Based on the thread id that is captured, find the proper index 7 | 8 | #ifdef SYS_gettid 9 | pid_t tid = syscall(SYS_gettid); 10 | #else 11 | #error "SYS_gettid unavailable on this system" 12 | #endif 13 | 14 | 15 | int idx = -1; 16 | int clients = *num_total_clients; 17 | int num_tids = 2*clients+1; 18 | 19 | // look this tid up in the table of registered thread ids 20 | for (int i=0; i<num_tids; i++) { 21 | if (thread_ids[i] == tid) { 22 | idx = i; 23 | break; 24 | } 25 | } 26 | 27 | // pin each registered thread to its own core the first time it is seen 28 | if (idx > -1 && !affinity_set[idx]) { 29 | cpu_set_t mask; 30 | CPU_ZERO(&mask); 31 | CPU_SET(idx+offset, &mask); 32 | int result = sched_setaffinity(0, sizeof(mask), &mask); 33 | assert (result==0); 34 | affinity_set[idx] = true; 35 | } 36 | return idx; 37 | } 38 | 39 | void block(int idx, pthread_mutex_t** mutexes, queue<func_record>** kqueues) { 40 | 41 | // make sure all pending operations have completed 42 | while (1) { 43 | pthread_mutex_lock(mutexes[idx]); 44 | volatile int sz = kqueues[idx]->size(); 45 | pthread_mutex_unlock(mutexes[idx]); 46 | if (sz==0) 47 | break; 48 | } 49 | 50 | } -------------------------------------------------------------------------------- /src/scheduler/Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | NVCC=/usr/local/cuda-10.2/bin/nvcc 3 | CFLAGS=-O3 4 | CUDAINCLUDE=/usr/local/cuda-10.2/include/ 5 | CUDALIB=/usr/local/cuda-10.2/lib64 6 | 7 | utils_sched.o: utils_sched.cpp 8 | $(NVCC) $(CFLAGS) -Xcompiler -fPIC -x cu -shared -c utils_sched.cpp -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread 9 | 10 | scheduler.o: scheduler.cpp 11 | $(NVCC) $(CFLAGS) -Xcompiler -fPIC -x cu -shared -c scheduler.cpp -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread 12 | 13 | scheduler_eval.o: scheduler_eval.cpp 14 | $(NVCC) $(CFLAGS) -Xcompiler -fPIC -x cu -shared -c scheduler_eval.cpp -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread 15 | 16 | scheduler_eval.so: scheduler_eval.o utils_sched.o 17 | $(CC) $(CFLAGS) -fPIC -shared utils_sched.o scheduler_eval.o -I$(CUDAINCLUDE) -L$(CUDALIB) -lcudart -lpthread -o scheduler_eval.so 18 | 19 | all: 20 | make scheduler_eval.so 21 | 22 | clean: 23 | rm -rf *.o *.so 24 | -------------------------------------------------------------------------------- /src/scheduler/scheduler.h: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | //#include <cstdint> 3 | #include <stdlib.h> 4 | #include <string.h> 5 | #include <pthread.h> 6 | #include <dlfcn.h> 7 | #include <unistd.h> 8 | #include <queue> 9 | #include <vector> 10 | #include <cuda_runtime.h> 11 | #include <cudnn.h> 12 | 13 | #include "utils_sched.h" 14 | 15 | //void* sched_func(void* args); 16 | 17 | class Scheduler { 18 | 19 | public: 20 | void profile_prep(queue<func_record>** qbuffers, int num_clients, bool reef); 21 | void profile_reset(int num_clients); 22 | void* busy_wait_profile(int num_clients, int iter, bool warmup, int warmup_iters, bool reef, bool seq, int depth, int hp_limit, int update_start); 23 | void schedule_reef(vector<func_record*> frecords, int num_clients, int depth); 24 | int schedule_sequential(vector<func_record*> frecords, int num_clients, int start); 25 | 26 | }; 27 | 28 | //void* sched_func(void* sched); 29 | //Scheduler* sched_init(); -------------------------------------------------------------------------------- /src/scheduler_frontend.py:
-------------------------------------------------------------------------------- 1 | import ctypes 2 | from ctypes import * 3 | import torch 4 | import numpy as np 5 | import os 6 | import time 7 | 8 | class PyScheduler: 9 | 10 | def __init__(self, sched_lib, num_clients): 11 | 12 | torch.cuda.set_device(0) 13 | self._scheduler = sched_lib.sched_init() 14 | self._sched_lib = sched_lib 15 | self._num_clients = num_clients 16 | 17 | def run_scheduler( 18 | self, 19 | barriers, 20 | tids, 21 | model_names, 22 | kernel_files, 23 | additional_kernel_files, 24 | num_kernels, 25 | additional_num_kernels, 26 | num_iters, 27 | profile, 28 | run_eval, 29 | reef, 30 | sequential, 31 | reef_depth, 32 | hp_limit, 33 | update_start, 34 | train 35 | ): 36 | 37 | print(f"REEF IS {reef}, SEQUENTIAL IS {sequential}") 38 | 39 | model_names_ctypes = [x.encode('utf-8') for x in model_names] 40 | lib_names = [x.encode('utf-8') for x in kernel_files] 41 | 42 | # convert 43 | IntAr = c_int * self._num_clients 44 | tids_ar = IntAr(*tids) 45 | num_kernels_ar = IntAr(*num_kernels) 46 | num_iters_ar = IntAr(*num_iters) 47 | 48 | CharAr = c_char_p * self._num_clients 49 | model_names_ctypes_ar = CharAr(*model_names_ctypes) 50 | lib_names_ar = CharAr(*lib_names) 51 | 52 | BoolAr = c_bool * self._num_clients 53 | train_ar = BoolAr(*train) 54 | 55 | print(train) 56 | self._sched_lib.argtypes = [c_void_p, c_int, POINTER(c_int), POINTER(c_char_p), POINTER(c_char_p), POINTER(c_int), POINTER(c_bool)] 57 | 58 | print(model_names, lib_names, tids) 59 | 60 | self._sched_lib.setup(self._scheduler, self._num_clients, tids_ar, model_names_ctypes_ar, lib_names_ar, num_kernels_ar, num_iters_ar, train_ar, reef) 61 | 62 | num_clients = len(tids) 63 | print(f"Num clients is {num_clients}") 64 | 65 | print(f"before starting, profile is {profile}") 66 | timings=[] 67 | 68 | if run_eval: 69 | if profile: 70 | barriers[0].wait() 71 | # run once to warm-up and setup 72 | self._sched_lib.schedule(self._scheduler, num_clients, True, 0, True, 1, reef, sequential, reef_depth, hp_limit, update_start) 73 | torch.cuda.synchronize() 74 | 75 | for j in range(num_clients): 76 | if (additional_kernel_files[j] is not None): 77 | new_kernel_file = additional_kernel_files[j].encode('utf-8') 78 | self._sched_lib.setup_change(self._scheduler, j, new_kernel_file, additional_num_kernels[j]) 79 | 80 | print("wait here") 81 | barriers[0].wait() #FIXME 82 | print("done!") 83 | 84 | # warmup 85 | self._sched_lib.schedule(self._scheduler, num_clients, True, 0, True, 10, reef, sequential, reef_depth, hp_limit, update_start) 86 | torch.cuda.synchronize() 87 | barriers[0].wait() 88 | 89 | start = time.time() 90 | print("call schedule") 91 | self._sched_lib.schedule(self._scheduler, num_clients, True, 0, False, 0, reef, sequential, reef_depth, hp_limit, update_start) 92 | barriers[0].wait() 93 | torch.cuda.synchronize() 94 | print(f"Total time is {time.time()-start}") 95 | 96 | else: 97 | for i in range(num_iters[0]): 98 | 99 | print(f"Start {i} iteration") 100 | if profile: 101 | barriers[0].wait() 102 | # needed for backward 103 | if (i==1): 104 | for j in range(num_clients): 105 | if (additional_kernel_files[j] is not None): 106 | new_kernel_file = additional_kernel_files[j].encode('utf-8') 107 | self._sched_lib.setup_change(self._scheduler, j, new_kernel_file, additional_num_kernels[j]) 108 | barriers[0].wait() #FIXME 109 | 110 | start = time.time() 111 | print("call schedule") 112 | self._sched_lib.schedule(self._scheduler, num_clients, True, i) 113 | 
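# wait for every kernel scheduled in this iteration to drain before reading the timer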
torch.cuda.synchronize() 114 | 115 | # alternative path: schedule each client's queue one at a time 116 | else: 117 | start = time.time() 118 | for j in range(num_clients): 119 | barriers[j].wait() 120 | self._sched_lib.schedule_one(self._scheduler, j) 121 | torch.cuda.synchronize() 122 | 123 | total_time = time.time()-start 124 | print(f"Iteration {i} took {total_time} sec") 125 | timings.append(total_time) 126 | timings = timings[3:] # drop the first three iterations as warm-up 127 | print(f"Median is {np.median(np.asarray(timings))} sec, Min is {min(timings)} sec") 128 | -------------------------------------------------------------------------------- /src/system_utils.h: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | 3 | // DEBUG_PRINT expands to fprintf(stdout, ...) only when compiled with -DDEBUG; otherwise it is a no-op 4 | #ifdef DEBUG 5 | # define DEBUG_PRINT(...) fprintf(stdout, __VA_ARGS__) 6 | #else 7 | # define DEBUG_PRINT(...) do {} while (0) 8 | #endif 9 | 10 | using namespace std; --------------------------------------------------------------------------------