├── CMakeLists.txt ├── LICENSE ├── README.md ├── app ├── CMakeLists.txt └── client.cpp ├── experiments ├── fig2 │ ├── Makefile │ ├── README.md │ ├── hol.cu │ ├── plot.py │ ├── plot_qlen.py │ ├── plot_ts.py │ └── run.py └── triton │ ├── README.md │ └── server.sh ├── include └── llis │ ├── client │ ├── client.h │ ├── io_shm_entry.h │ ├── job_instance_ref.h │ ├── job_ref.h │ └── profiler_client.h │ ├── ipc │ ├── atomic_lock.h │ ├── atomic_wrapper.h │ ├── defs.h │ ├── name_format.h │ ├── shm_channel.h │ ├── shm_channel_impl.h │ ├── shm_primitive_channel.h │ ├── shm_primitive_channel_impl.h │ ├── threadfence_wrapper.h │ └── unix_datagram_socket.h │ ├── job │ ├── context.h │ ├── coroutine_job.h │ ├── finished_block_notifier.h │ ├── instrument.h │ ├── instrument_info.h │ ├── job.h │ └── utils.h │ ├── server │ ├── client_connection.h │ ├── gpu_resources.h │ ├── profiler.h │ ├── registered_job.h │ ├── scheduler.h │ ├── scheduler_fifo.h │ ├── scheduler_fifo2.h │ ├── scheduler_full.h │ ├── scheduler_full2.h │ ├── scheduler_full3.h │ ├── server.h │ └── sm_resources.h │ └── utils │ ├── align.h │ ├── error.h │ ├── gpu.h │ ├── logging.hh │ ├── ops.h │ ├── path.h │ └── time.hh ├── jobs ├── CMakeLists.txt ├── cnn │ ├── CMakeLists.txt │ ├── layer.cu │ ├── layer.h │ ├── main.cu │ └── mnist.h ├── dummy_10 │ ├── CMakeLists.txt │ └── dummy.cu ├── dummy_11 │ ├── CMakeLists.txt │ └── dummy.cu ├── dummy_20 │ ├── CMakeLists.txt │ └── dummy.cu ├── dummy_21 │ ├── CMakeLists.txt │ └── dummy.cu ├── dummy_long │ ├── CMakeLists.txt │ └── dummy_long.cu ├── dummy_short │ ├── CMakeLists.txt │ └── dummy_short.cu ├── helloworld │ ├── CMakeLists.txt │ └── helloworld.cu ├── helloworld_coroutine │ ├── CMakeLists.txt │ └── helloworld_coroutine.cu ├── run_forever │ ├── CMakeLists.txt │ └── run_forever.cu ├── tvm_arcfaceresnet100 │ ├── CMakeLists.txt │ └── tvm_arcfaceresnet100.cpp ├── tvm_densenet121 │ ├── CMakeLists.txt │ └── tvm_densenet121.cpp ├── tvm_googlenet │ ├── CMakeLists.txt │ └── tvm_googlenet.cpp ├── tvm_inception_v3 │ ├── CMakeLists.txt │ └── tvm_inception_v3.cpp ├── tvm_mnist │ ├── CMakeLists.txt │ └── tvm_mnist.cpp ├── tvm_mobilenet │ ├── CMakeLists.txt │ └── tvm_mobilenet.cpp ├── tvm_resnet18 │ ├── CMakeLists.txt │ └── tvm_resnet18.cpp ├── tvm_resnet34 │ ├── CMakeLists.txt │ └── tvm_resnet34.cpp ├── tvm_resnet50 │ ├── CMakeLists.txt │ └── tvm_resnet50.cpp ├── tvm_squeezenet1_1 │ ├── CMakeLists.txt │ └── tvm_squeezenet1_1.cpp ├── tvm_ultraface320 │ ├── CMakeLists.txt │ └── tvm_ultraface320.cpp └── vec_add_coroutine │ ├── CMakeLists.txt │ └── vec_add_coroutine.cu ├── sosp23_artifact ├── README.md ├── gen_data_fig11_cuda.sh ├── gen_data_fig11_paella.sh ├── gen_data_fig11_triton.sh ├── gen_data_fig12_cuda.sh ├── gen_data_fig12_mps.sh ├── gen_data_fig12_paella.sh ├── gen_data_fig13.sh ├── plot_fig11.sh ├── plot_fig12.sh ├── plot_fig13.sh ├── setup │ ├── README.md │ ├── build_triton_docker.sh │ ├── dso_to_tf.py │ ├── install_dependencies.sh │ ├── install_llis_tvm.sh │ ├── install_triton_client.sh │ ├── onnx2tvm.py │ ├── onnx2tvm_all.sh │ ├── reset_all.sh │ ├── reset_llis_tvm.sh │ └── triton_docker │ │ ├── Dockerfile │ │ ├── build_tvm_tf.sh │ │ ├── convert_tvm_to_tf.sh │ │ ├── run_on_tf_docker.sh │ │ └── setup.sh ├── tf_models_config │ ├── densenet-9 │ │ └── config.pbtxt │ ├── googlenet-9 │ │ └── config.pbtxt │ ├── inception_v3 │ │ └── config.pbtxt │ ├── mobilenetv2-7 │ │ └── config.pbtxt │ ├── resnet18-v2-7 │ │ └── config.pbtxt │ ├── resnet34-v2-7 │ │ └── config.pbtxt │ ├── resnet50-v2-7 │ │ └── config.pbtxt │ └── 
squeezenet1.1-7 │ │ └── config.pbtxt ├── tools │ ├── merge_mps_results.py │ ├── parse_input_kelvin.py │ ├── parse_triton.py │ ├── plot_latency_fairness_threshold.py │ └── plot_latency_throughput_subplots.py ├── triton_server_launch.sh └── tvm_models_dim │ ├── densenet-9-cuda-pack.so.dim │ ├── googlenet-9-cuda-pack.so.dim │ ├── inception_v3-cuda-pack.so.dim │ ├── mnist-8-cuda-pack.so.dim │ ├── mobilenetv2-7-cuda-pack.so.dim │ ├── resnet18-v2-7-cuda-pack.so.dim │ ├── resnet34-v2-7-cuda-pack.so.dim │ ├── resnet50-v2-7-cuda-pack.so.dim │ └── squeezenet1.1-7-cuda-pack.so.dim ├── src ├── CMakeLists.txt ├── client │ ├── CMakeLists.txt │ ├── client.cpp │ ├── job_instance_ref.cpp │ ├── job_ref.cpp │ └── profiler_client.cpp ├── ipc │ ├── CMakeLists.txt │ ├── name_format.cpp │ ├── shm_channel.cu │ ├── shm_primitive_channel.cu │ └── unix_datagram_socket.cpp ├── job │ ├── CMakeLists.txt │ ├── context.cpp │ ├── finished_block_notifier.cu │ └── utils.cu └── server │ ├── CMakeLists.txt │ ├── client_connection.cpp │ ├── gpu_resources.cpp │ ├── profiler.cpp │ ├── registered_job.cpp │ ├── scheduler.cpp │ ├── scheduler_fifo.cpp │ ├── scheduler_fifo2.cpp │ ├── scheduler_full.cpp │ ├── scheduler_full2.cpp │ ├── scheduler_full3.cpp │ ├── server.cpp │ └── sm_resources.cpp ├── tests ├── CMakeLists.txt ├── client │ ├── CMakeLists.txt │ ├── client.cpp │ ├── client_concurrent_run_latencies.cpp │ ├── client_concurrent_run_latencies_set_load.cpp │ ├── client_concurrent_run_latencies_set_load_multi.cpp │ ├── client_concurrent_runs.cpp │ ├── client_single_latency.cpp │ └── raw_kernel_launch.cu ├── ipc │ ├── CMakeLists.txt │ └── shm_channel │ │ ├── CMakeLists.txt │ │ ├── latency │ │ ├── CMakeLists.txt │ │ ├── shmc_latency_read.cpp │ │ ├── shmc_latency_read_bare_atomic.cpp │ │ ├── shmc_latency_read_bare_atomic_loop.cpp │ │ ├── shmc_latency_read_loop.cpp │ │ ├── shmc_latency_write.cpp │ │ ├── shmc_latency_write_bare_atomic.cpp │ │ ├── shmc_latency_write_bare_atomic_loop.cpp │ │ ├── shmc_latency_write_loop.cpp │ │ └── shmpc_latency_gpu.cu │ │ ├── shmc_read.cpp │ │ ├── shmc_read_write.cpp │ │ ├── shmc_read_write_cpu_gpu.cu │ │ ├── shmc_read_write_same_proc.cpp │ │ └── shmc_write.cpp ├── simple │ ├── CMakeLists.txt │ ├── cuda_callback_benchmark.cu │ ├── cuda_sync_benchmark.cu │ ├── mmap_mlock_limit.cpp │ ├── tvm_direct_concurrent.cpp │ └── tvm_direct_multistream.cpp └── utils │ ├── CMakeLists.txt │ └── workload_pregen.cpp └── tools ├── calculate_jains_fairness_index.py ├── calculate_overhead_stack.py ├── cloudlab_setup.sh ├── parse_clockwork.py ├── parse_input_kelvin.py ├── parse_triton.py ├── plot_all_no_mnist.sh ├── plot_block_exec_times_cdf.py ├── plot_latency_fairness_threshold.py ├── plot_latency_throughput.py ├── plot_latency_throughput_subplots.py ├── plot_overhead_stack_graph.py ├── plot_resnet18_inception_v3.sh ├── run_all.sh ├── run_all_direct.sh ├── run_all_direct_multistream.sh ├── run_fairness_dummy.sh ├── run_fairness_mnist_inception_v3.sh ├── run_inception_v3.sh ├── run_mnist.sh ├── run_mnist_googlenet.sh ├── run_mnist_inception_v3.sh ├── run_mnist_resnet50.sh ├── run_mnist_sched_sleep.sh ├── run_mobilenet.sh ├── run_mobilenet_inception_v3.sh └── run_ultraface_arcface.sh /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(llis LANGUAGES CXX CUDA) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | cmake_policy(SET CMP0105 NEW) 8 | 9 | find_package(CUDAToolkit) 10 | 11 | include_directories(${CUDAToolkit_INCLUDE_DIRS}) 12 
| 13 | find_package(Boost REQUIRED system context program_options) 14 | 15 | find_package(spdlog REQUIRED) 16 | 17 | option(TVM_INCLUDE_DIR "Headers directory of TVM") 18 | 19 | find_package(tvm) 20 | if(tvm_FOUND) 21 | if(TVM_INCLUDE_DIR) 22 | include_directories(PUBLIC ${TVM_INCLUDE_DIR}) 23 | endif(TVM_INCLUDE_DIR) 24 | endif(tvm_FOUND) 25 | 26 | option(MEASURE_BLOCK_TIME "Enable measurement of block time" OFF) 27 | if(MEASURE_BLOCK_TIME) 28 | add_definitions(-DLLIS_MEASURE_BLOCK_TIME) 29 | endif(MEASURE_BLOCK_TIME) 30 | 31 | option(FINISHED_BLOCK_NOTIFICATION_AGG "Aggregate block notifications" ON) 32 | if(FINISHED_BLOCK_NOTIFICATION_AGG) 33 | add_definitions(-DLLIS_FINISHED_BLOCK_NOTIFICATION_AGG) 34 | endif(FINISHED_BLOCK_NOTIFICATION_AGG) 35 | 36 | option(ENABLE_PROFILER "Enable profiler" OFF) 37 | if(ENABLE_PROFILER) 38 | add_definitions(-DLLIS_ENABLE_PROFILER) 39 | endif(ENABLE_PROFILER) 40 | 41 | option(PRINT_LAUNCH_JOB_IPC_LATENCY "Print launch job IPC latency" OFF) 42 | if(PRINT_LAUNCH_JOB_IPC_LATENCY) 43 | add_definitions(-DPRINT_LAUNCH_JOB_IPC_LATENCY) 44 | endif(PRINT_LAUNCH_JOB_IPC_LATENCY) 45 | 46 | set(CMAKE_INSTALL_RPATH $ORIGIN $ORIGIN/../lib) 47 | 48 | include_directories(include) 49 | 50 | add_subdirectory(src) 51 | add_subdirectory(app) 52 | add_subdirectory(tests) 53 | add_subdirectory(jobs) 54 | 55 | install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/ DESTINATION include) 56 | 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 University of Pennsylvania | Distributed Systems Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Paella / LLIS 2 | 3 | This project was called LLIS at the very beginning, and so this name is used in the codebase. 4 | 5 | ## SOSP 2023 Artifact Evaluation 6 | 7 | Please refer to the [instructions](sosp23_artifact/README.md) in the `sosp23_artifact/` directory. 8 | 9 | ## Dependencies 10 | 11 | 1. Linux (tested on Ubuntu 22.04) 12 | 1. NVIDIA driver (tested on 535.54.03) 13 | 1. CUDA runtime (tested on 12.2.0) 14 | 1. GCC (tested on 11.3.0) 15 | 1. CMake (tested on 3.22.1) 16 | 1. Boost (tested on 1.82.0) 17 | 1. LLVM / Clang (tested on 14) 18 | 1. 
spdlog (tested on 1.11.0; 1.12.0 is known not to work)
19 | 1. [**tvm-llis**](https://github.com/eniac/tvm-llis) (Custom version of TVM modified to work with Paella)
20 | 
21 | ## Installation
22 | 
23 | ### Paella/LLIS server and libraries
24 | 
25 | ```
26 | mkdir build
27 | cd build
28 | cmake -DCMAKE_BUILD_TYPE=<build_type> -DCMAKE_CUDA_ARCHITECTURES=<cuda_arch> .. # e.g., Release; cuda_arch is 60 for 6.0, 75 for 7.5, etc
29 | make -j$(nproc) install
30 | ```
31 | 
32 | ### Custom TVM (tvm-llis)
33 | 
34 | The custom TVM depends on the Paella/LLIS libraries, so it can only be built after completing the previous step.
35 | 
36 | Please refer to [README-llis.md](https://github.com/eniac/tvm-llis/blob/v0.10.0-llis/README-llis.md) of [tvm-llis](https://github.com/eniac/tvm-llis) for instructions.
37 | 
38 | ### Paella/LLIS applications (e.g., client) and job adapters
39 | 
40 | Applications and job adapters depend on the custom TVM, so they can only be built after completing the previous step.
41 | 
42 | ```
43 | cmake .. -Utvm_FOUND # Find TVM again after we have installed it
44 | make -j$(nproc) install
45 | ```
46 | 
47 | 
-------------------------------------------------------------------------------- /app/CMakeLists.txt: --------------------------------------------------------------------------------
1 | add_executable(llis_app_client client.cpp $ $)
2 | target_link_libraries(llis_app_client spdlog::spdlog dl rt)
3 | install(TARGETS llis_app_client DESTINATION bin)
4 | 
-------------------------------------------------------------------------------- /experiments/fig2/Makefile: --------------------------------------------------------------------------------
1 | all:
2 | 	nvcc hol.cu -o fig2 -O3 -arch=sm_75 -Xptxas=-v
3 | 
4 | reveng: reverse_eng.cu
5 | 	nvcc reverse_eng.cu -o reveng -O3 -arch=sm_75 -Xptxas=-v
6 | 
-------------------------------------------------------------------------------- /experiments/fig2/plot_qlen.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | import os
3 | import re
4 | import numpy as np
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | import argparse
8 | 
9 | expected_jct = 316 * 8 # in ns
10 | 
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('exp_labels', nargs='+', type=str)
13 | args = parser.parse_args()
14 | 
15 | for exp_label in args.exp_labels:
16 |     print(f'Plotting data for {exp_label}')
17 |     re_string = rf"^{exp_label}-\d+-qlen-results\.csv$"  # raw string so \d is a regex class, not a string escape
18 |     valid = re.compile(re_string)
19 |     result_files = [f for f in os.listdir(os.curdir) if os.path.isfile(f) and re.match(valid, f)]
20 |     print(result_files)
21 |     for f in result_files:
22 |         df = pd.read_csv(f, delimiter='\t')
23 |         plt.plot(df.TIME, df.QLEN, label=f.split('qlen')[0].split('-')[-2])
24 | 
25 | 
26 | plt.legend()
27 | plt.xlabel('Sending time (seconds)')
28 | plt.ylabel('Queue length')
29 | 
30 | fname = f"{'-'.join(args.exp_labels)}-qlen.pdf"
31 | print(f'Storing plot in {fname}')
32 | plt.savefig(fname)
33 | 
-------------------------------------------------------------------------------- /experiments/fig2/plot_ts.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | 
3 | import os
4 | import re
5 | import numpy as np
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 | import argparse
9 | 
10 | pd.set_option('display.max_rows', 500)
11 | 
12 | expected_jct = 316 * 8 # in us
13 | 
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('csvfile', type=str)
16 | parser.add_argument('-i', '--ideal-jct', type=int, help='Ideal JCT in us', default=expected_jct)
17 | args = parser.parse_args()
18 | 
19 | df = pd.read_csv(args.csvfile, delimiter='\t')
20 | print(df)
21 | 
22 | plt.plot(df.index, df.JCT, '.', label='jobs latency')
23 | 
24 | if args.ideal_jct > 0:
25 |     plt.plot(df.index, [args.ideal_jct for i in range(df.shape[0])], label='ideal job latency')
26 | 
27 | plt.legend()
28 | plt.ylim(0)
29 | plt.xlabel('Job index')
30 | plt.ylabel('Latency (us)')
31 | fname = f'{args.csvfile}.pdf'
32 | print(f'Storing plot in {fname}')
33 | plt.savefig(fname)
34 | 
-------------------------------------------------------------------------------- /experiments/fig2/run.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | 
3 | import os
4 | import subprocess
5 | import argparse
6 | 
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('exp_label', type=str)
9 | parser.add_argument('mode', type=str)
10 | parser.add_argument('--num_hwq', type=int, default=32)
11 | parser.add_argument('--iterate-hwq', action='store_true', default=False)
12 | args = parser.parse_args()
13 | 
14 | def run_over_load(n_hwq: int):
15 |     for i in range(100, 2000, 200):
16 |         label = f'{args.exp_label}-{n_hwq}hwq'
17 |         cmd_args = ['./fig2', args.mode, label, str(1e9/i)] # interval in ns
18 |         p = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=dict(os.environ, CUDA_DEVICE_MAX_CONNECTIONS=str(n_hwq)))
19 |         while True:
20 |             line = p.stdout.readline()
21 |             print(line.decode('ascii'))
22 |             if not line:
23 |                 break
24 |         p.wait()
25 | 
26 | if args.iterate_hwq:
27 |     n_hwq = 1
28 |     while n_hwq <= 32:
29 |         run_over_load(n_hwq)
30 |         n_hwq *= 2
31 | else:
32 |     run_over_load(args.num_hwq)
33 | 
-------------------------------------------------------------------------------- /experiments/triton/README.md: --------------------------------------------------------------------------------
1 | # Overview
2 | 
3 | We use Triton 23.03 to run our experiments. We configure it to use our TVM, which our models depend on.
4 | We modify Triton's existing sample clients to use the same load-generation logic we use across the paper.
5 | 
6 | Dependencies
7 | ---
8 | - Triton and our models need the same TVM
9 | - Triton and TVM need the same TensorFlow
10 | 
11 | # Server
12 | 
13 | Run `./server.sh [models path] [TVM lib path] [CMake path] [LLIS lib path] [Boost lib path]`. This is the command we ran for our experiments:
14 | ```
15 | ./server.sh /home/maxdml/allis/models /home/maxdml/tvm_tf/ /home/kelvin/opt/cmake-3.22.3/ /home/kelvin/llis/ /home/kelvin/opt/boost-1.74.0/
16 | ```
17 | 
18 | In the container, run:
19 | ```
20 | LD_PRELOAD="/opt/tvm/build/libtvm_dso_op.so /opt/tritonserver/backends/tensorflow2/libtensorflow_cc.so /opt/tritonserver/backends/tensorflow2/libtensorflow_framework.so" tritonserver --model-repository=/models/newmix3/tensorflow --backend-config=tensorflow,version=2 --min-supported-compute-capability=7.5 --allow-grpc=true --backend-config=default-max-batch-size=0
21 | ```
22 | 
23 | # Client
24 | 
25 | We use a custom client built on Triton's client framework.
26 | 
27 | Build
28 | ---
29 | ```
30 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=`pwd`/install -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_GPU=ON -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON ..
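# The cmake invocation above only configures the client build. A minimal
# follow-up sketch (assumption: the standard CMake superbuild workflow of the
# Triton client repository; exact targets vary across releases):
#   make -j$(nproc)
# Built example clients then land under the configured CMAKE_INSTALL_PREFIX
# (./install here).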
31 | ``` 32 | 33 | Running 34 | --- 35 | - We use run.py to run experiments 36 | - run.py takes a config file describing the workload 37 | -------------------------------------------------------------------------------- /experiments/triton/server.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash -ex 2 | 3 | MODELS_PATH=$1 # path to TF-wrapped models 4 | LIBTVM_PATH=$2 # path to our TVM 5 | CMAKE_PATH=$3 # path to our cmake 6 | LLIS_PATH=$4 # path to ALLIS libraries 7 | LIBBOOST_PATH=$5 8 | 9 | docker run -it --gpus=1 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p8000:8000 -p8001:8001 -p8002:8002 -v${MODELS_PATH}:/models -v${LIBTVM_PATH}:/opt/tvm -v${CMAKE_PATH}:/opt/cmake -v${LLIS_PATH}:/opt/allis -v${LIBBOOST_PATH}:/opt/boost nvcr.io/nvidia/tritonserver:latest 10 | -------------------------------------------------------------------------------- /include/llis/client/client.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace llis { 14 | namespace client { 15 | 16 | class Client { 17 | public: 18 | Client(std::string server_name); 19 | ~Client(); 20 | 21 | JobRef register_job(std::string path); 22 | 23 | ClientId get_client_id() const { 24 | return client_id_; 25 | } 26 | 27 | ipc::ShmChannelCpuWriter* get_c2s_channel() { 28 | return &c2s_channel_; 29 | } 30 | 31 | ipc::ShmChannelCpuReader* get_s2c_channel() { 32 | return &s2c_channel_; 33 | } 34 | 35 | void cuda_profiler_start(); 36 | void cuda_profiler_stop(); 37 | 38 | JobInstanceRef* add_job_instance_ref(JobInstanceRef job_instance_ref); 39 | void release_job_instance_ref(JobInstanceRef* job_instance_ref); 40 | 41 | JobInstanceRef* wait(); 42 | 43 | ProfilerClient* get_profiler_client() { 44 | return &profiler_client_; 45 | } 46 | 47 | void kill_server(); 48 | 49 | private: 50 | void generate_client_id(); 51 | void create_s2c_channel(); 52 | void reconnect_s2c_channel(); 53 | void register_client(); 54 | void connect_s2c_socket(); 55 | 56 | std::string server_name_; 57 | 58 | ClientId client_id_; 59 | 60 | std::string s2c_socket_prefix_; 61 | 62 | ipc::ShmChannelCpuWriter c2s_channel_; 63 | ipc::ShmChannelCpuReader s2c_channel_; 64 | ipc::UnixDatagramSocket s2c_socket_; 65 | 66 | std::vector> job_instance_refs_; 67 | std::vector unused_job_instance_refs_; 68 | 69 | std::mutex mtx_; 70 | 71 | ProfilerClient profiler_client_; 72 | }; 73 | 74 | } 75 | } 76 | 77 | -------------------------------------------------------------------------------- /include/llis/client/io_shm_entry.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace llis { 6 | namespace client { 7 | 8 | /* 9 | * ptr: the pointer to the memory in the local address space 10 | * id: identifier that identify the instance of mmap that the piece of memory belongs to 11 | * offset: one mmap can involve multiple MemoryEntry, and the offset denotes the part of the mmap. 
Offset is in bytes 12 | * (id, offset) <=> ptr 13 | */ 14 | struct IoShmEntry { 15 | void* ptr; 16 | int id; 17 | size_t offset; 18 | }; 19 | 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /include/llis/client/job_instance_ref.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace llis { 8 | namespace client { 9 | 10 | class JobRef; 11 | 12 | class JobInstanceRef { 13 | public: 14 | JobInstanceRef(JobRef* job_ref, IoShmEntry io_shm_entry); 15 | ~JobInstanceRef(); 16 | 17 | void launch(); 18 | void release(); 19 | 20 | void* get_input_ptr(); 21 | void* get_output_ptr(); 22 | 23 | void set_id(JobInstanceRefId id); 24 | JobInstanceRefId get_id() const; 25 | 26 | JobRefId get_job_ref_id() const; 27 | 28 | void set_start_time(double time_point); 29 | double get_start_time() const; 30 | 31 | private: 32 | JobRef* job_ref_; 33 | IoShmEntry io_shm_entry_; 34 | 35 | JobInstanceRefId id_; 36 | 37 | ipc::ShmChannelCpuWriter* c2s_channel_; 38 | 39 | double start_time_; 40 | }; 41 | 42 | } 43 | } 44 | 45 | -------------------------------------------------------------------------------- /include/llis/client/job_ref.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace llis { 12 | namespace client { 13 | 14 | class Client; 15 | 16 | class JobRef { 17 | public: 18 | JobRef(std::unique_ptr job, Client* client, std::string path); 19 | ~JobRef(); 20 | 21 | JobRef(const JobRef&) = delete; 22 | JobRef(JobRef&&) = default; 23 | JobRef& operator=(const JobRef&) = delete; 24 | JobRef& operator=(JobRef&&) = default; 25 | 26 | JobInstanceRef* create_instance(); 27 | void release_io_shm_entry(IoShmEntry io_shm_entry); 28 | 29 | job::Job* get_job() { 30 | return job_.get(); 31 | } 32 | 33 | Client* get_client() { 34 | return client_; 35 | } 36 | 37 | ClientId get_client_id() const { 38 | return client_id_; 39 | } 40 | 41 | JobRefId get_job_ref_id() const { 42 | return job_ref_id_; 43 | } 44 | 45 | ipc::ShmChannelCpuReader* get_s2c_channel() { 46 | return s2c_channel_; 47 | } 48 | 49 | ipc::ShmChannelCpuWriter* get_c2s_channel() { 50 | return c2s_channel_; 51 | } 52 | 53 | private: 54 | void register_job(); 55 | 56 | void grow_pool(size_t least_num_new_entries); 57 | void grow_pool(); 58 | 59 | std::unique_ptr job_; 60 | Client* client_; 61 | std::string model_path_; 62 | 63 | ipc::ShmChannelCpuReader* s2c_channel_; 64 | ipc::ShmChannelCpuWriter* c2s_channel_; 65 | ClientId client_id_; 66 | 67 | size_t pinned_mem_size_; 68 | size_t param_size_; 69 | 70 | size_t pool_size_ = 0; // number of concurrent instances that can be supported 71 | size_t pool_size_in_bytes_ = 0; // number of bytes of the pool 72 | 73 | std::vector pinned_mem_list_; 74 | std::vector pinned_mem_free_list_; 75 | 76 | std::vector param_mem_list_; 77 | std::vector param_mem_free_list_; 78 | 79 | std::string shm_name_; 80 | int shm_fd_; 81 | 82 | JobRefId job_ref_id_; 83 | }; 84 | 85 | } 86 | } 87 | 88 | -------------------------------------------------------------------------------- /include/llis/client/profiler_client.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace llis { 8 | namespace client { 9 | 10 | class ProfilerClient { 11 | 
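// Reading of the code (not authoritative documentation): each set_*/unset_*
// method below sends the matching ProfilerMsgType command from llis/ipc/defs.h
// to the server through the client-to-server channel, and save() asks the
// server-side profiler to write its collected records to the given path.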
public: 12 | ProfilerClient(ipc::ShmChannelCpuWriter* c2s_channel) : c2s_channel_(c2s_channel) {} 13 | 14 | void set_record_kernel_info(); 15 | void unset_record_kernel_info(); 16 | 17 | void set_record_block_exec_time(); 18 | void unset_record_block_exec_time(); 19 | 20 | void set_record_kernel_block_mis_alloc(); 21 | void unset_record_kernel_block_mis_alloc(); 22 | 23 | void set_record_run_next_times(); 24 | void unset_record_run_next_times(); 25 | 26 | void set_record_job_events(); 27 | void unset_record_job_events(); 28 | 29 | void set_record_resource_events(); 30 | void unset_record_resource_events(); 31 | 32 | void save(const std::string& path); 33 | 34 | private: 35 | ipc::ShmChannelCpuWriter* c2s_channel_; 36 | }; 37 | 38 | } 39 | } 40 | 41 | -------------------------------------------------------------------------------- /include/llis/ipc/atomic_lock.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | template 8 | class AtomicLock {}; 9 | 10 | template <> 11 | class AtomicLock { 12 | public: 13 | inline void acquire() { 14 | while (val_.test_and_set(std::memory_order_acquire)); 15 | } 16 | 17 | inline void release() { 18 | val_.clear(std::memory_order_release); 19 | } 20 | 21 | inline void init() { 22 | release(); 23 | } 24 | 25 | private: 26 | std::atomic_flag val_; 27 | }; 28 | 29 | template <> 30 | class AtomicLock { 31 | public: 32 | CUDA_HOSTDEV inline void acquire() { 33 | #ifdef __CUDA_ARCH__ 34 | while (atomicOr(&val_gpu_, 1)); 35 | #else 36 | while (val_cpu_.test_and_set(std::memory_order_acquire)); 37 | #endif 38 | } 39 | 40 | CUDA_HOSTDEV inline void release() { 41 | #ifdef __CUDA_ARCH__ 42 | val_gpu_ = 0; 43 | #else 44 | val_cpu_.clear(std::memory_order_release); 45 | #endif 46 | } 47 | 48 | inline void init() { 49 | *reinterpret_cast(&val_gpu_) = 0; 50 | } 51 | 52 | private: 53 | union { 54 | unsigned int val_gpu_; 55 | std::atomic_flag val_cpu_; 56 | }; 57 | }; 58 | 59 | 60 | -------------------------------------------------------------------------------- /include/llis/ipc/atomic_wrapper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | template 8 | class AtomicWrapper {}; 9 | 10 | template 11 | class AtomicWrapper { 12 | public: 13 | inline T load() const { 14 | return val_.load(std::memory_order_relaxed); 15 | } 16 | 17 | inline void store(T desired) { 18 | val_.store(desired, std::memory_order_relaxed); 19 | } 20 | 21 | inline void add(T val) { 22 | val_.fetch_add(val, std::memory_order_relaxed); 23 | } 24 | 25 | private: 26 | std::atomic val_; 27 | }; 28 | 29 | template 30 | class AtomicWrapper { 31 | public: 32 | CUDA_HOSTDEV inline T load() const { 33 | #ifdef __CUDA_ARCH__ 34 | return val_; 35 | #else 36 | static_assert(sizeof(std::atomic) == sizeof(T)); 37 | 38 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 39 | return tmp->load(std::memory_order_relaxed); 40 | #endif 41 | } 42 | 43 | CUDA_HOSTDEV inline void store(T desired) { 44 | #ifdef __CUDA_ARCH__ 45 | val_ = desired; 46 | #else 47 | static_assert(sizeof(std::atomic) == sizeof(T)); 48 | 49 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 50 | tmp->store(desired, std::memory_order_relaxed); 51 | #endif 52 | } 53 | 54 | CUDA_HOSTDEV inline void add(T val) { 55 | #ifdef __CUDA_ARCH__ 56 | // TODO: _system is necessary if both CPU and GPU are writing, but not sure if it is necessary if only GPU is 
writing and CPU is reading 57 | atomicAdd(const_cast(&val_), val); 58 | #else 59 | static_assert(sizeof(std::atomic) == sizeof(T)); 60 | 61 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 62 | tmp->fetch_add(val, std::memory_order_relaxed); 63 | #endif 64 | } 65 | 66 | CUDA_HOSTDEV inline T inc(T compare) { 67 | #ifdef __CUDA_ARCH__ 68 | return atomicInc(const_cast(&val_), compare); 69 | #else 70 | // FIXME: make it actually atomic 71 | T old = val_; 72 | val_ = (old >= compare) ? 0 : (old + 1); 73 | return old; 74 | #endif 75 | } 76 | 77 | CUDA_HOSTDEV inline T cas(T compare, T val) { 78 | #ifdef __CUDA_ARCH__ 79 | return atomicCAS(const_cast(&val_), compare, val); 80 | #else 81 | // TODO 82 | #endif 83 | } 84 | 85 | private: 86 | volatile T val_; 87 | }; 88 | 89 | -------------------------------------------------------------------------------- /include/llis/ipc/defs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace llis { 6 | 7 | using ClientId = uint32_t; 8 | using JobRefId = uint32_t; 9 | using JobInstanceRefId = uint32_t; 10 | using JobId = uint32_t; 11 | 12 | enum class MsgType : uint32_t { 13 | REGISTER_CLIENT, 14 | REGISTER_JOB, 15 | LAUNCH_JOB, 16 | GROW_POOL, 17 | CUDA_PROFILER_START, 18 | CUDA_PROFILER_STOP, 19 | PROFILER_CMD, 20 | EXIT_CMD 21 | }; 22 | 23 | enum class ProfilerMsgType : uint32_t { 24 | SET_RECORD_KERNEL_INFO, 25 | UNSET_RECORD_KERNEL_INFO, 26 | SET_RECORD_BLOCK_EXEC_TIME, 27 | UNSET_RECORD_BLOCK_EXEC_TIME, 28 | SET_RECORD_KERNEL_BLOCK_MIS_ALLOC, 29 | UNSET_RECORD_KERNEL_BLOCK_MIS_ALLOC, 30 | SET_RECORD_RUN_NEXT_TIMES, 31 | UNSET_RECORD_RUN_NEXT_TIMES, 32 | SET_RECORD_JOB_EVENTS, 33 | UNSET_RECORD_JOB_EVENTS, 34 | SET_RECORD_RESOURCE_EVENTS, 35 | UNSET_RECORD_RESOURCE_EVENTS, 36 | SAVE 37 | }; 38 | 39 | } 40 | 41 | -------------------------------------------------------------------------------- /include/llis/ipc/name_format.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace llis { 8 | namespace ipc { 9 | 10 | std::string s2c_socket_name(const std::string& server_name, ClientId client_id); 11 | std::string s2c_channel_name(const std::string& server_name, ClientId client_id); 12 | std::string c2s_channel_name(const std::string& server_name); 13 | 14 | } 15 | } 16 | 17 | -------------------------------------------------------------------------------- /include/llis/ipc/shm_primitive_channel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace llis { 9 | namespace ipc { 10 | 11 | template 12 | class ShmPrimitiveChannelBase { 13 | public: 14 | ShmPrimitiveChannelBase() : shm_(nullptr) {} 15 | ShmPrimitiveChannelBase(std::string name, size_t count = 0); 16 | ShmPrimitiveChannelBase(ShmPrimitiveChannelBase* channel) { 17 | connect(channel); 18 | } 19 | ShmPrimitiveChannelBase(size_t count) : ShmPrimitiveChannelBase("", count) {} 20 | ~ShmPrimitiveChannelBase(); 21 | 22 | ShmPrimitiveChannelBase(const ShmPrimitiveChannelBase&) = delete; 23 | ShmPrimitiveChannelBase& operator=(const ShmPrimitiveChannelBase&) = delete; 24 | 25 | ShmPrimitiveChannelBase(ShmPrimitiveChannelBase&&); 26 | ShmPrimitiveChannelBase& operator=(ShmPrimitiveChannelBase&&); 27 | 28 | void connect(std::string name, size_t count = 0); 29 | void connect(ShmPrimitiveChannelBase* channel); 30 | 31 | 
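// fork() below hands out another handle onto the same shared ring buffer by
// reusing connect(this); job::Context relies on exactly this to keep its own
// copy of the Gpu2SchedChannel (see context.h).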
ShmPrimitiveChannelBase fork() { 32 | ShmPrimitiveChannelBase res; 33 | res.connect(this); 34 | return res; 35 | } 36 | 37 | void disconnect(); 38 | bool is_connected(); 39 | 40 | template 41 | CUDA_HOSTDEV U read(); 42 | template 43 | CUDA_HOSTDEV void write(U val); 44 | 45 | template 46 | CUDA_HOSTDEV bool can_read(); 47 | 48 | private: 49 | int fd_; 50 | char* shm_; 51 | T* ring_buf_; 52 | size_t count_; 53 | size_t total_size_; 54 | bool is_create_; 55 | std::string name_with_prefix_; 56 | 57 | unsigned read_pos_; 58 | AtomicWrapper* write_pos_; 59 | 60 | T cached_head_; 61 | }; 62 | 63 | template 64 | using ShmPrimitiveChannel = ShmPrimitiveChannelBase; 65 | template 66 | using ShmPrimitiveChannelGpu = ShmPrimitiveChannelBase; 67 | 68 | using Gpu2SchedChannel = ShmPrimitiveChannelGpu; 69 | 70 | } 71 | } 72 | 73 | #include "shm_primitive_channel_impl.h" 74 | 75 | -------------------------------------------------------------------------------- /include/llis/ipc/shm_primitive_channel_impl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | namespace llis { 16 | namespace ipc { 17 | 18 | template 19 | template 20 | CUDA_HOSTDEV U ShmPrimitiveChannelBase::read() { 21 | static_assert(sizeof(T) == sizeof(U), "The type being read must be of the same size as the type of the channel"); 22 | 23 | U* ptr = reinterpret_cast(ring_buf_ + read_pos_); 24 | U* cached_head_u_ptr = reinterpret_cast(&cached_head_); 25 | AtomicWrapper* ptr_atomic = reinterpret_cast*>(ptr); 26 | 27 | while (!cached_head_u_ptr->can_read()) { 28 | cached_head_ = ptr_atomic->load(); 29 | } 30 | 31 | cached_head_u_ptr->set_can_write(); 32 | ptr_atomic->store(cached_head_); 33 | 34 | if (read_pos_ == count_ - 1) { 35 | read_pos_ = 0; 36 | } else { 37 | ++read_pos_; 38 | } 39 | 40 | return *cached_head_u_ptr; 41 | } 42 | 43 | template 44 | template 45 | CUDA_HOSTDEV void ShmPrimitiveChannelBase::write(U val) { 46 | // TODO: it is probably possible to remove the critical session between acquire and store 47 | // Not sure which one has better performance 48 | 49 | static_assert(sizeof(T) == sizeof(U), "The type being written must be of the same size as the type of the channel"); 50 | 51 | size_t write_pos = write_pos_->inc(count_ - 1); 52 | U* ptr = reinterpret_cast(ring_buf_ + write_pos); 53 | 54 | reinterpret_cast*>(ptr)->store(*reinterpret_cast(&val)); 55 | } 56 | 57 | template 58 | template 59 | CUDA_HOSTDEV bool ShmPrimitiveChannelBase::can_read() { 60 | U* ptr = reinterpret_cast(ring_buf_ + read_pos_); 61 | cached_head_ = reinterpret_cast*>(ptr)->load(); 62 | 63 | return reinterpret_cast(&cached_head_)->can_read(); 64 | } 65 | 66 | 67 | } 68 | } 69 | 70 | -------------------------------------------------------------------------------- /include/llis/ipc/threadfence_wrapper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | template 8 | class ThreadfenceWrapper {}; 9 | 10 | template 11 | class ThreadfenceWrapper { 12 | public: 13 | inline T load() const { 14 | return val_.load(std::memory_order_acquire); 15 | } 16 | 17 | inline void store(T desired) { 18 | val_.store(desired, std::memory_order_release); 19 | } 20 | 21 | private: 22 | std::atomic val_; 23 | }; 24 | 25 | template 26 | class ThreadfenceWrapper { 27 | public: 28 | CUDA_HOSTDEV inline T 
load() const { 29 | #ifdef __CUDA_ARCH__ 30 | T val = val_; 31 | __threadfence_system(); 32 | return val; 33 | #else 34 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 35 | return tmp->load(std::memory_order_acquire); 36 | #endif 37 | } 38 | 39 | CUDA_HOSTDEV inline void store(T desired) { 40 | #ifdef __CUDA_ARCH__ 41 | __threadfence_system(); 42 | val_ = desired; 43 | #else 44 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 45 | tmp->store(desired, std::memory_order_release); 46 | #endif 47 | } 48 | 49 | private: 50 | volatile T val_; 51 | }; 52 | 53 | -------------------------------------------------------------------------------- /include/llis/ipc/unix_datagram_socket.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace llis { 8 | namespace ipc { 9 | 10 | class UnixDatagramSocket { 11 | public: 12 | UnixDatagramSocket(); 13 | UnixDatagramSocket(const std::string& name); 14 | 15 | UnixDatagramSocket(UnixDatagramSocket&&); 16 | UnixDatagramSocket& operator=(UnixDatagramSocket&&); 17 | 18 | ~UnixDatagramSocket(); 19 | 20 | void bind(const std::string& name); 21 | UnixDatagramSocket connect(const std::string& name); 22 | 23 | ssize_t write(const void* buf, size_t count); 24 | ssize_t read(void* buf, size_t count); 25 | 26 | private: 27 | UnixDatagramSocket(int socket); 28 | 29 | int socket_; 30 | bool is_owner_; 31 | 32 | sockaddr_un remote_addr_; 33 | }; 34 | 35 | } 36 | } 37 | 38 | -------------------------------------------------------------------------------- /include/llis/job/context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace llis { 8 | namespace job { 9 | 10 | class Context { 11 | public: 12 | static Job* get_current_job() { 13 | return current_job_; 14 | } 15 | 16 | static void set_current_job(Job* job) { 17 | current_job_ = job; 18 | } 19 | 20 | static void set_gpu2sched_channel(ipc::Gpu2SchedChannel* gpu2sched_channel) { 21 | gpu2sched_channel_ = gpu2sched_channel->fork(); 22 | } 23 | 24 | static ipc::Gpu2SchedChannel* get_gpu2sched_channel() { 25 | return &gpu2sched_channel_; 26 | } 27 | 28 | #ifdef LLIS_MEASURE_BLOCK_TIME 29 | static void set_gpu2sched_block_time_channel(ipc::Gpu2SchedChannel* gpu2sched_block_time_channel) { 30 | gpu2sched_block_time_channel_ = gpu2sched_block_time_channel->fork(); 31 | } 32 | 33 | static ipc::Gpu2SchedChannel* get_gpu2sched_block_time_channel() { 34 | return &gpu2sched_block_time_channel_; 35 | } 36 | #endif 37 | 38 | static void set_mem2sched_channel(ipc::ShmChannelCpuReader* mem2sched_channel) { 39 | mem2sched_channel_ = mem2sched_channel->fork(); 40 | } 41 | 42 | static ipc::ShmChannelCpuWriter* get_mem2sched_channel() { 43 | return &mem2sched_channel_; 44 | } 45 | 46 | private: 47 | static Job* current_job_; 48 | static ipc::Gpu2SchedChannel gpu2sched_channel_; 49 | #ifdef LLIS_MEASURE_BLOCK_TIME 50 | static ipc::Gpu2SchedChannel gpu2sched_block_time_channel_; 51 | #endif 52 | static ipc::ShmChannelCpuWriter mem2sched_channel_; 53 | }; 54 | 55 | } 56 | } 57 | 58 | -------------------------------------------------------------------------------- /include/llis/job/coroutine_job.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace llis { 7 | namespace job { 8 | 9 | class CoroutineJob : public Job { 10 | public: 
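// Lifecycle, as implemented below: full_init() runs one_time_init() and then
// init(), which starts the coroutine immediately so that body() can set up the
// resource requirements of its first kernel; afterwards the job is driven
// through has_next()/run_next(), and body() calls yield() to hand control back
// after each stage.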
11 |     void full_init(void* io_ptr) final {
12 |         one_time_init();
13 |         init(io_ptr);
14 |     }
15 | 
16 |     void init(void* io_ptr) final {
17 |         // Note that this will run the coroutine immediately. This is necessary because we need the coroutine to set up the resource requirements of the first kernel
18 | 
19 |         coroutine_pull_ = std::make_unique<boost::coroutines2::coroutine<void>::pull_type>([this, io_ptr](boost::coroutines2::coroutine<void>::push_type& coroutine_push) {
20 |             coroutine_push_ = &coroutine_push;
21 |             body(io_ptr);
22 |         });
23 |     }
24 | 
25 |     void run_next() final {
26 |         (*coroutine_pull_)();
27 |     }
28 | 
29 |     virtual void one_time_init() = 0;
30 | 
31 |     virtual void body(void* io_ptr) = 0;
32 | 
33 |     bool has_next() const final {
34 |         return (bool)(*coroutine_pull_);
35 |     }
36 | 
37 |     void yield() {
38 |         (*coroutine_push_)();
39 |     }
40 | 
41 | private:
42 |     std::unique_ptr<boost::coroutines2::coroutine<void>::pull_type> coroutine_pull_;
43 |     boost::coroutines2::coroutine<void>::push_type* coroutine_push_;
44 | };
45 | 
46 | }
47 | }
48 | 
-------------------------------------------------------------------------------- /include/llis/job/instrument.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | #include 
5 | 
6 | namespace llis {
7 | namespace job {
8 | 
9 | __device__ inline void kernel_start(JobId job_id, ipc::Gpu2SchedChannel* gpu2sched_channel
10 | #ifdef LLIS_MEASURE_BLOCK_TIME
11 |         , BlockStartEndTime* start_end_time
12 | #endif
13 |         ) {
14 |     if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
15 | #ifdef LLIS_MEASURE_BLOCK_TIME
16 |         unsigned clock_val = clock64() >> 8;
17 |         clock_val &= 0xFFFFFF;
18 |         start_end_time->data[0] = clock_val >> 8;
19 |         start_end_time->data[1] = (clock_val & 0xFF) << 8;
20 | #endif
21 | 
22 |         unsigned smid;
23 |         asm("mov.u32 %0, %%smid;" : "=r"(smid)); // %%smid: inline PTX requires the special register's % to be escaped
24 | 
25 |         InstrumentInfo info;
26 |         info.is_start = 1;
27 |         info.smid = smid;
28 |         info.job_id = job_id;
29 | 
30 |         gpu2sched_channel->write(info);
31 |     }
32 | }
33 | 
34 | __device__ inline void kernel_end(JobId job_id, ipc::Gpu2SchedChannel* gpu2sched_channel
35 | #ifdef LLIS_MEASURE_BLOCK_TIME
36 |         , ipc::Gpu2SchedChannel* gpu2sched_block_time_channel
37 |         , BlockStartEndTime* start_end_time
38 | #endif
39 |         ) {
40 |     __syncthreads();
41 | 
42 |     if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
43 | #ifdef LLIS_MEASURE_BLOCK_TIME
44 |         unsigned clock_val = clock64() >> 8;
45 |         clock_val &= 0xFFFFFF;
46 |         start_end_time->data[1] |= clock_val >> 16;
47 |         start_end_time->data[2] = clock_val & 0xFFFF;
48 | #endif
49 | 
50 |         unsigned smid;
51 |         asm("mov.u32 %0, %%smid;" : "=r"(smid));
52 | 
53 |         InstrumentInfo info;
54 |         info.is_start = 0;
55 |         info.smid = smid;
56 |         info.job_id = job_id;
57 | 
58 |         gpu2sched_channel->write(info);
59 | #ifdef LLIS_MEASURE_BLOCK_TIME
60 |         gpu2sched_block_time_channel->write(*start_end_time);
61 | #endif
62 |     }
63 | }
64 | 
65 | }
66 | }
67 | 
-------------------------------------------------------------------------------- /include/llis/job/instrument_info.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | #include 
5 | 
6 | namespace llis {
7 | namespace job {
8 | 
9 | class InstrumentInfo {
10 | public:
11 |     uint8_t is_start;
12 |     uint8_t smid;
13 |     uint8_t num;
14 | private:
15 |     uint8_t status_;
16 | public:
17 |     JobId job_id;
18 | 
19 |     CUDA_HOSTDEV InstrumentInfo() {
20 |         set_can_read();
21 |     }
22 | 
23 |     CUDA_HOSTDEV bool can_read() const {
24 |         return status_ == 1;
25 |     }
26 | 
27 |     CUDA_HOSTDEV bool
can_write() const { 28 | return status_ == 0; 29 | } 30 | 31 | CUDA_HOSTDEV void set_can_read() { 32 | status_ = 1; 33 | } 34 | 35 | CUDA_HOSTDEV void set_can_write() { 36 | status_ = 0; 37 | } 38 | }; 39 | 40 | #ifdef LLIS_MEASURE_BLOCK_TIME 41 | 42 | class BlockStartEndTime { 43 | private: 44 | uint8_t status_; 45 | uint8_t dummy_; 46 | public: 47 | uint16_t data[3]; 48 | 49 | CUDA_HOSTDEV BlockStartEndTime() { 50 | set_can_read(); 51 | } 52 | 53 | CUDA_HOSTDEV bool can_read() const { 54 | return status_ == 1; 55 | } 56 | 57 | CUDA_HOSTDEV bool can_write() const { 58 | return status_ == 0; 59 | } 60 | 61 | CUDA_HOSTDEV void set_can_read() { 62 | status_ = 1; 63 | } 64 | 65 | CUDA_HOSTDEV void set_can_write() { 66 | status_ = 0; 67 | } 68 | }; 69 | 70 | #endif 71 | 72 | } 73 | } 74 | 75 | -------------------------------------------------------------------------------- /include/llis/job/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace llis { 6 | namespace job { 7 | 8 | void memset_res(size_t count, Job* job); 9 | void memset(void* ptr, int val, size_t count, Job* job, ipc::ShmChannelGpu* gpu2sched_channel); 10 | 11 | } 12 | } 13 | 14 | -------------------------------------------------------------------------------- /include/llis/server/client_connection.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace llis { 8 | namespace server { 9 | 10 | class ClientConnection { 11 | public: 12 | ClientConnection(ClientId client_id) : client_id_(client_id) {} 13 | 14 | void use_s2c_channel(ipc::ShmChannelCpuWriter&& s2c_channel) { 15 | s2c_channel_ = std::move(s2c_channel); 16 | } 17 | 18 | void use_s2c_socket(ipc::UnixDatagramSocket&& sock) { 19 | s2c_socket_ = std::move(sock); 20 | } 21 | 22 | ipc::ShmChannelCpuWriter* get_s2c_channel() { 23 | return &s2c_channel_; 24 | } 25 | 26 | ipc::UnixDatagramSocket* get_s2c_socket() { 27 | return &s2c_socket_; 28 | } 29 | 30 | ClientId get_client_id() const { 31 | return client_id_; 32 | } 33 | 34 | private: 35 | ClientId client_id_; 36 | 37 | ipc::ShmChannelCpuWriter s2c_channel_; 38 | ipc::UnixDatagramSocket s2c_socket_; 39 | }; 40 | 41 | } 42 | } 43 | 44 | -------------------------------------------------------------------------------- /include/llis/server/gpu_resources.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace llis { 8 | namespace server { 9 | 10 | class GpuResources { 11 | public: 12 | GpuResources(); 13 | 14 | void acquire(int smid, job::Job* job, int num); 15 | void release(int smid, job::Job* job, int num); 16 | 17 | bool job_fits(job::Job* job) const; 18 | 19 | unsigned get_num_sms() const { 20 | return sms_resources_.size(); 21 | } 22 | 23 | bool is_full() const { 24 | return num_full_sms_ >= sms_resources_.size(); 25 | } 26 | 27 | void choose_sms(job::Job* job); 28 | 29 | double dot(job::Job* job) const; 30 | double dot_normalized(job::Job* job) const; 31 | float normalize_resources(job::Job* job) const; 32 | 33 | #ifdef LLIS_ENABLE_PROFILER 34 | void set_profiler(Profiler* profiler) { 35 | for (auto& sm_resources : sms_resources_) { 36 | sm_resources.set_profiler(profiler); 37 | } 38 | } 39 | #endif 40 | 41 | private: 42 | std::vector sms_resources_; 43 | SmResources total_resources_; 44 | unsigned num_full_sms_ = 0; 45 | 46 | 
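// The members below track per-GPC (graphics processing cluster) block counts.
// The hard-coded gpc_sms_ table assumes a 5-GPC x 8-SM (40-SM) layout; per the
// TODO below, the real SM-to-GPC allocation is not detected at runtime.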
std::vector gpc_num_blocks_; 47 | std::vector gpc_next_sms_; 48 | // TODO: detect the actual allocation 49 | constexpr static unsigned gpc_sms_[5][8] = {{0, 10, 20, 30, 1, 11, 21, 31}, {2, 12, 22, 32, 3, 13, 23, 33}, {4, 14, 24, 34, 5, 15, 25, 35}, {6, 16, 26, 36, 7, 17, 27, 37}, {8, 18, 28, 38, 9, 19, 29, 39}}; 50 | }; 51 | 52 | } 53 | } 54 | 55 | -------------------------------------------------------------------------------- /include/llis/server/registered_job.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace llis { 11 | namespace server { 12 | 13 | class RegisteredJob { 14 | public: 15 | RegisteredJob(JobRefId registered_job_id, 16 | ipc::ShmChannelCpuReader* c2s_channel_, 17 | ClientConnection* client_connection); 18 | RegisteredJob(RegisteredJob&&) = default; 19 | 20 | void init(ipc::ShmChannelCpuReader* c2s_channel, 21 | ClientConnection* client_connection); 22 | 23 | std::unique_ptr create_instance(); 24 | void grow_pool(); 25 | std::unique_ptr init_job(); 26 | void release_instance(std::unique_ptr job); 27 | 28 | void update_stage_length(unsigned stage_id, double len); 29 | void set_stage_resource(unsigned stage_id, float res); 30 | bool has_stage_resource(unsigned stage_id); 31 | double get_stage_length(unsigned stage_id) const; 32 | float get_stage_resource(unsigned stage_id) const; 33 | double get_remaining_length(unsigned from_stage) const; 34 | double get_remaining_rl(unsigned from_stage) const; 35 | 36 | const std::vector& get_stage_lengths() const { 37 | return stage_lengths_; 38 | } 39 | 40 | const std::vector& get_stage_resources() const { 41 | return stage_resources_; 42 | } 43 | 44 | private: 45 | typedef job::Job* (*init_job_t)(); 46 | 47 | JobRefId registered_job_id_; 48 | ipc::ShmChannelCpuReader* c2s_channel_; 49 | ClientConnection* client_connection_; 50 | 51 | ipc::ShmChannelCpuWriter* s2c_channel_; 52 | init_job_t init_job_; 53 | job::Job* job_; 54 | std::string shm_name_; 55 | int shm_fd_; 56 | 57 | size_t pool_size_in_bytes_; 58 | std::vector mapped_mem_; 59 | 60 | std::vector> unused_job_instances_; 61 | 62 | std::vector stage_lengths_; 63 | #ifdef PRINT_STAGE_LENGTH_STDDEV 64 | std::vector stage_lengths_sum_; 65 | std::vector stage_lengths_sum_sqr_; 66 | std::vector stage_lengths_num_; 67 | #endif 68 | std::vector stage_resources_; 69 | }; 70 | 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /include/llis/server/scheduler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace llis { 13 | 14 | namespace po = boost::program_options; 15 | 16 | namespace server { 17 | 18 | class Scheduler { 19 | public: 20 | virtual void set_server(Server* server) { 21 | server_ = server; 22 | profiler_ = server_->get_profiler(); 23 | } 24 | 25 | virtual void try_handle_block_start_finish() = 0; 26 | virtual void handle_new_job(std::unique_ptr job) = 0; 27 | 28 | protected: 29 | Server* server_; 30 | Profiler* profiler_; 31 | }; 32 | 33 | class SchedulerFactory { 34 | public: 35 | using RegisterFunc = std::function(const po::variables_map&)>; 36 | 37 | static bool register_scheduler(std::string name, RegisterFunc func); 38 | static std::unique_ptr create(std::string name, const po::variables_map& args); 39 | 40 | private: 41 | 
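// Registry behind the self-registration pattern: each scheduler's .cpp file
// expands LLIS_SCHEDULER_REGISTER (defined below) at namespace scope, so its
// factory function is added to this map during static initialization, before
// main() runs; create() then instantiates a scheduler by name.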
static std::unordered_map registered_schedulers_; 42 | }; 43 | 44 | #define LLIS_SCHEDULER_REGISTER(name, args) \ 45 | static bool __scheduler_register_ = llis::server::SchedulerFactory::register_scheduler(name, args); 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /include/llis/server/scheduler_fifo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define GPU2SCHED_CHAN_SIZE 1024000 20 | #define GPU2SCHED_CHAN_SIZE_TIME 10240000 21 | 22 | namespace llis { 23 | namespace server { 24 | 25 | class SchedulerFifo : public Scheduler { 26 | public: 27 | SchedulerFifo(unsigned num_streams, unsigned sched_sleep); 28 | 29 | void handle_new_job(std::unique_ptr job) override; 30 | void try_handle_block_start_finish() override; 31 | 32 | private: 33 | void handle_block_start_finish(); 34 | #ifdef LLIS_MEASURE_BLOCK_TIME 35 | void handle_block_start_end_time(); 36 | #endif 37 | void handle_block_start(const job::InstrumentInfo& info); 38 | void handle_block_finish(const job::InstrumentInfo& info); 39 | void handle_mem_finish(); 40 | 41 | void schedule_job(); 42 | 43 | static void mem_notification_callback(void* job); 44 | 45 | ipc::ShmPrimitiveChannelGpu gpu2sched_channel_; 46 | #ifdef LLIS_MEASURE_BLOCK_TIME 47 | ipc::ShmPrimitiveChannelGpu gpu2sched_block_time_channel_; 48 | #endif 49 | ipc::ShmChannelCpuReader mem2sched_channel_; 50 | 51 | std::vector cuda_streams_; 52 | job::FinishedBlockNotifier* finished_block_notifiers_raw_; 53 | std::vector finished_block_notifiers_; 54 | 55 | std::queue job_queue_; 56 | 57 | std::vector> job_id_to_job_map_; 58 | std::vector unused_job_id_; 59 | 60 | std::vector remaining_num_blocks_; 61 | std::vector pre_notify_blocks_; 62 | std::vector pre_notify_sent_; 63 | 64 | unsigned num_jobs_ = 0; 65 | 66 | #ifdef PRINT_NUM_RUNNING_JOBS 67 | unsigned num_running_jobs_ = 0; 68 | #endif 69 | #ifdef PRINT_NUM_RUNNING_BLOCKS 70 | unsigned num_running_blocks_ = 0; 71 | #endif 72 | }; 73 | 74 | } 75 | } 76 | 77 | -------------------------------------------------------------------------------- /include/llis/server/scheduler_fifo2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define GPU2SCHED_CHAN_SIZE 1024000 20 | #define GPU2SCHED_CHAN_SIZE_TIME 10240000 21 | 22 | namespace llis { 23 | namespace server { 24 | 25 | class SchedulerFifo2 : public Scheduler { 26 | public: 27 | SchedulerFifo2(unsigned num_streams, unsigned sched_sleep); 28 | 29 | void handle_new_job(std::unique_ptr job) override; 30 | void try_handle_block_start_finish() override; 31 | 32 | private: 33 | class JobCompare { 34 | public: 35 | bool operator() (const job::Job* left, const job::Job* right) const { 36 | return left->get_unique_id() > right->get_unique_id(); 37 | } 38 | }; 39 | 40 | void handle_block_start_finish(); 41 | #ifdef LLIS_MEASURE_BLOCK_TIME 42 | void handle_block_start_end_time(); 43 | #endif 44 | void handle_block_start(const job::InstrumentInfo& info); 45 | void handle_block_finish(const job::InstrumentInfo& info); 46 | void 
handle_mem_finish(); 47 | 48 | void schedule_job(); 49 | 50 | static void mem_notification_callback(void* job); 51 | 52 | ipc::ShmPrimitiveChannelGpu gpu2sched_channel_; 53 | #ifdef LLIS_MEASURE_BLOCK_TIME 54 | ipc::ShmPrimitiveChannelGpu gpu2sched_block_time_channel_; 55 | #endif 56 | ipc::ShmChannelCpuReader mem2sched_channel_; 57 | 58 | std::vector cuda_streams_; 59 | job::FinishedBlockNotifier* finished_block_notifiers_raw_; 60 | std::vector finished_block_notifiers_; 61 | 62 | std::priority_queue, JobCompare> job_queue_; 63 | 64 | std::vector> job_id_to_job_map_; 65 | std::vector unused_job_id_; 66 | 67 | unsigned num_jobs_ = 0; 68 | 69 | #ifdef PRINT_NUM_RUNNING_KERNELS 70 | unsigned num_running_kernels_ = 0; 71 | unsigned num_running_mems_ = 0; 72 | #endif 73 | }; 74 | 75 | } 76 | } 77 | 78 | -------------------------------------------------------------------------------- /include/llis/server/server.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #define SER2SCHED_CHAN_SIZE 1024 15 | #define CLT2SCHED_CHAN_SIZE 1024 16 | 17 | namespace llis { 18 | namespace server { 19 | 20 | class Scheduler; 21 | 22 | constexpr size_t s2c_channel_size = 4096; 23 | 24 | class Server { 25 | public: 26 | Server(std::string server_name, Scheduler* scheduler); 27 | 28 | void serve(); 29 | 30 | void notify_job_starts(job::Job* job); 31 | void notify_job_ends(job::Job* job); 32 | 33 | void release_job_instance(std::unique_ptr job); 34 | 35 | void update_job_stage_length(job::Job* job, unsigned stage_id, double len); 36 | void set_job_stage_resource(job::Job* job, unsigned stage_id, float res); 37 | bool has_job_stage_resource(job::Job* job, unsigned stage_id); 38 | 39 | double get_job_remaining_length(job::Job* job, unsigned from_stage) const; 40 | double get_job_remaining_rl(job::Job* job, unsigned from_stage) const; 41 | 42 | const std::vector& get_job_stage_lengths(job::Job* job) const; 43 | const std::vector& get_job_stage_resources(job::Job* job) const; 44 | 45 | Profiler* get_profiler() { 46 | return &profiler_; 47 | } 48 | 49 | private: 50 | void try_handle_c2s(); 51 | void handle_c2s(); 52 | void handle_register_client(); 53 | void handle_register_job(); 54 | void handle_launch_job(); 55 | void handle_grow_pool(); 56 | void handle_release_job_instance(); 57 | 58 | std::string server_name_; 59 | Scheduler* scheduler_; 60 | ipc::UnixDatagramSocket s2c_socket_; 61 | 62 | ipc::ShmChannelCpuReader c2s_channel_; 63 | 64 | std::vector client_connections_; 65 | std::vector unused_client_connections_; 66 | 67 | std::vector registered_jobs_; 68 | std::vector unused_registered_jobs_; 69 | 70 | Profiler profiler_; 71 | }; 72 | 73 | } 74 | } 75 | 76 | -------------------------------------------------------------------------------- /include/llis/server/sm_resources.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace llis { 7 | namespace server { 8 | 9 | class SmResources { 10 | public: 11 | SmResources(int nregs, int smem, int nthrs, int nblocks); 12 | SmResources(); 13 | 14 | void acquire(job::Job* job, int num); 15 | void release(job::Job* job, int num); 16 | 17 | double dot(job::Job* job) const; 18 | double dot_normalized(job::Job* job) const; 19 | float normalize_resources(job::Job* job) const; 20 | 21 | unsigned 
num_blocks(job::Job* job) const; 22 | 23 | double occupancy() const; 24 | 25 | bool is_full() const { 26 | return nregs_ <= 0 || smem_ <= 0 || nthrs_ <= 0 || nblocks_ <= 0; 27 | } 28 | 29 | bool job_fits(job::Job* job) const { 30 | return nregs_ >= (int)job->get_num_registers_per_thread() * (int)job->get_num_threads_per_block() * (int)job->get_num_blocks() && smem_ >= (int)job->get_smem_size_per_block() * (int)job->get_num_blocks() && nthrs_ >= (int)job->get_num_threads_per_block() * (int)job->get_num_blocks() && nblocks_ >= (int)job->get_num_blocks(); 31 | } 32 | 33 | #ifdef LLIS_ENABLE_PROFILER 34 | void set_profiler(Profiler* profiler) { 35 | profiler_ = profiler; 36 | } 37 | #endif 38 | 39 | private: 40 | int nregs_ = 0; 41 | int smem_ = 0; 42 | int nthrs_ = 0; 43 | int nblocks_ = 0; 44 | 45 | int max_nregs_ = 0; 46 | int max_smem_ = 0; 47 | int max_nthrs_ = 0; 48 | int max_nblocks_ = 0; 49 | 50 | double max_resources_dot_prod_; 51 | 52 | Profiler* profiler_; 53 | }; 54 | 55 | } 56 | } 57 | 58 | -------------------------------------------------------------------------------- /include/llis/utils/align.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | namespace llis { 9 | namespace utils { 10 | 11 | CUDA_HOSTDEV inline size_t next_aligned_pos(size_t next_pos, size_t align) { 12 | return (next_pos + align - 1) & ~(align - 1); 13 | } 14 | 15 | template 16 | CUDA_HOSTDEV inline T* next_aligned_ptr(T* next_ptr, size_t align) { 17 | return reinterpret_cast(reinterpret_cast(next_ptr) + align - 1) & ~(align - 1); 18 | } 19 | 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /include/llis/utils/gpu.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // From https://stackoverflow.com/a/32015007/1213644 4 | #ifdef __CUDACC__ 5 | #define CUDA_HOSTDEV __host__ __device__ 6 | #else 7 | #define CUDA_HOSTDEV 8 | #endif 9 | 10 | -------------------------------------------------------------------------------- /include/llis/utils/ops.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ops.h - useful x86_64 instructions 3 | */ 4 | 5 | #pragma once 6 | 7 | static inline uint64_t rdtsc(void) { 8 | uint32_t a, d; 9 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 10 | return ((uint64_t)a) | (((uint64_t)d) << 32); 11 | } 12 | 13 | static inline uint64_t rdtscp(uint32_t *auxp) { 14 | uint32_t a, d, c; 15 | asm volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); 16 | if (auxp) 17 | *auxp = c; 18 | return ((uint64_t)a) | (((uint64_t)d) << 32); 19 | } 20 | -------------------------------------------------------------------------------- /include/llis/utils/path.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace llis { 7 | namespace utils { 8 | 9 | namespace internal { 10 | 11 | template 12 | std::filesystem::path path_concat_internal(T path_str) { 13 | return std::filesystem::path(path_str); 14 | } 15 | 16 | template 17 | std::filesystem::path path_concat_internal(T path_str, Args... paths_str) { 18 | std::filesystem::path res = path_concat_internal(paths_str...); 19 | res = std::filesystem::path(path_str) / res; 20 | return res; 21 | } 22 | 23 | } 24 | 25 | template 26 | std::string path_concat(Args... 
paths_str) { 27 | return internal::path_concat_internal(paths_str...).string(); 28 | } 29 | 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /include/llis/utils/time.hh: -------------------------------------------------------------------------------- 1 | #ifndef LLIS_TIME_H_IS_INCLUDED 2 | #define LLIS_TIME_H_IS_INCLUDED 3 | 4 | #define CPU_FREQ 2.5 // Adjust to your CPU's clock speed (GHz) 5 | 6 | #include <chrono> 7 | #include <cstdint> 8 | using hr_clock = std::chrono::steady_clock; 9 | typedef hr_clock::time_point tp; 10 | 11 | uint64_t since_epoch(const tp &time); 12 | uint64_t ns_diff(const tp &start, const tp &end); 13 | 14 | static const auto system_start_time = hr_clock::now(); 15 | 16 | static inline uint64_t cycles_to_ns(uint64_t time) { 17 | return time / CPU_FREQ; 18 | } 19 | 20 | #endif /* LLIS_TIME_H_IS_INCLUDED */ 21 | -------------------------------------------------------------------------------- /jobs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CMAKE_INSTALL_RPATH $ORIGIN $ORIGIN/..) 2 | 3 | add_subdirectory(helloworld) 4 | add_subdirectory(helloworld_coroutine) 5 | add_subdirectory(vec_add_coroutine) 6 | add_subdirectory(run_forever) 7 | add_subdirectory(dummy_long) 8 | add_subdirectory(dummy_short) 9 | add_subdirectory(dummy_10) 10 | add_subdirectory(dummy_11) 11 | add_subdirectory(dummy_20) 12 | add_subdirectory(dummy_21) 13 | # Has to be fixed before it can be run 14 | #add_subdirectory(cnn) 15 | 16 | if(tvm_FOUND) 17 | add_subdirectory(tvm_mnist) 18 | add_subdirectory(tvm_mobilenet) 19 | add_subdirectory(tvm_resnet18) 20 | add_subdirectory(tvm_inception_v3) 21 | add_subdirectory(tvm_googlenet) 22 | add_subdirectory(tvm_ultraface320) 23 | add_subdirectory(tvm_densenet121) 24 | add_subdirectory(tvm_arcfaceresnet100) 25 | add_subdirectory(tvm_resnet50) 26 | add_subdirectory(tvm_resnet34) 27 | add_subdirectory(tvm_squeezenet1_1) 28 | endif(tvm_FOUND) 29 | 30 | -------------------------------------------------------------------------------- /jobs/cnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_cnn SHARED main.cu layer.cu $) 2 | set_target_properties(job_cnn PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_cnn llis_job_gpu llis_context) 4 | -------------------------------------------------------------------------------- /jobs/dummy_10/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_10 SHARED dummy.cu $) 2 | set_target_properties(job_dummy_10 PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_10 PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_10 llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_10/dummy.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_kernel(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i
<= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < num_kernels; ++i) { 55 | if (i == num_kernels - 1) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_kernel<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | static constexpr unsigned num_kernels = 10; 70 | }; 71 | 72 | extern "C" { 73 | 74 | __attribute__((visibility("default"))) 75 | llis::job::Job* init_job() { 76 | return new DummyShortCoroutineJob(); 77 | } 78 | 79 | } 80 | 81 | -------------------------------------------------------------------------------- /jobs/dummy_11/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_11 SHARED dummy.cu $) 2 | set_target_properties(job_dummy_11 PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_11 PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_11 llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_11/dummy.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_kernel(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < num_kernels; ++i) { 55 | if (i == num_kernels - 1) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_kernel<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_,
get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | static constexpr unsigned num_kernels = 11; 70 | }; 71 | 72 | extern "C" { 73 | 74 | __attribute__((visibility("default"))) 75 | llis::job::Job* init_job() { 76 | return new DummyShortCoroutineJob(); 77 | } 78 | 79 | } 80 | 81 | -------------------------------------------------------------------------------- /jobs/dummy_20/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_20 SHARED dummy.cu $) 2 | set_target_properties(job_dummy_20 PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_20 PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_20 llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_20/dummy.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_kernel(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < num_kernels; ++i) { 55 | if (i == num_kernels - 1) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_kernel<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | static constexpr unsigned num_kernels = 20; 70 | }; 71 | 72 | extern "C" { 73 | 74 | __attribute__((visibility("default"))) 75 | llis::job::Job* init_job() { 76 | return new DummyShortCoroutineJob(); 77 | } 78 | 79 | } 80 | 81 | -------------------------------------------------------------------------------- /jobs/dummy_21/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_21 SHARED dummy.cu $) 2 | set_target_properties(job_dummy_21 PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_21 PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_21 llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_21/dummy.cu:
-------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_kernel(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < num_kernels; ++i) { 55 | if (i == num_kernels - 1) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_kernel<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | static constexpr unsigned num_kernels = 21; 70 | }; 71 | 72 | extern "C" { 73 | 74 | __attribute__((visibility("default"))) 75 | llis::job::Job* init_job() { 76 | return new DummyShortCoroutineJob(); 77 | } 78 | 79 | } 80 | 81 | -------------------------------------------------------------------------------- /jobs/dummy_long/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_long SHARED dummy_long.cu $) 2 | set_target_properties(job_dummy_long PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_long PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_long llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_long/dummy_long.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_long(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyLongCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 |
return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < 50; ++i) { 55 | if (i == 49) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_long<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | }; 70 | 71 | extern "C" { 72 | 73 | __attribute__((visibility("default"))) 74 | llis::job::Job* init_job() { 75 | return new DummyLongCoroutineJob(); 76 | } 77 | 78 | } 79 | 80 | -------------------------------------------------------------------------------- /jobs/dummy_short/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_short SHARED dummy_short.cu $) 2 | set_target_properties(job_dummy_short PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_short PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_short llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_short/dummy_short.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_short(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < 10; ++i) { 55 | if (i == 9) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_short<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | }; 70 | 71 | extern "C" { 72 | 73 | __attribute__((visibility("default"))) 74 | llis::job::Job* init_job() { 75 | return new DummyShortCoroutineJob(); 76 | } 77 | 78 | } 79 | 80 |
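The dummy_10/dummy_11/dummy_20/dummy_21/dummy_long/dummy_short jobs above are identical except for how many kernels each instance launches (10, 11, 20, 21, 50, and 10 respectively). For readers who want to see what one such launch reduces to without the llis runtime, here is a minimal, self-contained CUDA sketch (illustrative only, not part of the repo; `busy_loop` and `main` are stand-ins) of the grid-stride busy loop with the same 5-block x 256-thread configuration that the jobs set via set_num_blocks()/set_num_threads_per_block():

```cuda
// Hypothetical standalone illustration, not part of the repo: the dummy
// jobs' kernel body and launch configuration, with the llis notifier and
// scheduler plumbing removed.
#include <cuda_runtime.h>

__global__ void busy_loop(float* mem, unsigned count, unsigned compute_count) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned grid_size = blockDim.x * gridDim.x;
    // Grid-stride loop: 5 blocks x 256 threads cover all `count` elements.
    while (id < count) {
        float tmp = 1;
        for (unsigned i = 1; i <= compute_count; ++i) {
            tmp *= i;  // pure busy work, same as dummy_kernel
        }
        mem[id] = tmp;
        id += grid_size;
    }
}

int main() {
    const unsigned count = 5000000;   // count_ in the jobs
    float* mem;
    cudaMalloc(&mem, count * sizeof(float));
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // 5 blocks, 256 threads/block, 0 B shared memory: the values the jobs
    // pass to set_num_blocks()/set_num_threads_per_block()/set_smem_size_per_block().
    busy_loop<<<5, 256, 0, stream>>>(mem, count, /*compute_count=*/100);
    cudaStreamSynchronize(stream);
    cudaFree(mem);
    cudaStreamDestroy(stream);
    return 0;
}
```

The grid-stride loop is what lets the jobs run with a fixed, scheduler-friendly 5-block grid while still touching all 5,000,000 elements; in the real jobs the launch configuration is read back through the get_num_blocks()-style getters so the scheduler can account for the resources each kernel occupies.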
-------------------------------------------------------------------------------- /jobs/helloworld/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_helloworld SHARED helloworld.cu $) 2 | set_target_properties(job_helloworld PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_helloworld llis_job_gpu llis_context) 4 | -------------------------------------------------------------------------------- /jobs/helloworld/helloworld.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/instrument.h> 4 | #include <llis/ipc/shm_primitive_channel.h> 5 | 6 | #include <cstdio> 7 | 8 | __global__ void helloworld(int i, llis::JobId job_id, llis::ipc::Gpu2SchedChannel gpu2sched_channel 9 | #ifdef LLIS_MEASURE_BLOCK_TIME 10 | , llis::ipc::Gpu2SchedChannel gpu2sched_block_time_channel 11 | #endif 12 | ) { 13 | #ifdef LLIS_MEASURE_BLOCK_TIME 14 | llis::job::BlockStartEndTime start_end_time; 15 | llis::job::kernel_start(job_id, &gpu2sched_channel, &start_end_time); 16 | #else 17 | llis::job::kernel_start(job_id, &gpu2sched_channel); 18 | #endif 19 | 20 | unsigned nsmid; 21 | asm("mov.u32 %0, %nsmid;" : "=r"(nsmid)); 22 | printf("Hello world %d %d\n", i, nsmid); 23 | 24 | #ifdef LLIS_MEASURE_BLOCK_TIME 25 | llis::job::kernel_end(job_id, &gpu2sched_channel, &gpu2sched_block_time_channel, &start_end_time); 26 | #else 27 | llis::job::kernel_end(job_id, &gpu2sched_channel); 28 | #endif 29 | } 30 | 31 | class HelloWorldJob : public llis::job::Job { 32 | public: 33 | size_t get_input_size() override { 34 | return 5; 35 | } 36 | 37 | size_t get_output_size() override { 38 | return 11; 39 | } 40 | 41 | size_t get_param_size() override { 42 | return 4; 43 | } 44 | 45 | void full_init(void* io_ptr) override { 46 | io_ptr_ = io_ptr; 47 | 48 | num_ = 0; 49 | 50 | set_num_blocks(1); 51 | set_num_threads_per_block(1); 52 | set_smem_size_per_block(0); 53 | set_num_registers_per_thread(32); 54 | } 55 | 56 | void init(void* io_ptr) override { 57 | io_ptr_ = io_ptr; 58 | 59 | num_ = 0; 60 | 61 | set_num_blocks(1); 62 | } 63 | 64 | void run_next() override { 65 | ++num_; 66 | 67 | helloworld<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(num_, get_id(), llis::job::Context::get_gpu2sched_channel()->fork() 68 | #ifdef LLIS_MEASURE_BLOCK_TIME 69 | , llis::job::Context::get_gpu2sched_block_time_channel()->fork() 70 | #endif 71 | ); 72 | 73 | set_num_blocks(num_ + 1); 74 | } 75 | 76 | bool has_next() const override { 77 | return num_ < 5; 78 | } 79 | 80 | private: 81 | void* io_ptr_; 82 | int num_ = 0; 83 | }; 84 | 85 | extern "C" { 86 | 87 | llis::job::Job* init_job() { 88 | return new HelloWorldJob(); 89 | } 90 | 91 | } 92 | 93 | -------------------------------------------------------------------------------- /jobs/helloworld_coroutine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_helloworld_coroutine SHARED helloworld_coroutine.cu $) 2 | set_target_properties(job_helloworld_coroutine PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_helloworld_coroutine llis_job_gpu llis_context) 4 | -------------------------------------------------------------------------------- /jobs/helloworld_coroutine/helloworld_coroutine.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/instrument.h> 4 | #include <llis/ipc/shm_primitive_channel.h> 5 | 6 | #include <cstdio> 7 | 8 | __global__ void helloworld(int i, llis::JobId job_id, llis::ipc::Gpu2SchedChannel gpu2sched_channel 9 | #ifdef
LLIS_MEASURE_BLOCK_TIME 10 | , llis::ipc::Gpu2SchedChannel gpu2sched_block_time_channel 11 | #endif 12 | ) { 13 | #ifdef LLIS_MEASURE_BLOCK_TIME 14 | llis::job::BlockStartEndTime start_end_time; 15 | llis::job::kernel_start(job_id, &gpu2sched_channel, &start_end_time); 16 | #else 17 | llis::job::kernel_start(job_id, &gpu2sched_channel); 18 | #endif 19 | 20 | unsigned nsmid; 21 | asm("mov.u32 %0, %nsmid;" : "=r"(nsmid)); 22 | printf("Hello world %d %d\n", i, nsmid); 23 | 24 | #ifdef LLIS_MEASURE_BLOCK_TIME 25 | llis::job::kernel_end(job_id, &gpu2sched_channel, &gpu2sched_block_time_channel, &start_end_time); 26 | #else 27 | llis::job::kernel_end(job_id, &gpu2sched_channel); 28 | #endif 29 | } 30 | 31 | class HelloWorldCoroutineJob : public llis::job::CoroutineJob { 32 | public: 33 | size_t get_input_size() override { 34 | return 5; 35 | } 36 | 37 | size_t get_output_size() override { 38 | return 11; 39 | } 40 | 41 | size_t get_param_size() override { 42 | return 4; 43 | } 44 | 45 | void one_time_init() override { 46 | set_num_threads_per_block(1); 47 | set_smem_size_per_block(0); 48 | set_num_registers_per_thread(32); 49 | } 50 | 51 | void body(void* io_ptr) override { 52 | io_ptr_ = io_ptr; 53 | 54 | for (int i = 1; i <= 5; ++i) { 55 | set_num_blocks(i); 56 | 57 | yield(); 58 | helloworld<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(i, get_id(), llis::job::Context::get_gpu2sched_channel()->fork() 59 | #ifdef LLIS_MEASURE_BLOCK_TIME 60 | , llis::job::Context::get_gpu2sched_block_time_channel()->fork() 61 | #endif 62 | ); 63 | } 64 | } 65 | 66 | private: 67 | void* io_ptr_; 68 | }; 69 | 70 | extern "C" { 71 | 72 | llis::job::Job* init_job() { 73 | return new HelloWorldCoroutineJob(); 74 | } 75 | 76 | } 77 | 78 | -------------------------------------------------------------------------------- /jobs/run_forever/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_run_forever SHARED run_forever.cu $) 2 | set_target_properties(job_run_forever PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_run_forever llis_job_gpu llis_context) 4 | 5 | -------------------------------------------------------------------------------- /jobs/run_forever/run_forever.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/instrument.h> 4 | #include <llis/ipc/shm_primitive_channel.h> 5 | 6 | #include <cstdio> 7 | 8 | __global__ void run(int n, llis::JobId job_id, llis::ipc::Gpu2SchedChannel gpu2sched_channel 9 | #ifdef LLIS_MEASURE_BLOCK_TIME 10 | , llis::ipc::Gpu2SchedChannel gpu2sched_block_time_channel 11 | #endif 12 | ) { 13 | #ifdef LLIS_MEASURE_BLOCK_TIME 14 | llis::job::BlockStartEndTime start_end_time; 15 | llis::job::kernel_start(job_id, &gpu2sched_channel, &start_end_time); 16 | #else 17 | llis::job::kernel_start(job_id, &gpu2sched_channel); 18 | #endif 19 | 20 | printf("run_forever %u\n", job_id); 21 | 22 | if (n < 5) { 23 | volatile int i; 24 | for (i = 0; i < n * 1000; ++i); 25 | } else { 26 | while (true); 27 | } 28 | 29 | #ifdef LLIS_MEASURE_BLOCK_TIME 30 | llis::job::kernel_end(job_id, &gpu2sched_channel, &gpu2sched_block_time_channel, &start_end_time); 31 | #else 32 | llis::job::kernel_end(job_id, &gpu2sched_channel); 33 | #endif 34 | } 35 | 36 | class RunForeverJob : public llis::job::Job { 37 | public: 38 | size_t get_input_size() override { 39 | return 5; 40 | } 41 | 42 | size_t get_output_size() override { 43 | return 11; 44 | } 45 | 46 | size_t get_param_size() override { 47 | return 4; 48 | } 49 | 50 | void full_init(void* io_ptr) override
{ 51 | io_ptr_ = io_ptr; 52 | 53 | num_ = 0; 54 | 55 | set_num_blocks(1); 56 | set_num_threads_per_block(1); 57 | set_smem_size_per_block(0); 58 | set_num_registers_per_thread(32); 59 | } 60 | 61 | void init(void* io_ptr) override { 62 | io_ptr_ = io_ptr; 63 | 64 | num_ = 0; 65 | 66 | set_num_blocks(1); 67 | } 68 | 69 | void run_next() override { 70 | ++num_; 71 | 72 | run<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(num_, get_id(), llis::job::Context::get_gpu2sched_channel()->fork() 73 | #ifdef LLIS_MEASURE_BLOCK_TIME 74 | , llis::job::Context::get_gpu2sched_block_time_channel()->fork() 75 | #endif 76 | ); 77 | 78 | set_num_blocks(num_ + 1); 79 | } 80 | 81 | bool has_next() const override { 82 | return num_ < 5; 83 | } 84 | 85 | private: 86 | void* io_ptr_; 87 | int num_ = 0; 88 | }; 89 | 90 | extern "C" { 91 | 92 | llis::job::Job* init_job() { 93 | return new RunForeverJob(); 94 | } 95 | 96 | } 97 | 98 | 99 | -------------------------------------------------------------------------------- /jobs/tvm_arcfaceresnet100/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_arcfaceresnet100 SHARED tvm_arcfaceresnet100.cpp) 2 | target_link_libraries(job_tvm_arcfaceresnet100 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_arcfaceresnet100 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_arcfaceresnet100/tvm_arcfaceresnet100.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMArcfaceresnet100 : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 112*112*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 512 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "arcfaceresnet100-8-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMArcfaceresnet100(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_densenet121/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_densenet121 SHARED
tvm_densenet121.cpp) 2 | target_link_libraries(job_tvm_densenet121 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_densenet121 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_densenet121/tvm_densenet121.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMDensenet121Job : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 224*224*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 1000 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "densenet-9-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMDensenet121Job(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_googlenet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_googlenet SHARED tvm_googlenet.cpp) 2 | target_link_libraries(job_tvm_googlenet tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_googlenet DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_googlenet/tvm_googlenet.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMGooglenetJob : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 224*224*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 1000 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "googlenet-9-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30
| tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMGooglenetJob(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_inception_v3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_inception_v3 SHARED tvm_inception_v3.cpp) 2 | target_link_libraries(job_tvm_inception_v3 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_inception_v3 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_inception_v3/tvm_inception_v3.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/utils/path.h> 3 | 4 | #include <tvm/runtime/module.h> 5 | #include <tvm/runtime/packed_func.h> 6 | 7 | #include <cuda_runtime.h> 8 | 9 | class TVMInceptionV3Job : public llis::job::CoroutineJob { 10 | public: 11 | size_t get_input_size() override { 12 | return 224*224*3 * sizeof(float); 13 | } 14 | 15 | size_t get_output_size() override { 16 | return 1000 * sizeof(float); 17 | } 18 | 19 | size_t get_param_size() override { 20 | return 4; 21 | } 22 | 23 | void one_time_init() override { 24 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 25 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "inception_v3-cuda_llis-pack.so")); 26 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 27 | run_ = gmod_.GetFunction("run"); 28 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 29 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 30 | input_dev = get_input("input_1"); 31 | output_dev = get_output(0); 32 | } 33 | 34 | void body(void* io_ptr) override { 35 | set_is_mem(); 36 | yield(); 37 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 38 | unset_is_mem(); 39 | 40 | run_(); 41 | 42 | set_is_mem(); 43 | set_pre_notify(); 44 | yield(); 45 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 46 | } 47 | 48 | private: 49 | DLDevice ctx_gpu_; 50 | tvm::runtime::Module mod_factory_; 51 | tvm::runtime::Module gmod_; 52 | tvm::runtime::PackedFunc run_; 53 | tvm::runtime::NDArray input_dev; 54 | tvm::runtime::NDArray output_dev; 55 | }; 56 | 57 | extern "C" { 58 | 59 | llis::job::Job* init_job() { 60 | return new TVMInceptionV3Job(); 61 | } 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /jobs/tvm_mnist/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_mnist SHARED
tvm_mnist.cpp) 2 | target_link_libraries(job_tvm_mnist tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_mnist DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_mnist/tvm_mnist.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMMnistJob : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 28*28 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 10 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "mnist-8-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input("Input3"); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMMnistJob(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_mobilenet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_mobilenet SHARED tvm_mobilenet.cpp) 2 | target_link_libraries(job_tvm_mobilenet tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_mobilenet DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_mobilenet/tvm_mobilenet.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMMobilenetJob : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 224*224*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 1000 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "mobilenetv2-7-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output =
gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), input_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMMobilenetJob(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_resnet18/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_resnet18 SHARED tvm_resnet18.cpp) 2 | target_link_libraries(job_tvm_resnet18 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_resnet18 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_resnet18/tvm_resnet18.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | class TVMResnet18Job : public llis::job::CoroutineJob { 10 | public: 11 | size_t get_input_size() override { 12 | return 224*224*3 * sizeof(float); 13 | } 14 | 15 | size_t get_output_size() override { 16 | return 1000 * sizeof(float); 17 | } 18 | 19 | size_t get_param_size() override { 20 | return 4; 21 | } 22 | 23 | void one_time_init() override { 24 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 25 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "resnet18-v2-7-cuda_llis-pack.so")); 26 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 27 | run_ = gmod_.GetFunction("run"); 28 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 29 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 30 | input_dev = get_input(0); 31 | output_dev = get_output(0); 32 | } 33 | 34 | void body(void* io_ptr) override { 35 | set_is_mem(); 36 | yield(); 37 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 38 | unset_is_mem(); 39 | 40 | run_(); 41 | 42 | set_is_mem(); 43 | set_pre_notify(); 44 | yield(); 45 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), input_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 46 | } 47 | 48 | private: 49 | DLDevice ctx_gpu_; 50 | tvm::runtime::Module mod_factory_; 51 | tvm::runtime::Module gmod_; 52 | tvm::runtime::PackedFunc run_; 53 | tvm::runtime::NDArray input_dev; 54 | tvm::runtime::NDArray output_dev; 55 | }; 56 | 57 | extern "C" { 58 | 59 | llis::job::Job* init_job() { 60 | return new TVMResnet18Job(); 61 | } 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /jobs/tvm_resnet34/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_resnet34 SHARED tvm_resnet34.cpp) 2 | target_link_libraries(job_tvm_resnet34 tvm::tvm_runtime 
llis_job llis_context) 3 | install(TARGETS job_tvm_resnet34 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_resnet34/tvm_resnet34.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/utils/path.h> 3 | 4 | #include <tvm/runtime/module.h> 5 | #include <tvm/runtime/packed_func.h> 6 | 7 | #include <cuda_runtime.h> 8 | 9 | class TVMResnet34Job : public llis::job::CoroutineJob { 10 | public: 11 | size_t get_input_size() override { 12 | return 224*224*3 * sizeof(float); 13 | } 14 | 15 | size_t get_output_size() override { 16 | return 1000 * sizeof(float); 17 | } 18 | 19 | size_t get_param_size() override { 20 | return 4; 21 | } 22 | 23 | void one_time_init() override { 24 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 25 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "resnet34-v2-7-cuda_llis-pack.so")); 26 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 27 | run_ = gmod_.GetFunction("run"); 28 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 29 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 30 | input_dev = get_input(0); 31 | output_dev = get_output(0); 32 | } 33 | 34 | void body(void* io_ptr) override { 35 | set_is_mem(); 36 | yield(); 37 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 38 | unset_is_mem(); 39 | 40 | run_(); 41 | 42 | set_is_mem(); 43 | set_pre_notify(); 44 | yield(); 45 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 46 | } 47 | 48 | private: 49 | DLDevice ctx_gpu_; 50 | tvm::runtime::Module mod_factory_; 51 | tvm::runtime::Module gmod_; 52 | tvm::runtime::PackedFunc run_; 53 | tvm::runtime::NDArray input_dev; 54 | tvm::runtime::NDArray output_dev; 55 | }; 56 | 57 | extern "C" { 58 | 59 | llis::job::Job* init_job() { 60 | return new TVMResnet34Job(); 61 | } 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /jobs/tvm_resnet50/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_resnet50 SHARED tvm_resnet50.cpp) 2 | target_link_libraries(job_tvm_resnet50 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_resnet50 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_resnet50/tvm_resnet50.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMResnet50 : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 224*224*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 1000 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "resnet50-v2-7-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 |
output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMResnet50(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_squeezenet1_1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_squeezenet1_1 SHARED tvm_squeezenet1_1.cpp) 2 | target_link_libraries(job_tvm_squeezenet1_1 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_squeezenet1_1 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_squeezenet1_1/tvm_squeezenet1_1.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/utils/path.h> 3 | 4 | #include <tvm/runtime/module.h> 5 | #include <tvm/runtime/packed_func.h> 6 | 7 | #include <cuda_runtime.h> 8 | 9 | class TVMSqueezeNet11Job : public llis::job::CoroutineJob { 10 | public: 11 | size_t get_input_size() override { 12 | return 224*224*3 * sizeof(float); 13 | } 14 | 15 | size_t get_output_size() override { 16 | return 1000 * sizeof(float); 17 | } 18 | 19 | size_t get_param_size() override { 20 | return 4; 21 | } 22 | 23 | void one_time_init() override { 24 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 25 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "squeezenet1.1-7-cuda_llis-pack.so")); 26 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 27 | run_ = gmod_.GetFunction("run"); 28 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 29 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 30 | input_dev = get_input(0); 31 | output_dev = get_output(0); 32 | } 33 | 34 | void body(void* io_ptr) override { 35 | set_is_mem(); 36 | yield(); 37 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 38 | unset_is_mem(); 39 | 40 | run_(); 41 | 42 | set_is_mem(); 43 | set_pre_notify(); 44 | yield(); 45 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 46 | } 47 | 48 | private: 49 | DLDevice ctx_gpu_; 50 | tvm::runtime::Module mod_factory_; 51 | tvm::runtime::Module gmod_; 52 | tvm::runtime::PackedFunc run_; 53 | tvm::runtime::NDArray input_dev; 54 | tvm::runtime::NDArray output_dev; 55 | }; 56 | 57 | extern "C" { 58 | 59 | llis::job::Job* init_job() { 60 | return new TVMSqueezeNet11Job(); 61 | } 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /jobs/tvm_ultraface320/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_ultraface320 SHARED tvm_ultraface320.cpp) 2 | target_link_libraries(job_tvm_ultraface320 tvm::tvm_runtime llis_job
llis_context) 3 | install(TARGETS job_tvm_ultraface320 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_ultraface320/tvm_ultraface320.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMUltraface320Job : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 240*320*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 4420*2 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "ultraface320-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMUltraface320Job(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/vec_add_coroutine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_vec_add_coroutine SHARED vec_add_coroutine.cu $) 2 | set_target_properties(job_vec_add_coroutine PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_vec_add_coroutine llis_job_gpu llis_context) 4 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig11_cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | install_path=/bigdisk/opt/llis 4 | models_dir=/bigdisk/models/cuda 5 | res_dir=/bigdisk/results-cuda 6 | 7 | while getopts 'p:m:o:' opt; do 8 | case "$opt" in 9 | p) 10 | install_path="$OPTARG" 11 | ;; 12 | 13 | m) 14 | models_dir="$OPTARG" 15 | ;; 16 | 17 | o) 18 | res_dir="$OPTARG" 19 | ;; 20 | 21 | ?|h) 22 | echo "Usage: $(basename $0) [-p install_path] [-m model_dir] [-o output_dir]" 23 | exit 1 24 | ;; 25 | esac 26 | done 27 | shift "$(($OPTIND -1))" 28 | 29 | export CUDA_DEVICE_MAX_CONNECTIONS=32 30 | 31 | mkdir -p "${res_dir}" 32 | 33 | for ln_sigma in {1.5,2}; do 34 | for sched in {CUDA-SS,CUDA-MS}; do 35 | case $sched in 36 | CUDA-SS) 37 | num_streams=1 38 | suffix=_cudass 39 | ;; 40 | CUDA-MS) 41 | num_streams=141 42 | suffix=_cudams 43 | ;; 44 | esac 45 | 46 | for i in
{1000,1053,1111,1176,1250,1333,1429,1538,1667,1818,2000,2222,2500,2857,3333,4000,5000,6667,10000,20000,40000,80000,160000}; do 47 | "${install_path}"/bin/tvm_direct_multistream \ 48 | --iat $i \ 49 | --ln_sigma $ln_sigma \ 50 | --start_record_num 0 \ 51 | --seed 1 \ 52 | --prefix "${res_dir}/all_prop${suffix}" \ 53 | --iat_n \ 54 | --iat_g \ 55 | --ln_sigma_n \ 56 | --num_jobs 3000 \ 57 | --concurrency $num_streams \ 58 | "${models_dir}/mobilenetv2-7-cuda-pack.so" 0.257 36 \ 59 | "${models_dir}/densenet-9-cuda-pack.so" 0.0706 10 \ 60 | "${models_dir}/googlenet-9-cuda-pack.so" 0.0546 8 \ 61 | "${models_dir}/inception_v3-cuda-pack.so" 0.0138 2 \ 62 | "${models_dir}/resnet18-v2-7-cuda-pack.so" 0.272 38 \ 63 | "${models_dir}/resnet34-v2-7-cuda-pack.so" 0.168 24 \ 64 | "${models_dir}/resnet50-v2-7-cuda-pack.so" 0.0745 10 \ 65 | "${models_dir}/squeezenet1.1-7-cuda-pack.so" 0.0894999999999999 13 66 | done 67 | done 68 | done 69 | 70 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig11_triton.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | client_path=/bigdisk/src/triton-client-llis 4 | res_dir=/bigdisk/results-triton/ 5 | 6 | while getopts 'p:o:' opt; do 7 | case "$opt" in 8 | p) 9 | client_path="$OPTARG" 10 | ;; 11 | 12 | o) 13 | res_dir="$OPTARG" 14 | ;; 15 | 16 | ?|h) 17 | echo "Usage: $(basename $0) [-p client_path] [-o output_dir]" 18 | exit 1 19 | ;; 20 | esac 21 | done 22 | shift "$(($OPTIND -1))" 23 | 24 | mkdir -p $res_dir/1.5 25 | mkdir -p $res_dir/2 26 | 27 | $client_path/run.py -b $client_path/build/cc-clients/examples/grpc_async_infer_client_mixed -o $res_dir/ 10 50 10 $client_path/schedules/newmix3_sops23.yaml 28 | 29 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig12_cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | install_path=/bigdisk/opt/llis 4 | models_dir=/bigdisk/models/cuda 5 | res_dir=/bigdisk/results-cuda 6 | 7 | while getopts 'p:m:o:' opt; do 8 | case "$opt" in 9 | p) 10 | install_path="$OPTARG" 11 | ;; 12 | 13 | m) 14 | models_dir="$OPTARG" 15 | ;; 16 | 17 | o) 18 | res_dir="$OPTARG" 19 | ;; 20 | 21 | ?|h) 22 | echo "Usage: $(basename $0) [-p install_path] [-m model_dir] [-o output_dir]" 23 | exit 1 24 | ;; 25 | esac 26 | done 27 | shift "$(($OPTIND -1))" 28 | 29 | export CUDA_DEVICE_MAX_CONNECTIONS=32 30 | 31 | mkdir -p "${res_dir}" 32 | 33 | for ln_sigma in {1.5,2}; do 34 | for sched in {CUDA-SS,CUDA-MS}; do 35 | case $sched in 36 | CUDA-SS) 37 | num_streams=1 38 | suffix=_cudass 39 | ;; 40 | CUDA-MS) 41 | num_streams=125 42 | suffix=_cudams 43 | ;; 44 | esac 45 | 46 | for i in {2000,2222,2500,2857,3333,4000,5000,6667,10000,20000,40000}; do 47 | "${install_path}"/bin/tvm_direct_multistream \ 48 | --iat $i \ 49 | --ln_sigma $ln_sigma \ 50 | --start_record_num 0 \ 51 | --seed 1 \ 52 | --prefix "${res_dir}/resnet18_inception_v3_prop${suffix}" \ 53 | --iat_n \ 54 | --iat_g \ 55 | --ln_sigma_n \ 56 | --num_jobs 3000 \ 57 | --concurrency $num_streams \ 58 | "${models_dir}/resnet18-v2-7-cuda-pack.so" 0.952 119 \ 59 | "${models_dir}/inception_v3-cuda-pack.so" 0.048 6 60 | done 61 | done 62 | done 63 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig12_paella.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 
install_path=/bigdisk/opt/llis 4 | export LLIS_MODELS_DIR=/bigdisk/models/cuda_llis 5 | res_dir=/bigdisk/results 6 | 7 | while getopts 'p:m:o:' opt; do 8 | case "$opt" in 9 | p) 10 | install_path="$OPTARG" 11 | ;; 12 | 13 | m) 14 | export LLIS_MODELS_DIR="$OPTARG" 15 | ;; 16 | 17 | o) 18 | res_dir="$OPTARG" 19 | ;; 20 | 21 | ?|h) 22 | echo "Usage: $(basename $0) [-p install_path] [-m model_dir] [-o output_dir]" 23 | exit 1 24 | ;; 25 | esac 26 | done 27 | shift "$(($OPTIND -1))" 28 | 29 | export CUDA_DEVICE_MAX_CONNECTIONS=32 30 | 31 | mkdir -p "${res_dir}" 32 | 33 | SERVER_PID=0 34 | 35 | trap 'kill $SERVER_PID; exit' INT 36 | 37 | for ln_sigma in {1.5,2}; do 38 | for sched in {SS,MS-jbj,MS-kbk,Full}; do 39 | case $sched in 40 | SS) 41 | sched=fifo 42 | num_streams=1 43 | suffix=_ss 44 | ;; 45 | MS-jbj) 46 | sched=fifo 47 | num_streams=500 48 | suffix=_msjbj 49 | ;; 50 | MS-kbk) 51 | sched=fifo2 52 | num_streams=500 53 | suffix=_mskbk 54 | ;; 55 | Full) 56 | sched=full3 57 | num_streams=500 58 | suffix=_full 59 | ;; 60 | esac 61 | 62 | for i in {2222,2500,2857,3333,4000,5000,6667,10000,20000,40000}; do 63 | taskset -c 4 "${install_path}"/bin/llis_server \ 64 | --name server \ 65 | --sched $sched \ 66 | --num_streams $num_streams & 67 | SERVER_PID=$! 68 | sleep 5 69 | 70 | "${install_path}"/bin/llis_app_client \ 71 | --server_name server \ 72 | --iat $i \ 73 | --ln_sigma $ln_sigma \ 74 | --start_record_num 0 \ 75 | --seed 1 \ 76 | --prefix "${res_dir}/resnet18_inception_v3_prop${suffix}" \ 77 | --fairness 1000000 \ 78 | --iat_n \ 79 | --iat_g \ 80 | --ln_sigma_n \ 81 | --num_jobs 3000 \ 82 | --concurrency 125 \ 83 | "${install_path}/lib/llis_jobs/libjob_tvm_resnet18.so" 0.952 119 \ 84 | "${install_path}/lib/llis_jobs/libjob_tvm_inception_v3.so" 0.048 6 85 | wait 86 | done 87 | done 88 | done 89 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | install_path=/bigdisk/opt/llis 4 | export LLIS_MODELS_DIR=/bigdisk/models/cuda_llis 5 | res_dir=/bigdisk/results 6 | 7 | while getopts 'p:m:o:' opt; do 8 | case "$opt" in 9 | p) 10 | install_path="$OPTARG" 11 | ;; 12 | 13 | m) 14 | export LLIS_MODELS_DIR="$OPTARG" 15 | ;; 16 | 17 | o) 18 | res_dir="$OPTARG" 19 | ;; 20 | 21 | ?|h) 22 | echo "Usage: $(basename $0) [-p install_path] [-m model_dir] [-o output_dir]" 23 | exit 1 24 | ;; 25 | esac 26 | done 27 | shift "$(($OPTIND -1))" 28 | 29 | export CUDA_DEVICE_MAX_CONNECTIONS=32 30 | 31 | mkdir -p "${res_dir}" 32 | 33 | SERVER_PID=0 34 | 35 | trap 'kill $SERVER_PID; exit' INT 36 | 37 | for f in {0.01,0.1,1,10,20,30,40,50,60,70,80,90,100,200,300,400,500,600,700,800,900,1000,10000,100000}; do 38 | taskset -c 4 "${install_path}"/bin/llis_server \ 39 | --name server \ 40 | --sched full3 \ 41 | --unfair $f \ 42 | --num_streams 500 & 43 | SERVER_PID=$!
44 | sleep 5 45 | 46 | "${install_path}"/bin/llis_app_client \ 47 | --server_name server \ 48 | --iat 0 \ 49 | --ln_sigma 2 \ 50 | --start_record_num 0 \ 51 | --seed 1 \ 52 | --prefix "${res_dir}/resnet18_inception_v3_prop_full_fairness" \ 53 | --fairness $f \ 54 | --fairness_n \ 55 | --fairness_g \ 56 | --ln_sigma_n \ 57 | --num_jobs 3000 \ 58 | --concurrency 125 \ 59 | "${install_path}/lib/llis_jobs/libjob_tvm_resnet18.so" 0.952 119 \ 60 | "${install_path}/lib/llis_jobs/libjob_tvm_inception_v3.so" 0.048 6 61 | wait 62 | done 63 | 64 | -------------------------------------------------------------------------------- /sosp23_artifact/plot_fig13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "$(dirname "$0")" 4 | script_path=`pwd` 5 | cd - 6 | 7 | paella_res_dir=/bigdisk/results 8 | output_dir=/bigdisk/graphs 9 | 10 | while getopts 'p:o:' opt; do 11 | case "$opt" in 12 | p) 13 | paella_res_dir="$OPTARG" 14 | ;; 15 | 16 | o) 17 | output_dir="$OPTARG" 18 | ;; 19 | 20 | ?|h) 21 | echo "Usage: $(basename $0) [-p paella_res_dir] [-o output_dir]" 22 | exit 1 23 | ;; 24 | esac 25 | done 26 | shift "$(($OPTIND -1))" 27 | 28 | mkdir -p ${output_dir} 29 | 30 | python3 $script_path/tools/plot_latency_fairness_threshold.py \ 31 | -o $output_dir/fig13.pdf \ 32 | -i $paella_res_dir/resnet18_inception_v3_prop_full_fairness_lns2.txt \ 33 | -a 'Paella' \ 34 | -l 1 -n ResNet-18 \ 35 | -l 2 -n InceptionV3 \ 36 | --yaxis Mean 37 | 38 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | *Note that you should skip this part if you are using the machine we provided, as it already has the environment set up.* 4 | 5 | The following scripts install everything under the `/bigdisk` directory by default. If you want to change the installation prefix, use the `-p` argument (e.g., `./install_dependencies.sh -p /my_dir`). 6 | 7 | First, please download the ONNX models from [here](https://drive.google.com/file/d/1AAI1lwGBT6CnLx_q24z8vhqKh1g8mT3g/view?usp=sharing) and extract them to `/models` under the prefix (`/bigdisk/models` by default). The setup scripts below depend on these ONNX models. 8 | 9 | Then, `cd` into the `setup/` directory if you have not done so yet. 10 | 11 | 1. Run `./install_dependencies.sh` to set up the environment. 12 | 13 | 1. Run `./install_llis_tvm.sh`, which installs Paella and the custom TVM modified for Paella, and also compiles the models with TVM. 14 | 15 | 1. Run `./build_triton_docker.sh` to build the Docker image with the Triton server. 16 | 17 | 1. Run `./install_triton_client.sh` to install the Triton client. 18 | 19 | *After running all scripts, either `source ~/.bash_profile` or log out and then log back in to ensure that the environment variables are set.* 20 | 21 | ## Reset 22 | 23 | We have already done the setup process on the machine we provided. However, if you want to start from scratch, run `./reset_all.sh`. 24 | 25 | If you only want to reinstall Paella and the custom TVM from scratch, while keeping other dependencies untouched, run `./reset_llis_tvm.sh` and then run `./install_llis_tvm.sh` again.
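For example, assuming `install_llis_tvm.sh` accepts the same `-p` prefix flag as the other setup scripts: `./reset_llis_tvm.sh -p /my_dir && ./install_llis_tvm.sh -p /my_dir`.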
26 | 27 | *Either `source ~/.bash_profile` or logout and then log back in to ensure that the environment variables are set.* 28 | 29 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/build_triton_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/bigdisk 4 | 5 | while getopts 'p:' opt; do 6 | case "$opt" in 7 | p) 8 | PREFIX="$OPTARG" 9 | ;; 10 | 11 | ?|h) 12 | echo "Usage: $(basename $0) [-p PREFIX]" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | source /etc/profile 19 | source ~/.bash_profile 20 | 21 | cd "$(dirname "$0")" 22 | abs_path="`pwd`" 23 | 24 | # Get TVM source 25 | 26 | cd "${PREFIX}/src" 27 | 28 | git clone --recursive https://github.com/eniac/tvm-llis.git tvm-tf 29 | cd tvm-tf 30 | git checkout v0.10.0-llis 31 | git submodule update --recursive 32 | 33 | # Compile TVM for TF and Convert TVM models to TF models 34 | 35 | sudo docker run --gpus=1 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ 36 | -v${abs_path}/..:/workspace/sosp23_artifact \ 37 | -v"${PREFIX}"/models:/workspace/models \ 38 | -v"${PREFIX}"/src/tvm-tf:/workspace/src/tvm-tf \ 39 | nvcr.io/nvidia/tensorflow:23.03-tf2-py3 \ 40 | /workspace/sosp23_artifact/setup/triton_docker/run_on_tf_docker.sh # Use convert_tvm_to_tf.sh instead if building TVM is not necessary 41 | 42 | sudo rsync -a ${abs_path}/../tf_models_config/ ${PREFIX}/models/tensorflow/ 43 | 44 | # Build docker image 45 | 46 | cd ${abs_path}/triton_docker 47 | 48 | mkdir -p models 49 | sudo mount --bind ${PREFIX}/models models 50 | 51 | mkdir -p sosp23_artifact 52 | sudo mount --bind ../../../sosp23_artifact sosp23_artifact 53 | 54 | mkdir -p tvm-tf 55 | sudo mount --bind ${PREFIX}/src/tvm-tf tvm-tf 56 | 57 | sudo docker build -t triton_server_tvm . 58 | 59 | sudo umount models 60 | sudo umount sosp23_artifact 61 | sudo umount tvm-tf 62 | 63 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/install_triton_client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/bigdisk 4 | 5 | while getopts 'p:' opt; do 6 | case "$opt" in 7 | p) 8 | PREFIX="$OPTARG" 9 | ;; 10 | 11 | ?|h) 12 | echo "Usage: $(basename $0) [-p PREFIX]" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | source /etc/profile 19 | source ~/.bash_profile 20 | 21 | sudo chown $USER "${PREFIX}" 22 | 23 | mkdir -p "${PREFIX}/src" 24 | 25 | cd "${PREFIX}/src" 26 | 27 | sudo apt-get install -y curl libcurl4-openssl-dev libb64-dev libssl-dev zlib1g-dev rapidjson-dev libopencv-dev libyaml-cpp-dev 28 | pip install PyYAML 29 | 30 | git clone https://github.com/maxdml/triton-client.git triton-client-llis 31 | cd triton-client-llis 32 | git checkout sosp23_artifact 33 | mkdir build 34 | cd build 35 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=`pwd`/install -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PERF_ANALYZER=OFF -DTRITON_ENABLE_GPU=ON -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON .. 
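# The TRITON_ENABLE_CC_GRPC/TRITON_ENABLE_EXAMPLES flags above are what produce the
# build/cc-clients/examples/grpc_async_infer_client_mixed binary that
# gen_data_fig11_triton.sh drives.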
36 | make -j40 37 | 38 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/onnx2tvm.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import numpy as np 3 | import tvm 4 | from tvm import te 5 | import tvm.relay as relay 6 | from tvm.contrib.download import download_testdata 7 | import sys 8 | 9 | model_path = sys.argv[1] # path to the onnx model 10 | output_path = sys.argv[2] # path to output the tvm-compiled model 11 | target = sys.argv[3] # cuda or cuda -llis_flag=[1|3|5] 12 | input_dims = tuple(int(x) for x in sys.argv[4:]) # e.g., `1 3 224 224` for mobilenet 13 | 14 | onnx_model = onnx.load(model_path) 15 | 16 | input_names_all = [node.name for node in onnx_model.graph.input] 17 | input_initializer = [node.name for node in onnx_model.graph.initializer] 18 | input_names = list(set(input_names_all) - set(input_initializer)) 19 | 20 | input_name = input_names[0] 21 | 22 | shape_dict = {input_name: input_dims} 23 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) 24 | 25 | opt_level = 3 26 | with tvm.transform.PassContext(opt_level=opt_level): 27 | lib = relay.build(mod, target, params=params) 28 | 29 | lib.export_library(output_path) 30 | 31 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/onnx2tvm_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ONNX_DIR=$1 4 | DEST_DIR=$2 5 | TYPE=$3 6 | 7 | if [[ $TYPE == "cuda_llis" ]]; then 8 | TYPE2='cuda -llis_flag=3' 9 | else 10 | TYPE2='cuda' 11 | fi 12 | 13 | mkdir -p $DEST_DIR 14 | 15 | python3 onnx2tvm.py ${ONNX_DIR}/mnist-8.onnx ${DEST_DIR}/mnist-8-${TYPE}-pack.so "${TYPE2}" 1 1 28 28 16 | python3 onnx2tvm.py ${ONNX_DIR}/densenet-9.onnx ${DEST_DIR}/densenet-9-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 17 | python3 onnx2tvm.py ${ONNX_DIR}/googlenet-9.onnx ${DEST_DIR}/googlenet-9-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 18 | python3 onnx2tvm.py ${ONNX_DIR}/mobilenetv2-7.onnx ${DEST_DIR}/mobilenetv2-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 19 | python3 onnx2tvm.py ${ONNX_DIR}/resnet18-v2-7.onnx ${DEST_DIR}/resnet18-v2-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 20 | python3 onnx2tvm.py ${ONNX_DIR}/resnet34-v2-7.onnx ${DEST_DIR}/resnet34-v2-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 21 | python3 onnx2tvm.py ${ONNX_DIR}/resnet50-v2-7.onnx ${DEST_DIR}/resnet50-v2-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 22 | python3 onnx2tvm.py ${ONNX_DIR}/squeezenet1.1-7.onnx ${DEST_DIR}/squeezenet1.1-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 23 | python3 onnx2tvm.py ${ONNX_DIR}/inception_v3.onnx ${DEST_DIR}/inception_v3-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 24 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/reset_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/bigdisk 4 | 5 | while getopts 'p:' opt; do 6 | case "$opt" in 7 | p) 8 | PREFIX="$OPTARG" 9 | ;; 10 | 11 | ?|h) 12 | echo "Usage: $(basename $0) [-p PREFIX]" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | rm -rf ${PREFIX}/src 19 | rm -rf ${PREFIX}/opt 20 | rm -rf ${PREFIX}/results 21 | rm -rf ${PREFIX}/results-triton 22 | rm -rf ${PREFIX}/results-cuda 23 | rm -rf ${PREFIX}/results-mps 24 | rm -rf ${PREFIX}/graphs 25 | rm -rf ${PREFIX}/models/cuda 26 | rm -rf ${PREFIX}/models/cuda_llis 27 | rm -rf ${PREFIX}/models/tensorflow 28 | rm 
~/.bash_profile 29 | 30 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/reset_llis_tvm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/bigdisk 4 | 5 | while getopts 'p:' opt; do 6 | case "$opt" in 7 | p) 8 | PREFIX="$OPTARG" 9 | ;; 10 | 11 | ?|h) 12 | echo "Usage: $(basename $0) [-p PREFIX]" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | rm -rf ${PREFIX}/src/llis 19 | rm -rf ${PREFIX}/src/tvm-llis 20 | 21 | rm -rf ${PREFIX}/opt/llis 22 | rm -rf ${PREFIX}/opt/tvm-llis 23 | 24 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/tritonserver:23.03-py3 2 | 3 | COPY sosp23_artifact /workspace/sosp23_artifact 4 | COPY models/cuda /workspace/models/cuda 5 | COPY models/tensorflow /workspace/models/tensorflow 6 | COPY tvm-tf /workspace/src/tvm-tf 7 | 8 | RUN /workspace/sosp23_artifact/setup/triton_docker/setup.sh 9 | 10 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/build_tvm_tf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | apt update 4 | apt install -y cmake clang 5 | 6 | cd /workspace/src/tvm-tf 7 | 8 | mkdir build 9 | cd build 10 | cp ../cmake/config.cmake . 11 | sed -i 's/set(USE_CUDA OFF)/set(USE_CUDA ON)/' config.cmake 12 | sed -i 's/set(USE_LLVM OFF)/set(USE_LLVM ON)/' config.cmake 13 | sed -i 's/set(USE_TF_TVMDSOOP OFF)/set(USE_TF_TVMDSOOP ON)/' config.cmake 14 | cmake .. 15 | make -j40 16 | 17 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/convert_tvm_to_tf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export TVM_HOME=/workspace/src/tvm-tf 4 | export PYTHONPATH=${TVM_HOME}/python:${PYTHONPATH} 5 | export LD_LIBRARY_PATH=${TVM_HOME}/build:$LD_LIBRARY_PATH 6 | 7 | apt update 8 | apt install -y cmake clang 9 | 10 | cd "$(dirname "$0")" 11 | ../dso_to_tf.py /workspace/models/cuda /workspace/models/tensorflow 12 | 13 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/run_on_tf_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /workspace/sosp23_artifact/setup/triton_docker/build_tvm_tf.sh 4 | /workspace/sosp23_artifact/setup/triton_docker/convert_tvm_to_tf.sh 5 | 6 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/workspace 4 | 5 | apt update 6 | apt install -y clang 7 | 8 | echo "export LD_LIBRARY_PATH=${PREFIX}/src/tvm-tf/build:\$LD_LIBRARY_PATH" | tee -a ~/.bashrc 9 | 10 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/densenet-9/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "densenet-9" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 
13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000, 26 | 1, 27 | 1 28 | ] 29 | label_filename: "" 30 | is_shape_tensor: false 31 | } 32 | ] 33 | instance_group: [ 34 | { 35 | name: "densenet-9" 36 | kind: KIND_GPU 37 | count: 10 38 | gpus: [ 39 | 0 40 | ] 41 | profile: [] 42 | } 43 | ] 44 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/googlenet-9/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "googlenet-9" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "googlenet-9" 34 | kind: KIND_GPU 35 | count: 8 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/inception_v3/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "inception_v3" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "inception_v3" 34 | kind: KIND_GPU 35 | count: 2, 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/mobilenetv2-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "mobilenetv2-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "mobilenetv2-7" 34 | kind: KIND_GPU 35 | count: 36, 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/resnet18-v2-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "resnet18-v2-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: 
"output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "resnet18-v2-7" 34 | kind: KIND_GPU 35 | count: 38 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/resnet34-v2-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "resnet34-v2-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "resnet34-v2-7" 34 | kind: KIND_GPU 35 | count: 24 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/resnet50-v2-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "resnet50-v2-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "resnet50-v2-7" 34 | kind: KIND_GPU 35 | count: 10 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/squeezenet1.1-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "squeezenet1.1-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "squeezenet1.1-7" 34 | kind: KIND_GPU 35 | count: 13 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tools/merge_mps_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | def print_latency_stats(f, latencies): 5 | mean = np.mean(latencies); 6 | 7 | sd = np.std(latencies, ddof=1) 8 | 9 | p50 = latencies[int(len(latencies) / 2)]; 10 | p90 = latencies[int(len(latencies) * 0.90)]; 11 | p95 = latencies[int(len(latencies) * 0.95)]; 12 | p99 = latencies[int(len(latencies) * 0.99)]; 13 | max_ele = np.max(latencies) 14 | 15 | f.write(',{},{},{},{},{},{},{}'.format(mean, p50, p90, p95, p99, max_ele, 
sd)); 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | 20 | parser.add_argument('-p', '--prefix', dest='prefix'); 21 | parser.add_argument('-i', '--iat', dest='iat', type=int); 22 | parser.add_argument('-n', '--num_jobs', dest='num_jobs', type=int); 23 | 24 | args = parser.parse_args() 25 | 26 | latencies_per_job = [] 27 | aggs = [] 28 | for job_id in range(args.num_jobs): 29 | latencies_per_job.append(np.sort(np.loadtxt('{}_job{}_iat{}_raw.txt'.format(args.prefix, job_id, args.iat)))) 30 | aggs.append(np.loadtxt('{}_job{}.txt'.format(args.prefix, job_id), delimiter=',', ndmin=2)) 31 | latencies_all = np.sort(np.concatenate(latencies_per_job)) 32 | 33 | time_elasped = 0 34 | num_job_instances = 0 35 | for agg in aggs: 36 | for row in range(agg.shape[0]): 37 | if (agg[row, 0] == args.iat): 38 | time_elasped = max(time_elasped, agg[row, 2]) 39 | num_job_instances = int(agg[row, 1]) 40 | break 41 | 42 | with open('{}.txt'.format(args.prefix), 'a') as f: 43 | f.write('{},{},{}'.format(args.iat, num_job_instances, time_elasped)) 44 | print_latency_stats(f, latencies_all) 45 | for latencies in latencies_per_job: 46 | print_latency_stats(f, latencies) 47 | f.write('\n') 48 | 49 | -------------------------------------------------------------------------------- /sosp23_artifact/tools/parse_input_kelvin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def parse_input_kelvin(path, xaxis_name, yaxis_names, model_ids): 4 | yaxis_name2offset = {'Mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 5 | 6 | data = np.genfromtxt(path, delimiter=',') 7 | 8 | if xaxis_name == 'throughput': 9 | x_data = data[:, 1] / data[:, 2] * 1000000 10 | else: # sending rate 11 | x_data = 1000000. / data[:, 0] 12 | 13 | y_data = [[data[:, model_id * 7 + yaxis_name2offset[yaxis_name] + 3] / 1000. 
for yaxis_name in yaxis_names] for model_id in model_ids] 14 | 15 | return x_data, y_data 16 | -------------------------------------------------------------------------------- /sosp23_artifact/tools/plot_latency_fairness_threshold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import sys 6 | import argparse 7 | 8 | fmts = ['X-', 'o-', '^-', 's-', 'D-', 'v-', 'p-', '*-', 'H-'] 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('-o', '--output_path', dest='output_path'); 14 | parser.add_argument('-i', '--input_path', dest='input_paths', action='append'); 15 | parser.add_argument('-a', '--algo_name', dest='algo_names', action='append'); 16 | parser.add_argument('-l', '--line', dest='lines', type=int, action='append'); 17 | parser.add_argument('-n', '--name', dest='names', action='append'); 18 | parser.add_argument('--yaxis', dest='yaxises', choices=['Mean', 'p50', 'p90', 'p95', 'p99'], action='append', type=str); 19 | parser.add_argument('-x', '--xlim', dest='xlim', type=float); 20 | parser.add_argument('-y', '--ylim', dest='ylim', type=float); 21 | 22 | args = parser.parse_args() 23 | 24 | yaxis_name2offset = {'Mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 25 | 26 | num_inputs = len(args.input_paths) 27 | 28 | data = [np.genfromtxt(path, delimiter=',') for path in args.input_paths] 29 | 30 | #plt.rcParams.update({'font.size': 6}) 31 | plt.figure(figsize=(6.4, 3.9552)) 32 | 33 | for i in range(num_inputs): 34 | for line, name, fmt in zip(args.lines, args.names, fmts): 35 | for yaxis in args.yaxises: 36 | plt.plot(data[i][:, 0], data[i][:, line * 7 + yaxis_name2offset[yaxis] + 3] / 1000., fmt, label=name, linewidth=1, markersize=2) 37 | 38 | plt.gca().invert_xaxis() 39 | plt.gca().set_xlim(500, 0) 40 | #plt.xlim(500, 0) 41 | plt.xlabel('Less Fair <- Fairness Threshold -> More Fair') 42 | if len(args.yaxises) == 1: 43 | plt.ylabel(args.yaxises[0] + ' Latency (ms)') 44 | else: 45 | plt.ylabel('Latency (ms)') 46 | 47 | plt.legend() 48 | #if args.xlim is not None: 49 | # plt.xlim(0, args.xlim) 50 | #if args.ylim is not None: 51 | # plt.ylim(0, args.ylim) 52 | 53 | plt.savefig(args.output_path, bbox_inches='tight') 54 | 55 | -------------------------------------------------------------------------------- /sosp23_artifact/triton_server_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo docker run -it --gpus=1 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p8000:8000 -p8001:8001 -p8002:8002 triton_server_tvm bash -c 'LD_LIBRARY_PATH=/workspace/src/tvm-tf/build:$LD_LIBRARY_PATH LD_PRELOAD="/workspace/src/tvm-tf/build/libtvm_dso_op.so /opt/tritonserver/backends/tensorflow2/libtensorflow_cc.so /opt/tritonserver/backends/tensorflow2/libtensorflow_framework.so" CUDA_DEVICE_MAX_CONNECTIONS=32 tritonserver --model-repository=/workspace/models/tensorflow --backend-config=tensorflow,version=2 --min-supported-compute-capability=6.0 --allow-grpc=true --backend-config=default-max-batch-size=0' 4 | 5 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/densenet-9-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 1 1 4 | -------------------------------------------------------------------------------- 
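As an aside on the data format consumed by the tools above: `merge_mps_results.py` writes each results row as `iat,num_jobs,time_elapsed` followed by groups of seven latency statistics (mean, p50, p90, p95, p99, max, stddev), with group 0 aggregating all models and the per-model groups following, and `parse_input_kelvin.py` / `plot_latency_fairness_threshold.py` index into rows with `group * 7 + offset + 3`. A minimal decoding sketch — the `decode_row` helper is mine, and the microsecond units are inferred from the `* 1000000` and `/ 1000.` conversions in those scripts:

```python
STATS = ('mean', 'p50', 'p90', 'p95', 'p99', 'max', 'sd')

def decode_row(row, group=0):
    """Pull the x-axis values and one stats group out of a results row."""
    iat, num_jobs, elapsed = row[0], row[1], row[2]
    throughput = num_jobs / elapsed * 1e6               # jobs/s (cf. parse_input_kelvin.py)
    sending_rate = 1e6 / iat if iat else float('inf')   # iat is in microseconds
    stats = {name: row[group * 7 + i + 3] / 1000.       # microseconds -> milliseconds
             for i, name in enumerate(STATS)}
    return throughput, sending_rate, stats
```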
/sosp23_artifact/tvm_models_dim/googlenet-9-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/inception_v3-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/mnist-8-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 1 28 28 2 | -1 3 | 1 10 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/mobilenetv2-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/resnet18-v2-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/resnet34-v2-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/resnet50-v2-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/squeezenet1.1-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(client) 2 | add_subdirectory(server) 3 | add_subdirectory(ipc) 4 | add_subdirectory(job) 5 | -------------------------------------------------------------------------------- /src/client/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(client OBJECT client.cpp job_ref.cpp job_instance_ref.cpp profiler_client.cpp) 2 | target_link_libraries(client spdlog::spdlog) 3 | -------------------------------------------------------------------------------- /src/client/job_instance_ref.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | namespace llis { 12 | namespace client { 13 | 14 | JobInstanceRef::JobInstanceRef(JobRef* job_ref, IoShmEntry io_shm_entry) : job_ref_(job_ref), io_shm_entry_(io_shm_entry) { 15 | c2s_channel_ = job_ref_->get_c2s_channel(); 16 | } 17 | 18 | JobInstanceRef::~JobInstanceRef() { 19 | // TODO 20 | //release(); 21 | } 22 | 23 | void* JobInstanceRef::get_input_ptr() { 24 | return io_shm_entry_.ptr; 25 | } 26 | 27 | void* JobInstanceRef::get_output_ptr() { 28 | return reinterpret_cast(io_shm_entry_.ptr) + job_ref_->get_job()->get_input_size(); 29 | } 30 | 31 | void JobInstanceRef::launch() { 32 
| c2s_channel_->acquire_writer_lock(); 33 | 34 | c2s_channel_->write(MsgType::LAUNCH_JOB); 35 | #ifdef PRINT_LAUNCH_JOB_IPC_LATENCY 36 | unsigned long long cur_time = std::chrono::steady_clock::now().time_since_epoch().count(); 37 | c2s_channel_->write(cur_time); 38 | #endif 39 | c2s_channel_->write(job_ref_->get_job_ref_id()); 40 | c2s_channel_->write(io_shm_entry_.id); 41 | c2s_channel_->write(io_shm_entry_.offset); 42 | c2s_channel_->write(this); 43 | 44 | c2s_channel_->release_writer_lock(); 45 | } 46 | 47 | void JobInstanceRef::release() { 48 | job_ref_->release_io_shm_entry(io_shm_entry_); 49 | } 50 | 51 | void JobInstanceRef::set_id(JobInstanceRefId id) { 52 | id_ = id; 53 | } 54 | 55 | JobInstanceRefId JobInstanceRef::get_id() const { 56 | return id_; 57 | } 58 | 59 | JobRefId JobInstanceRef::get_job_ref_id() const { 60 | return job_ref_->get_job_ref_id(); 61 | } 62 | 63 | void JobInstanceRef::set_start_time(double time_point) { 64 | start_time_ = time_point; 65 | } 66 | 67 | double JobInstanceRef::get_start_time() const { 68 | return start_time_; 69 | } 70 | 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /src/ipc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(ipc OBJECT shm_channel.cu shm_primitive_channel.cu name_format.cpp unix_datagram_socket.cpp) 2 | set_target_properties(ipc PROPERTIES POSITION_INDEPENDENT_CODE ON) 3 | 4 | add_library(ipc-gpu OBJECT shm_channel.cu shm_primitive_channel.cu name_format.cpp unix_datagram_socket.cpp) 5 | set_target_properties(ipc-gpu PROPERTIES POSITION_INDEPENDENT_CODE ON CUDA_SEPARABLE_COMPILATION ON) 6 | -------------------------------------------------------------------------------- /src/ipc/name_format.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace llis { 5 | namespace ipc { 6 | 7 | std::string s2c_socket_name(const std::string& server_name, ClientId client_id) { 8 | return "llis-socket-s2c-" + server_name + "-" + std::to_string(client_id); 9 | } 10 | 11 | std::string s2c_channel_name(const std::string& server_name, ClientId client_id) { 12 | return "s2c:" + server_name + ":" + std::to_string(client_id); 13 | } 14 | 15 | std::string c2s_channel_name(const std::string& server_name) { 16 | return "c2s:" + server_name; 17 | } 18 | 19 | } 20 | } 21 | 22 | -------------------------------------------------------------------------------- /src/ipc/unix_datagram_socket.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace llis { 10 | namespace ipc { 11 | 12 | UnixDatagramSocket::UnixDatagramSocket() { 13 | socket_ = socket(AF_UNIX, SOCK_DGRAM, 0); 14 | utils::error_throw_posix(socket_); 15 | is_owner_ = true; 16 | } 17 | 18 | UnixDatagramSocket::UnixDatagramSocket(const std::string& name) { 19 | socket_ = socket(AF_UNIX, SOCK_DGRAM, 0); 20 | utils::error_throw_posix(socket_); 21 | is_owner_ = true; 22 | 23 | bind(name); 24 | } 25 | 26 | UnixDatagramSocket::UnixDatagramSocket(int socket) : socket_(socket), is_owner_(false) {} 27 | 28 | UnixDatagramSocket::UnixDatagramSocket(UnixDatagramSocket&& rhs) { 29 | *this = std::move(rhs); 30 | } 31 | 32 | UnixDatagramSocket& UnixDatagramSocket::operator=(UnixDatagramSocket&& rhs) { 33 | socket_ = rhs.socket_; 34 | is_owner_ = rhs.is_owner_; 35 | remote_addr_ = rhs.remote_addr_; 36 | 37 
| rhs.socket_ = -1; 38 | rhs.is_owner_ = false; 39 | 40 | return *this; 41 | } 42 | 43 | UnixDatagramSocket::~UnixDatagramSocket() { 44 | if (is_owner_) { 45 | utils::warn_log_posix(close(socket_)); 46 | 47 | is_owner_ = false; 48 | } 49 | 50 | socket_ = -1; 51 | } 52 | 53 | void UnixDatagramSocket::bind(const std::string& name) { 54 | sockaddr_un addr; 55 | bzero(&addr, sizeof(addr)); 56 | addr.sun_family = AF_UNIX; 57 | // TODO: check length. It should be < 108 bytes 58 | strncpy(addr.sun_path + 1, name.c_str(), 107); 59 | 60 | utils::error_throw_posix(::bind(socket_, reinterpret_cast(&addr), sizeof(addr))); 61 | } 62 | 63 | UnixDatagramSocket UnixDatagramSocket::connect(const std::string& name) { 64 | UnixDatagramSocket res(socket_); 65 | 66 | bzero(&res.remote_addr_, sizeof(res.remote_addr_)); 67 | res.remote_addr_.sun_family = AF_UNIX; 68 | // TODO: check length. It should be < 108 bytes 69 | strncpy(res.remote_addr_.sun_path + 1, name.c_str(), 107); 70 | 71 | return res; 72 | } 73 | 74 | ssize_t UnixDatagramSocket::write(const void* buf, size_t count) { 75 | ssize_t bytes_sent = sendto(socket_, buf, count, 0, reinterpret_cast(&remote_addr_), sizeof(remote_addr_)); 76 | utils::error_throw_posix(bytes_sent); 77 | return bytes_sent; 78 | } 79 | 80 | ssize_t UnixDatagramSocket::read(void* buf, size_t count) { 81 | ssize_t bytes_read = ::read(socket_, buf, count); 82 | utils::error_throw_posix(bytes_read); 83 | return bytes_read; 84 | } 85 | 86 | } 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/job/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(llis_job SHARED finished_block_notifier.cu $) 2 | set_target_properties(llis_job PROPERTIES POSITION_INDEPENDENT_CODE ON CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(llis_job "-Wl,--no-as-needed" Boost::context "-Wl,--as-needed" spdlog::spdlog) 4 | install(TARGETS llis_job DESTINATION lib) 5 | 6 | add_library(llis_job_gpu SHARED finished_block_notifier.cu utils.cu $) 7 | set_target_properties(llis_job_gpu PROPERTIES POSITION_INDEPENDENT_CODE ON CUDA_SEPARABLE_COMPILATION ON) 8 | target_link_libraries(llis_job_gpu Boost::context spdlog::spdlog) 9 | install(TARGETS llis_job_gpu DESTINATION lib) 10 | 11 | add_library(llis_context SHARED context.cpp) 12 | target_link_libraries(llis_context llis_job spdlog::spdlog) 13 | install(TARGETS llis_context DESTINATION lib) 14 | -------------------------------------------------------------------------------- /src/job/context.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace llis { 6 | namespace job { 7 | 8 | Job* Context::current_job_; 9 | ipc::Gpu2SchedChannel Context::gpu2sched_channel_; 10 | #ifdef LLIS_MEASURE_BLOCK_TIME 11 | ipc::Gpu2SchedChannel Context::gpu2sched_block_time_channel_; 12 | #endif 13 | ipc::ShmChannelCpuWriter Context::mem2sched_channel_; 14 | 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /src/job/finished_block_notifier.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | namespace llis { 8 | namespace job { 9 | 10 | FinishedBlockNotifier::FinishedBlockNotifier(ipc::Gpu2SchedChannel* gpu2sched_channel 11 | #ifdef LLIS_MEASURE_BLOCK_TIME 12 | , ipc::Gpu2SchedChannel* gpu2sched_block_time_channel 13 | #endif 14 | ) { 
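// Take forked handles to the gpu2sched channel(s) rather than aliasing the caller's:
// create_array() below copies whole notifiers to the GPU with cudaMemcpy, so each one
// must own its channel state.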
15 | gpu2sched_channel_ = gpu2sched_channel->fork(); 16 | #ifdef LLIS_MEASURE_BLOCK_TIME 17 | gpu2sched_block_time_channel_ = gpu2sched_block_time_channel->fork(); 18 | #endif 19 | } 20 | 21 | FinishedBlockNotifier* FinishedBlockNotifier::create_array(unsigned num, ipc::Gpu2SchedChannel* gpu2sched_channel 22 | #ifdef LLIS_MEASURE_BLOCK_TIME 23 | , ipc::Gpu2SchedChannel* gpu2sched_block_time_channel 24 | #endif 25 | ) { 26 | FinishedBlockNotifier* res; 27 | utils::error_throw_cuda(cudaMalloc((void**)&res, num * sizeof(FinishedBlockNotifier))); 28 | 29 | std::vector tmp; 30 | tmp.reserve(num); 31 | for (unsigned i = 0; i < num; ++i) { 32 | tmp.emplace_back(gpu2sched_channel 33 | #ifdef LLIS_MEASURE_BLOCK_TIME 34 | , gpu2sched_block_time_channel 35 | #endif 36 | ); 37 | } 38 | 39 | utils::error_throw_cuda(cudaMemcpy(res, tmp.data(), num * sizeof(FinishedBlockNotifier), cudaMemcpyHostToDevice)); 40 | 41 | return res; 42 | } 43 | 44 | void FinishedBlockNotifier::free_array(FinishedBlockNotifier* ptr) { 45 | cudaFree(ptr); 46 | } 47 | 48 | } 49 | } 50 | 51 | -------------------------------------------------------------------------------- /src/job/utils.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace llis { 9 | namespace job { 10 | 11 | namespace { 12 | 13 | __global__ void memset_impl(void* ptr, int val, size_t count, JobId job_id, ipc::Gpu2SchedChannel gpu2sched_channel 14 | #ifdef LLIS_MEASURE_BLOCK_TIME 15 | , ipc::Gpu2SchedChannel gpu2sched_block_time_channel 16 | #endif 17 | ) { 18 | #ifdef LLIS_MEASURE_BLOCK_TIME 19 | BlockStartEndTime start_end_time; 20 | kernel_start(job_id, &gpu2sched_channel, &start_end_time); 21 | #else 22 | kernel_start(job_id, &gpu2sched_channel); 23 | #endif 24 | 25 | int id = blockIdx.x * blockDim.x + threadIdx.x; 26 | 27 | if (id < count) { 28 | (reinterpret_cast(ptr))[id] = val; 29 | } 30 | 31 | #ifdef LLIS_MEASURE_BLOCK_TIME 32 | kernel_end(job_id, &gpu2sched_channel, &gpu2sched_block_time_channel, &start_end_time); 33 | #else 34 | kernel_end(job_id, &gpu2sched_channel); 35 | #endif 36 | } 37 | 38 | } 39 | 40 | void memset_res(size_t count, Job* job) { 41 | constexpr int num_threads_per_block = 256; 42 | 43 | job->set_num_blocks(std::ceil((float)count / (float)num_threads_per_block)); 44 | job->set_num_threads_per_block(num_threads_per_block); 45 | job->set_num_registers_per_thread(32); 46 | job->set_smem_size_per_block(0); 47 | } 48 | 49 | void memset(void* ptr, int val, size_t count, Job* job, ipc::Gpu2SchedChannel* gpu2sched_channel 50 | #ifdef LLIS_MEASURE_BLOCK_TIME 51 | , ipc::Gpu2SchedChannel* gpu2sched_block_time_channel 52 | #endif 53 | ) { 54 | constexpr int num_threads_per_block = 256; 55 | 56 | #ifdef LLIS_MEASURE_BLOCK_TIME 57 | memset_impl<<get_num_blocks(), num_threads_per_block, job->get_smem_size_per_block(), job->get_cuda_stream()>>>(ptr, val, count, job->get_id(), gpu2sched_channel->fork(), gpu2sched_block_time_channel->fork()); 58 | #else 59 | memset_impl<<get_num_blocks(), num_threads_per_block, job->get_smem_size_per_block(), job->get_cuda_stream()>>>(ptr, val, count, job->get_id(), gpu2sched_channel->fork()); 60 | #endif 61 | } 62 | 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(llis_server server.cpp scheduler.cpp scheduler_fifo.cpp 
scheduler_fifo2.cpp scheduler_full3.cpp client_connection.cpp registered_job.cpp gpu_resources.cpp sm_resources.cpp profiler.cpp $ $) 2 | target_link_libraries(llis_server llis_job llis_context spdlog::spdlog Boost::program_options rt dl) 3 | install(TARGETS llis_server DESTINATION bin) 4 | -------------------------------------------------------------------------------- /src/server/client_connection.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eniac/paella/e2ed7b53272eb393e1361b0a87ceb8974efc237d/src/server/client_connection.cpp -------------------------------------------------------------------------------- /src/server/scheduler.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace llis { 4 | namespace server { 5 | 6 | std::unordered_map SchedulerFactory::registered_schedulers_; 7 | 8 | bool SchedulerFactory::register_scheduler(std::string name, RegisterFunc func) { 9 | if (registered_schedulers_.find(name) == registered_schedulers_.end()) { 10 | registered_schedulers_.emplace(name, func); 11 | return true; 12 | } else { 13 | return false; 14 | } 15 | } 16 | 17 | std::unique_ptr SchedulerFactory::create(std::string name, const po::variables_map& args) { 18 | auto it = registered_schedulers_.find(name); 19 | if (it == registered_schedulers_.end()) { 20 | return nullptr; 21 | } else { 22 | return it->second(args); 23 | } 24 | } 25 | 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(ipc) 2 | add_subdirectory(client) 3 | add_subdirectory(simple) 4 | add_subdirectory(utils) 5 | -------------------------------------------------------------------------------- /tests/client/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(test_client client.cpp $ $) 2 | target_link_libraries(test_client spdlog::spdlog dl rt) 3 | 4 | add_executable(test_client_concurrent_runs client_concurrent_runs.cpp $ $) 5 | target_link_libraries(test_client_concurrent_runs spdlog::spdlog dl rt) 6 | 7 | add_executable(test_client_single_latency client_single_latency.cpp $ $) 8 | target_link_libraries(test_client_single_latency spdlog::spdlog dl rt) 9 | 10 | add_executable(test_client_concurrent_run_latencies client_concurrent_run_latencies.cpp $ $) 11 | target_link_libraries(test_client_concurrent_run_latencies spdlog::spdlog dl rt) 12 | 13 | add_executable(test_client_concurrent_run_latencies_set_load client_concurrent_run_latencies_set_load.cpp $ $) 14 | target_link_libraries(test_client_concurrent_run_latencies_set_load spdlog::spdlog dl rt) 15 | 16 | add_executable(test_client_concurrent_run_latencies_set_load_multi client_concurrent_run_latencies_set_load_multi.cpp $ $) 17 | target_link_libraries(test_client_concurrent_run_latencies_set_load_multi spdlog::spdlog dl rt) 18 | 19 | add_executable(test_raw_kernel_launch raw_kernel_launch.cu $) 20 | target_link_libraries(test_raw_kernel_launch llis_job dl rt) 21 | set_target_properties(test_raw_kernel_launch PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 22 | -------------------------------------------------------------------------------- /tests/client/client.cpp: -------------------------------------------------------------------------------- 1 | #include "llis/client/job_instance_ref.h" 2 | #include 3 | 
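// Smoke test: connect to a running llis_server, register a job shared object,
// create a single job instance, and launch it (without waiting for completion).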
4 | int main(int argc, char** argv) { 5 | const char* server_name = argv[1]; 6 | const char* job_path = argv[2]; 7 | 8 | llis::client::Client client(server_name); 9 | llis::client::JobRef job_ref = client.register_job(job_path); 10 | llis::client::JobInstanceRef* job_instance_ref = job_ref.create_instance(); 11 | job_instance_ref->launch(); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /tests/client/client_concurrent_runs.cpp: -------------------------------------------------------------------------------- 1 | #include "llis/client/job_instance_ref.h" 2 | #include 3 | 4 | int main(int argc, char** argv) { 5 | const char* server_name = argv[1]; 6 | const char* job_path = argv[2]; 7 | int num = atoi(argv[3]); 8 | 9 | llis::client::Client client(server_name); 10 | llis::client::JobRef job_ref = client.register_job(job_path); 11 | 12 | std::vector job_instance_refs; 13 | job_instance_refs.reserve(num); 14 | 15 | for (int i = 0; i < num; ++i) { 16 | job_instance_refs.push_back(job_ref.create_instance()); 17 | job_instance_refs.back()->launch(); 18 | } 19 | } 20 | 21 | -------------------------------------------------------------------------------- /tests/client/client_single_latency.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | int main(int argc, char** argv) { 8 | const char* server_name = argv[1]; 9 | const char* job_path = argv[2]; 10 | int num = atoi(argv[3]); 11 | const char* profile_path = nullptr; 12 | if (argc >= 5) { 13 | profile_path = argv[4]; 14 | } 15 | 16 | llis::client::Client client(server_name); 17 | llis::client::JobRef job_ref = client.register_job(job_path); 18 | llis::client::JobInstanceRef* job_instance_ref = job_ref.create_instance(); 19 | 20 | client.get_profiler_client()->set_record_kernel_info(); 21 | 22 | for (int i = 0; i < num; ++i) { 23 | auto start_time = std::chrono::steady_clock::now(); 24 | job_instance_ref->launch(); 25 | client.wait(); 26 | auto end_time = std::chrono::steady_clock::now(); 27 | 28 | auto time_taken = end_time - start_time; 29 | 30 | std::cout << std::chrono::duration(time_taken).count() << std::endl; 31 | } 32 | 33 | client.get_profiler_client()->unset_record_kernel_info(); 34 | if (profile_path != nullptr) { 35 | client.get_profiler_client()->save(profile_path); 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /tests/client/raw_kernel_launch.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | __global__ void helloworld(int i, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 10 | //notifier->start(job_id); 11 | //notifier->end(job_id); 12 | } 13 | 14 | int main(int argc, char** argv) { 15 | int num_blocks = atoi(argv[1]); 16 | int num_iters = atoi(argv[2]); 17 | 18 | cudaStream_t stream; 19 | cudaStreamCreate(&stream); 20 | 21 | llis::ipc::Gpu2SchedChannel gpu2sched_channel(1024); 22 | llis::ipc::Gpu2SchedChannel gpu2sched_block_time_channel(1024); 23 | 24 | llis::job::FinishedBlockNotifier* finished_block_notifier = llis::job::FinishedBlockNotifier::create_array(1, &gpu2sched_channel 25 | #ifdef LLIS_MEASURE_BLOCK_TIME 26 | , &gpu2sched_block_time_channel 27 | #endif 28 | ); 29 | 30 | for (int i = 0; i < num_iters; ++i) { 31 | auto start_time = std::chrono::steady_clock::now(); 32 | 33 | 
helloworld<<>>(i, 0, finished_block_notifier); 34 | cudaStreamSynchronize(stream); 35 | 36 | auto end_time = std::chrono::steady_clock::now(); 37 | 38 | auto time_taken = end_time - start_time; 39 | std::cout << std::chrono::duration(time_taken).count() << std::endl; 40 | } 41 | } 42 | 43 | -------------------------------------------------------------------------------- /tests/ipc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(shm_channel) 2 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(latency) 2 | 3 | add_executable(shmc_read shmc_read.cpp $) 4 | target_link_libraries(shmc_read rt) 5 | add_executable(shmc_write shmc_write.cpp $) 6 | target_link_libraries(shmc_write rt) 7 | add_executable(shmc_read_write shmc_read_write.cpp $) 8 | target_link_libraries(shmc_read_write rt) 9 | add_executable(shmc_read_write_same_proc shmc_read_write_same_proc.cpp $) 10 | target_link_libraries(shmc_read_write_same_proc rt pthread) 11 | add_executable(shmc_read_write_cpu_gpu shmc_read_write_cpu_gpu.cu $) 12 | set_target_properties(shmc_read_write_cpu_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 13 | target_link_libraries(shmc_read_write_cpu_gpu rt pthread) 14 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(shmc_latency_read shmc_latency_read.cpp $) 2 | target_link_libraries(shmc_latency_read rt) 3 | add_executable(shmc_latency_write shmc_latency_write.cpp $) 4 | target_link_libraries(shmc_latency_write rt) 5 | add_executable(shmc_latency_read_bare_atomic shmc_latency_read_bare_atomic.cpp $) 6 | target_link_libraries(shmc_latency_read_bare_atomic rt) 7 | add_executable(shmc_latency_write_bare_atomic shmc_latency_write_bare_atomic.cpp $) 8 | target_link_libraries(shmc_latency_write_bare_atomic rt) 9 | add_executable(shmc_latency_read_bare_atomic_loop shmc_latency_read_bare_atomic_loop.cpp $) 10 | target_link_libraries(shmc_latency_read_bare_atomic_loop rt) 11 | add_executable(shmc_latency_write_bare_atomic_loop shmc_latency_write_bare_atomic_loop.cpp $) 12 | target_link_libraries(shmc_latency_write_bare_atomic_loop rt) 13 | add_executable(shmc_latency_read_loop shmc_latency_read_loop.cpp $) 14 | target_link_libraries(shmc_latency_read_loop rt) 15 | add_executable(shmc_latency_write_loop shmc_latency_write_loop.cpp $) 16 | target_link_libraries(shmc_latency_write_loop rt) 17 | add_executable(shmpc_latency_gpu shmpc_latency_gpu.cu) 18 | target_link_libraries(shmpc_latency_gpu llis_job rt) 19 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_read.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | int main() { 7 | int val; 8 | 9 | // The channel has a size of sizeof(val) + 1 10 | // This makes sure that the writer can only write after the reader has read 11 | // +1 because the writer always wastes one byte 12 | llis::ipc::ShmChannelCpuReader channel("test", sizeof(val) + 1); 13 | 14 | // The first read is a barrier to make sure that both sides are in the same stage 15 | channel.read(&val, sizeof(val)); 16 | 17 | channel.read(&val, 
sizeof(val)); 18 | 19 | std::cout << "Current time since epoch: " << std::chrono::system_clock::now().time_since_epoch().count() << std::endl; 20 | 21 | std::cout << "Value: " << val << std::endl; 22 | } 23 | 24 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_read_bare_atomic.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | std::string name_with_prefix = "ml-on-apu:test"; 13 | int fd_ = shm_open(name_with_prefix.c_str(), O_CREAT | O_RDWR, 0600); 14 | ftruncate(fd_, sizeof(std::atomic_bool)); 15 | std::atomic_char* shm_ = reinterpret_cast(mmap(nullptr, sizeof(std::atomic_char), PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0)); 16 | 17 | int a = shm_->load(std::memory_order_acquire); 18 | int b = shm_->load(std::memory_order_acquire); 19 | 20 | while (shm_->load(std::memory_order_acquire) != 3) {} 21 | 22 | //while (shm_->load(std::memory_order_acquire)) {} 23 | auto cur_time = std::chrono::system_clock::now().time_since_epoch().count(); 24 | std::cout << "Current time since epoch: " << cur_time << std::endl; 25 | std::cout << "a: " << a << std::endl; 26 | std::cout << "b: " << b << std::endl; 27 | } 28 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_read_bare_atomic_loop.cpp: -------------------------------------------------------------------------------- 1 | #define NUM_ITERS 1000000 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | int main() { 14 | std::string name_with_prefix = "ml-on-apu:test"; 15 | int fd_ = shm_open(name_with_prefix.c_str(), O_CREAT | O_RDWR, 0600); 16 | ftruncate(fd_, sizeof(std::atomic_bool)); 17 | std::atomic_int* shm_ = reinterpret_cast(mmap(nullptr, sizeof(std::atomic_int), PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0)); 18 | 19 | int a = shm_->load(std::memory_order_acquire); 20 | int b = shm_->load(std::memory_order_acquire); 21 | 22 | auto time1 = std::chrono::system_clock::now().time_since_epoch().count(); 23 | 24 | for (int i = 1; i < NUM_ITERS; ++i) { 25 | while (shm_->load(std::memory_order_acquire) != i) {} 26 | shm_->store(0, std::memory_order_release); 27 | } 28 | while (shm_->load(std::memory_order_acquire) != NUM_ITERS) {} 29 | 30 | auto time2 = std::chrono::system_clock::now().time_since_epoch().count(); 31 | 32 | std::cout << "time1: " << time1 << std::endl; 33 | std::cout << "time2: " << time2 << std::endl; 34 | std::cout << "a: " << a << std::endl; 35 | std::cout << "b: " << b << std::endl; 36 | } 37 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_read_loop.cpp: -------------------------------------------------------------------------------- 1 | #define CHANNEL_SIZE 64 2 | #define NUM_ITERS 1000000 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | int main() { 10 | int val = 0; 11 | 12 | llis::ipc::ShmChannelCpuReader read_channel("test_read", 64); 13 | llis::ipc::ShmChannelCpuWriter write_channel("test_write", 64); 14 | 15 | // Warm up 16 | write_channel.write(&val, sizeof(val)); 17 | write_channel.write(&val, sizeof(val)); 18 | read_channel.read(&val, sizeof(val)); 19 | read_channel.read(&val, sizeof(val)); 20 | 21 | auto time1 = 
std::chrono::system_clock::now().time_since_epoch().count(); 22 | 23 | for (int i = 1; i < NUM_ITERS; ++i) { 24 | read_channel.read(&val, sizeof(val)); 25 | write_channel.write(&i, sizeof(i)); 26 | } 27 | read_channel.read(&val, sizeof(val)); 28 | 29 | auto time2 = std::chrono::system_clock::now().time_since_epoch().count(); 30 | 31 | std::cout << "time1: " << time1 << std::endl; 32 | std::cout << "time2: " << time2 << std::endl; 33 | } 34 | 35 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_write.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | int main() { 7 | int val = 1234; 8 | int val2 = 5678; 9 | 10 | llis::ipc::ShmChannelCpuWriter channel("test", sizeof(val) + 1); 11 | 12 | channel.write(&val2, sizeof(val2)); 13 | 14 | 15 | channel.write(&val, sizeof(val)); 16 | std::cout << "Current time since epoch: " << std::chrono::system_clock::now().time_since_epoch().count() << std::endl; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_write_bare_atomic.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | std::string name_with_prefix = "ml-on-apu:test"; 13 | int fd_ = shm_open(name_with_prefix.c_str(), O_CREAT | O_RDWR, 0600); 14 | ftruncate(fd_, sizeof(std::atomic_bool)); 15 | std::atomic_char* shm_ = reinterpret_cast(mmap(nullptr, sizeof(std::atomic_char), PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0)); 16 | 17 | shm_->store(1, std::memory_order_release); 18 | shm_->store(2, std::memory_order_release); 19 | 20 | auto time1 = std::chrono::system_clock::now().time_since_epoch().count(); 21 | shm_->store(3, std::memory_order_release); 22 | 23 | auto time3 = std::chrono::system_clock::now().time_since_epoch().count(); 24 | 25 | std::cout << "time1: " << time1 << std::endl; 26 | //std::cout << "time2: " << time2 << std::endl; 27 | std::cout << "time3: " << time3 << std::endl; 28 | } 29 | 30 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_write_bare_atomic_loop.cpp: -------------------------------------------------------------------------------- 1 | #define NUM_ITERS 1000000 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | int main() { 14 | std::string name_with_prefix = "ml-on-apu:test"; 15 | int fd_ = shm_open(name_with_prefix.c_str(), O_CREAT | O_RDWR, 0600); 16 | ftruncate(fd_, sizeof(std::atomic_bool)); 17 | std::atomic_int* shm_ = reinterpret_cast(mmap(nullptr, sizeof(std::atomic_int), PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0)); 18 | 19 | shm_->store(0, std::memory_order_release); 20 | shm_->store(2, std::memory_order_release); 21 | 22 | auto time1 = std::chrono::system_clock::now().time_since_epoch().count(); 23 | 24 | for (int i = 1; i < NUM_ITERS; ++i) { 25 | shm_->store(i, std::memory_order_release); 26 | while (shm_->load(std::memory_order_acquire) != 0) {} 27 | } 28 | shm_->store(NUM_ITERS, std::memory_order_release); 29 | 30 | auto time2 = std::chrono::system_clock::now().time_since_epoch().count(); 31 | 32 | std::cout << "time1: " << time1 << std::endl; 33 | std::cout << "time2: " << time2 << std::endl; 34 | } 
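A note on interpreting the output of these `*_loop` benchmarks: each loop iteration is one full write/read ping-pong, so the per-round-trip cost is the difference of the two printed timestamps divided by the iteration count. A trivial post-processing sketch (assuming, as with libstdc++ on Linux, that `system_clock` ticks are nanoseconds):

```python
NUM_ITERS = 1_000_000  # matches the #define in the benchmarks

def mean_round_trip_ns(time1: int, time2: int, num_iters: int = NUM_ITERS) -> float:
    """Mean cost of one ping-pong (write + matching read), in nanoseconds."""
    return (time2 - time1) / num_iters
```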
35 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_write_loop.cpp: -------------------------------------------------------------------------------- 1 | #define CHANNEL_SIZE 64 2 | #define NUM_ITERS 1000000 3 | 4 | #include <llis/ipc/shm_channel.h> 5 | 6 | #include <chrono> 7 | #include <iostream> 8 | 9 | int main() { 10 | int val = 0; 11 | 12 | llis::ipc::ShmChannelCpuWriter write_channel("test_write", CHANNEL_SIZE); 13 | llis::ipc::ShmChannelCpuReader read_channel("test_read", CHANNEL_SIZE); 14 | 15 | // Warm up 16 | write_channel.write(&val, sizeof(val)); 17 | write_channel.write(&val, sizeof(val)); 18 | read_channel.read(&val, sizeof(val)); 19 | read_channel.read(&val, sizeof(val)); 20 | 21 | auto time1 = std::chrono::system_clock::now().time_since_epoch().count(); 22 | 23 | for (int i = 1; i < NUM_ITERS; ++i) { 24 | write_channel.write(&i, sizeof(i)); 25 | read_channel.read(&val, sizeof(val)); 26 | } 27 | val = NUM_ITERS; 28 | write_channel.write(&val, sizeof(val)); 29 | 30 | auto time2 = std::chrono::system_clock::now().time_since_epoch().count(); 31 | 32 | std::cout << "time1: " << time1 << std::endl; 33 | std::cout << "time2: " << time2 << std::endl; 34 | } 35 | 36 | 37 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmpc_latency_gpu.cu: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_primitive_channel.h> 2 | #include <llis/job/instrument_info.h> 3 | 4 | #include <chrono> 5 | #include <iostream> 6 | 7 | __global__ void kernel(float* output, float* input, unsigned count, llis::ipc::ShmPrimitiveChannelGpu<llis::job::InstrumentInfo> channel) { 8 | //__global__ void kernel(llis::ipc::ShmPrimitiveChannelGpu<llis::job::InstrumentInfo> channel) { 9 | if (threadIdx.x == 0) { 10 | llis::job::InstrumentInfo info; 11 | info.is_start = true; 12 | info.job_id = blockIdx.x; 13 | channel.write(info); 14 | } 15 | 16 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 17 | unsigned grid_size = blockDim.x * gridDim.x; 18 | 19 | while (id < count) { 20 | output[id] += input[id]; 21 | id += grid_size; 22 | } 23 | 24 | __syncthreads(); 25 | 26 | if (threadIdx.x == 0) { 27 | llis::job::InstrumentInfo info; 28 | info.is_start = false; 29 | info.job_id = blockIdx.x; 30 | channel.write(info); 31 | } 32 | } 33 | 34 | int main(int argc, char** argv) { 35 | unsigned num_blocks = atoi(argv[1]); 36 | unsigned vec_len = atoi(argv[2]); 37 | unsigned num_iters = atoi(argv[3]); 38 | 39 | float* x; 40 | float* y; 41 | cudaMalloc(&x, sizeof(*x) * vec_len); 42 | cudaMalloc(&y, sizeof(*y) * vec_len); 43 | 44 | llis::ipc::ShmPrimitiveChannelGpu<llis::job::InstrumentInfo> channel("", 1024000); 45 | 46 | for (int i = 0; i < num_iters; ++i) { 47 | kernel<<<num_blocks, 256>>>(y, x, vec_len, channel.fork()); // NOTE: block size was garbled in this dump; 256 threads/block is a stand-in 48 | //kernel<<<num_blocks, 256>>>(channel.fork()); 49 | for (int j = 0; j < num_blocks * 2; ++j) { 50 | (void)channel.read(); 51 | } 52 | } 53 | 54 | auto start_time = std::chrono::steady_clock::now(); 55 | 56 | for (int i = 0; i < num_iters; ++i) { 57 | kernel<<<num_blocks, 256>>>(y, x, vec_len, channel.fork()); 58 | //kernel<<<num_blocks, 256>>>(channel.fork()); 59 | for (int j = 0; j < num_blocks * 2; ++j) { 60 | (void)channel.read(); 61 | } 62 | } 63 | 64 | auto end_time = std::chrono::steady_clock::now(); 65 | 66 | std::cout << "Time elapsed: " << std::chrono::duration<double, std::micro>(end_time - start_time).count() / (double)num_iters << " us" << std::endl; 67 | } 68 | 69 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_read.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | int main() { 4 |
llis::ipc::ShmChannelCpuReader channel("test", 64); 5 | for (int i = 0; i < 10000; ++i) { 6 | int val; 7 | channel.read(&val, sizeof(val)); 8 | if (val != i) { 9 | printf("Error! Expected: %d, Actual: %d\n", i, val); 10 | break; 11 | } 12 | } 13 | } 14 | 15 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_read_write.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | int main() { 4 | llis::ipc::ShmChannelCpuReader channelRead("test"); 5 | llis::ipc::ShmChannelCpuWriter channelWrite = channelRead.fork(); 6 | for (int i = 0; i < 10000; ++i) { 7 | channelWrite.write(&i, sizeof(i)); 8 | int val = -1; 9 | channelRead.read(&val, sizeof(val)); 10 | if (val != i) { 11 | printf("Error! Expected: %d, Actual: %d\n", i, val); 12 | break; 13 | } 14 | } 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_read_write_cpu_gpu.cu: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | #include <thread> 4 | 5 | void reader(llis::ipc::ShmChannelGpuReader* channel) { 6 | for (int i = 0; i < 10000; ++i) { 7 | int val; 8 | channel->read(&val, sizeof(val)); 9 | if (val != i) { 10 | printf("Error! Expected: %d, Actual: %d\n", i, val); 11 | break; 12 | } 13 | } 14 | } 15 | 16 | __global__ void writer(llis::ipc::ShmChannelGpuWriter channel) { 17 | for (int i = 0; i < 10000; ++i) { 18 | channel.write(i); 19 | } 20 | } 21 | 22 | int main() { 23 | llis::ipc::ShmChannelGpuReader channel(64); 24 | llis::ipc::ShmChannelGpuWriter channel_gpu(&channel); 25 | 26 | std::thread reader_thr(reader, &channel); 27 | 28 | writer<<<1, 1>>>(std::move(channel_gpu)); 29 | 30 | reader_thr.join(); 31 | cudaDeviceSynchronize(); 32 | } 33 | 34 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_read_write_same_proc.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | #include <thread> 4 | 5 | void reader(llis::ipc::ShmChannelCpuReader* channel) { 6 | for (int i = 0; i < 10000; ++i) { 7 | int val; 8 | channel->read(&val, sizeof(val)); 9 | if (val != i) { 10 | printf("Error!
Expected: %d, Actual: %d\n", i, val); 11 | break; 12 | } 13 | } 14 | } 15 | 16 | void writer(llis::ipc::ShmChannelCpuWriter* channel) { 17 | for (int i = 0; i < 10000; ++i) { 18 | channel->write(i); 19 | } 20 | } 21 | 22 | int main() { 23 | llis::ipc::ShmChannelCpuReader channelRead(64); 24 | llis::ipc::ShmChannelCpuWriter channelWrite = channelRead.fork(); 25 | 26 | std::thread reader_thr(reader, &channelRead); 27 | std::thread writer_thr(writer, &channelWrite); 28 | 29 | reader_thr.join(); 30 | writer_thr.join(); 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_write.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | int main() { 4 | llis::ipc::ShmChannelCpuWriter channel("test"); 5 | for (int i = 0; i < 10000; ++i) { 6 | channel.write(&i, sizeof(i)); 7 | } 8 | } 9 | 10 | -------------------------------------------------------------------------------- /tests/simple/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #add_executable(test_direct_run_job direct_run_job.cpp $) 2 | #target_link_libraries(test_direct_run_job llis_job_gpu llis_context CUDA::cudart dl rt) 3 | 4 | if(tvm_FOUND) 5 | add_executable(tvm_direct_concurrent tvm_direct_concurrent.cpp) 6 | target_link_libraries(tvm_direct_concurrent tvm::tvm_runtime llis_context CUDA::cudart pthread) 7 | 8 | add_executable(tvm_direct_multistream tvm_direct_multistream.cpp) 9 | target_link_libraries(tvm_direct_multistream tvm::tvm_runtime llis_context CUDA::cudart pthread) 10 | target_compile_definitions(tvm_direct_multistream PRIVATE SUBMIT_DIS) 11 | install(TARGETS tvm_direct_multistream DESTINATION bin) 12 | 13 | add_executable(tvm_direct_multistream_pregen tvm_direct_multistream.cpp) 14 | target_link_libraries(tvm_direct_multistream_pregen tvm::tvm_runtime llis_context CUDA::cudart pthread) 15 | target_compile_definitions(tvm_direct_multistream_pregen PRIVATE SUBMIT_PREGEN) 16 | target_compile_options(tvm_direct_multistream_pregen PUBLIC "-fPIC" PUBLIC "-fPIE") 17 | install(TARGETS tvm_direct_multistream_pregen DESTINATION bin) 18 | endif(tvm_FOUND) 19 | 20 | add_executable(cuda_sync_benchmark cuda_sync_benchmark.cu) 21 | target_link_libraries(cuda_sync_benchmark CUDA::cudart dl rt) 22 | 23 | add_executable(cuda_callback_benchmark cuda_callback_benchmark.cu) 24 | target_link_libraries(cuda_callback_benchmark CUDA::cudart dl rt) 25 | 26 | add_executable(test_mmap_mlock_limit mmap_mlock_limit.cpp) 27 | target_link_libraries(test_mmap_mlock_limit rt) -------------------------------------------------------------------------------- /tests/simple/cuda_callback_benchmark.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | 3 | #include <atomic> 4 | #include <chrono> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | #include <mutex> 8 | #include <queue> 9 | #include <thread> 10 | #include <vector> 11 | 12 | std::vector<cudaStream_t> streams; 13 | std::queue<unsigned> noti_queue; 14 | std::atomic_uint noti_queue_num; 15 | std::mutex mtx; 16 | 17 | __global__ void dummy() { 18 | } 19 | 20 | void callback(void* stream_id_) { 21 | unsigned stream_id = (unsigned long)stream_id_; 22 | std::unique_lock<std::mutex> lock(mtx); 23 | noti_queue.push(stream_id); 24 | lock.unlock(); 25 | noti_queue_num.fetch_add(1, std::memory_order_release); 26 | } 27 | 28 | int main(int argc, char** argv) { 29 | unsigned num_iter = atoi(argv[1]); 30 | unsigned num_streams = atoi(argv[2]); 31 | 32 | streams.resize(num_streams); 33 | for
(unsigned i = 0; i < num_streams; ++i) { 34 | cudaStreamCreate(&streams[i]); 35 | } 36 | 37 | std::vector<unsigned> streams_finished; 38 | streams_finished.resize(num_streams); 39 | 40 | unsigned total_finished = 0; 41 | const unsigned total_num = num_iter * num_streams; 42 | 43 | noti_queue_num.store(0); 44 | 45 | auto start_time = std::chrono::steady_clock::now(); 46 | 47 | for (unsigned i = 0; i < num_streams; ++i) { 48 | dummy<<<1, 1, 0, streams[i]>>>(); 49 | cudaLaunchHostFunc(streams[i], callback, (void*)i); 50 | } 51 | 52 | while (total_finished < total_num) { 53 | while (noti_queue_num.load(std::memory_order_acquire) == 0); 54 | noti_queue_num.fetch_sub(1, std::memory_order_release); 55 | 56 | std::unique_lock<std::mutex> lock(mtx); 57 | unsigned stream_id = noti_queue.front(); 58 | noti_queue.pop(); 59 | lock.unlock(); 60 | 61 | ++streams_finished[stream_id]; 62 | ++total_finished; 63 | 64 | if (streams_finished[stream_id] < num_iter) { 65 | dummy<<<1, 1, 0, streams[stream_id]>>>(); 66 | cudaLaunchHostFunc(streams[stream_id], callback, (void*)stream_id); 67 | } 68 | } 69 | 70 | auto end_time = std::chrono::steady_clock::now(); 71 | 72 | double time_elapsed = std::chrono::duration<double, std::micro>(end_time - start_time).count(); // duration ratio was garbled in this dump; microseconds assumed 73 | 74 | printf("%f\n", time_elapsed); 75 | } 76 | 77 | -------------------------------------------------------------------------------- /tests/simple/cuda_sync_benchmark.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | 3 | #include <chrono> 4 | #include <cstdio> 5 | #include <cstdlib> 6 | #include <thread> 7 | #include <vector> 8 | 9 | __global__ void dummy_cuda_sync() { 10 | } 11 | 12 | __global__ void dummy_flag(volatile int* flag) { 13 | __threadfence_system(); 14 | *flag = 1; 15 | } 16 | 17 | void run_cuda_sync(int num_iter, cudaStream_t stream) { 18 | for (int i = 0; i < num_iter; ++i) { 19 | dummy_cuda_sync<<<1, 1, 0, stream>>>(); 20 | cudaStreamSynchronize(stream); 21 | } 22 | } 23 | 24 | void run_flag(int num_iter, cudaStream_t stream, volatile int* flag) { 25 | for (int i = 0; i < num_iter; ++i) { 26 | *flag = 0; 27 | 28 | dummy_flag<<<1, 1, 0, stream>>>(flag); 29 | 30 | while (*flag == 0); 31 | } 32 | } 33 | 34 | int main(int argc, char** argv) { 35 | int num_iter = atoi(argv[1]); 36 | int num_thrs = atoi(argv[2]); 37 | 38 | std::vector<cudaStream_t> streams; 39 | streams.resize(num_thrs); 40 | for (int i = 0; i < num_thrs; ++i) { 41 | cudaStreamCreate(&streams[i]); 42 | } 43 | 44 | volatile int* flags; 45 | cudaMallocHost(&flags, sizeof(int) * num_thrs); 46 | 47 | cudaSetDeviceFlags(cudaDeviceScheduleSpin); 48 | 49 | double time_cuda_sync; 50 | double time_flag; 51 | 52 | { 53 | std::vector<std::thread> thrs; 54 | auto start_time = std::chrono::steady_clock::now(); 55 | for (int i = 0; i < num_thrs; ++i) { 56 | thrs.emplace_back(run_cuda_sync, num_iter, streams[i]); 57 | } 58 | for (auto& thr : thrs) { 59 | thr.join(); 60 | } 61 | auto end_time = std::chrono::steady_clock::now(); 62 | time_cuda_sync = std::chrono::duration<double, std::micro>(end_time - start_time).count(); // duration ratio was garbled in this dump; microseconds assumed 63 | } 64 | 65 | { 66 | std::vector<std::thread> thrs; 67 | auto start_time = std::chrono::steady_clock::now(); 68 | for (int i = 0; i < num_thrs; ++i) { 69 | thrs.emplace_back(run_flag, num_iter, streams[i], flags + i); 70 | } 71 | for (auto& thr : thrs) { 72 | thr.join(); 73 | } 74 | auto end_time = std::chrono::steady_clock::now(); 75 | time_flag = std::chrono::duration<double, std::micro>(end_time - start_time).count(); 76 | } 77 | 78 | printf("%f,%f\n", time_cuda_sync, time_flag); 79 | } 80 | 81 | --------------------------------------------------------------------------------
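The two benchmarks above probe the same question — how quickly the host can learn that a kernel finished — via three mechanisms: cuda_callback_benchmark.cu uses cudaLaunchHostFunc callbacks, while cuda_sync_benchmark.cu pits cudaStreamSynchronize against busy-polling a host-pinned flag that the kernel raises after a __threadfence_system(). For readers who want to try the flag trick outside the benchmark harness, here is a self-contained sketch (our own file, not part of the repo):

#include <cuda_runtime.h>

#include <cstdio>

// The kernel publishes completion by writing to host-pinned memory.
__global__ void set_flag(volatile int* flag) {
    __threadfence_system();  // make prior device writes visible to the host
    *flag = 1;
}

int main() {
    volatile int* flag;
    cudaMallocHost(&flag, sizeof(int));  // pinned allocation, directly readable by the host
    *flag = 0;

    set_flag<<<1, 1>>>(flag);
    while (*flag == 0) {}    // spin on the flag instead of calling into the driver

    std::printf("kernel completion observed via flag\n");
    cudaFreeHost((void*)flag);  // cast drops volatile for the deallocation API
}

cuda_sync_benchmark.cu prints the two totals as "time_cuda_sync,time_flag" (microseconds, assuming the duration ratio noted in the listing above); dividing each by num_iter gives a rough per-launch round-trip cost for the two mechanisms under the chosen thread count.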
/tests/simple/mmap_mlock_limit.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <cerrno> 4 | 5 | #include <fcntl.h> 6 | #include <sys/mman.h> 7 | #include <unistd.h> 8 | 9 | int main(int argc, char** argv) { 10 | unsigned num = atoi(argv[1]); 11 | unsigned base_size = atoi(argv[2]); 12 | 13 | int shm_fd = shm_open("test_mmap_mlock_limit", O_RDWR | O_CREAT, 0600); 14 | 15 | if (shm_fd == -1) { 16 | printf("shm_open error: %d\n", errno); 17 | } 18 | 19 | size_t total_size = 0; 20 | for (int i = 0; i < num; ++i) { 21 | size_t this_size = base_size * (i + 1); 22 | total_size += this_size; 23 | 24 | int trunc_ret = ftruncate(shm_fd, total_size); 25 | if (trunc_ret == -1) { 26 | printf("ftruncate error: %d, i = %d, total_size: %lu, this_size: %lu\n", errno, i, total_size, this_size); 27 | break; 28 | } 29 | 30 | void* ptr = mmap(nullptr, this_size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, total_size - this_size); 31 | if (ptr == MAP_FAILED) { 32 | printf("mmap error: %d, i = %d, total_size: %lu, this_size: %lu\n", errno, i, total_size, this_size); 33 | break; 34 | } 35 | 36 | int mlock_ret = mlock(ptr, this_size); 37 | if (mlock_ret == -1) { 38 | printf("mlock error: %d, i = %d, total_size: %lu, this_size: %lu\n", errno, i, total_size, this_size); 39 | break; 40 | } 41 | } 42 | 43 | shm_unlink("test_mmap_mlock_limit"); 44 | } 45 | 46 | -------------------------------------------------------------------------------- /tests/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(workload_pregen workload_pregen.cpp) 2 | install(TARGETS workload_pregen DESTINATION bin) -------------------------------------------------------------------------------- /tools/calculate_jains_fairness_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import numpy as np 4 | import argparse 5 | 6 | if __name__ == "__main__": 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-o', '--output_path', dest='output_path'); 9 | parser.add_argument('-b', '--input_baseline_path', dest='input_baseline_paths', action='append'); 10 | parser.add_argument('-i', '--input_path', dest='input_path'); 11 | 12 | args = parser.parse_args() 13 | 14 | data = np.genfromtxt(args.input_path, delimiter=',') 15 | baseline_data = [np.genfromtxt(path, delimiter=',') for path in args.input_baseline_paths] 16 | 17 | baseline_latencies = np.array([x[2] for x in baseline_data], dtype=float) 18 | 19 | slowdown_factors = data[:, 9::7] / baseline_latencies 20 | 21 | fairness_indices = np.sum(slowdown_factors, axis=1, keepdims=True) ** 2. / (np.sum(slowdown_factors ** 2, axis=1, keepdims=True) * len(baseline_latencies)) 22 | 23 | results = np.concatenate([data[:, 0:1], fairness_indices, slowdown_factors], axis=1) 24 | 25 | np.savetxt(args.output_path, results, delimiter=',') -------------------------------------------------------------------------------- /tools/parse_input_kelvin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def parse_input_kelvin(path, xaxis_name, yaxis_names, model_ids): 4 | yaxis_name2offset = {'Mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 5 | 6 | data = np.genfromtxt(path, delimiter=',') 7 | 8 | if xaxis_name == 'throughput': 9 | x_data = data[:, 1] / data[:, 2] * 1000000 10 | else: # sending rate 11 | x_data = 1000000.
/ data[:, 0] 12 | 13 | y_data = [[data[:, model_id * 7 + yaxis_name2offset[yaxis_name] + 3] / 1000. for yaxis_name in yaxis_names] for model_id in model_ids] 14 | 15 | return x_data, y_data 16 | -------------------------------------------------------------------------------- /tools/plot_block_exec_times_cdf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import sys 6 | 7 | if __name__ == "__main__": 8 | output_path = sys.argv[1] 9 | input_paths = sys.argv[2::2] 10 | labels = sys.argv[3::2] 11 | num_inputs = len(input_paths) 12 | 13 | data = [np.genfromtxt(path, delimiter=' ') for path in input_paths] 14 | 15 | for i in range(num_inputs): 16 | exec_times = data[i][:, 1] - data[i][:, 0] 17 | print('Mean:', np.mean(exec_times)) 18 | exec_times = np.sort(exec_times) 19 | plt.plot(exec_times, np.arange(len(exec_times)) / len(exec_times), '-', label=labels[i]) 20 | 21 | plt.legend() 22 | #plt.ylim(0, 1000000) 23 | 24 | plt.savefig(output_path) 25 | 26 | 27 | -------------------------------------------------------------------------------- /tools/plot_latency_fairness_threshold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import sys 6 | import argparse 7 | 8 | fmts = ['X-', 'o-', '^-', 's-', 'D-', 'v-', 'p-', '*-', 'H-'] 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('-o', '--output_path', dest='output_path'); 14 | parser.add_argument('-i', '--input_path', dest='input_paths', action='append'); 15 | parser.add_argument('-a', '--algo_name', dest='algo_names', action='append'); 16 | parser.add_argument('-l', '--line', dest='lines', type=int, action='append'); 17 | parser.add_argument('-n', '--name', dest='names', action='append'); 18 | parser.add_argument('--yaxis', dest='yaxises', choices=['Mean', 'p50', 'p90', 'p95', 'p99'], action='append', type=str); 19 | parser.add_argument('-x', '--xlim', dest='xlim', type=float); 20 | parser.add_argument('-y', '--ylim', dest='ylim', type=float); 21 | 22 | args = parser.parse_args() 23 | 24 | yaxis_name2offset = {'Mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 25 | 26 | num_inputs = len(args.input_paths) 27 | 28 | data = [np.genfromtxt(path, delimiter=',') for path in args.input_paths] 29 | 30 | #plt.rcParams.update({'font.size': 6}) 31 | plt.figure(figsize=(6.4, 3.9552)) 32 | 33 | for i in range(num_inputs): 34 | for line, name, fmt in zip(args.lines, args.names, fmts): 35 | for yaxis in args.yaxises: 36 | plt.plot(data[i][:, 0], data[i][:, line * 7 + yaxis_name2offset[yaxis] + 3] / 1000., fmt, label=name, linewidth=1, markersize=2) 37 | 38 | plt.gca().invert_xaxis() 39 | plt.gca().set_xlim(500, 0) 40 | #plt.xlim(500, 0) 41 | plt.xlabel('Less Fair <- Fairness Threshold -> More Fair') 42 | if len(args.yaxises) == 1: 43 | plt.ylabel(args.yaxises[0] + ' Latency (ms)') 44 | else: 45 | plt.ylabel('Latency (ms)') 46 | 47 | plt.legend() 48 | #if args.xlim is not None: 49 | # plt.xlim(0, args.xlim) 50 | #if args.ylim is not None: 51 | # plt.ylim(0, args.ylim) 52 | 53 | plt.savefig(args.output_path, bbox_inches='tight') 54 | 55 | -------------------------------------------------------------------------------- /tools/plot_latency_throughput.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python3 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import sys 6 | import argparse 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument('-o', '--output_path', dest='output_path'); 12 | parser.add_argument('-i', '--input_path', dest='input_paths', action='append'); 13 | parser.add_argument('-a', '--algo_name', dest='algo_names', action='append'); 14 | parser.add_argument('-l', '--line', dest='lines', type=int, action='append'); 15 | parser.add_argument('-n', '--name', dest='names', action='append'); 16 | parser.add_argument('--xlim', dest='xlim', type=float); 17 | parser.add_argument('--ylim', dest='ylim', type=float); 18 | parser.add_argument('--xaxis', choices=['throughput', 'rate'], type=str); 19 | parser.add_argument('--yaxis', dest='yaxises', choices=['mean', 'p50', 'p90', 'p95', 'p99'], action='append', type=str); 20 | 21 | args = parser.parse_args() 22 | 23 | yaxis_name2offset = {'mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 24 | 25 | num_inputs = len(args.input_paths) 26 | 27 | data = [np.genfromtxt(path, delimiter=',') for path in args.input_paths] 28 | 29 | if args.xaxis == 'throughput': 30 | x_axis = [15000. / x[:, 1] * 1000000. for x in data] 31 | plt.xlabel('Throughput (req/s)') 32 | else: 33 | x_axis = [1000000. / x[:, 0] for x in data] 34 | plt.xlabel('Sending rate (req/s)') 35 | 36 | plt.ylabel('Latency (us)') 37 | 38 | for i in range(num_inputs): 39 | for line, name in zip(args.lines, args.names): 40 | for yaxis in args.yaxises: 41 | plt.errorbar(x_axis[i], data[i][:, line * 7 + yaxis_name2offset[yaxis] + 2], data[i][:, line * 7 + 6 + 2], label=args.algo_names[i] + ' ' + name + ' ' + yaxis, fmt='x-', linewidth=1, markersize=2, elinewidth=0) 42 | 43 | plt.legend() 44 | if args.xlim is not None: 45 | plt.xlim(0, args.xlim) 46 | if args.ylim is not None: 47 | plt.ylim(0, args.ylim) 48 | 49 | plt.savefig(args.output_path) 50 | 51 | -------------------------------------------------------------------------------- /tools/plot_resnet18_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "$(dirname "$0")" 4 | script_path=`pwd` 5 | cd - 6 | 7 | python3 $script_path/plot_latency_throughput_subplots.py \ 8 | -o resnet18_inception_v3_prop_newmix3_lns2.pdf \ 9 | -i resnet18_inception_v3_prop_direct_singlestream_newmix3_3_lns2.txt \ 10 | -a 'CUDA-SS' \ 11 | -i resnet18_inception_v3_prop_direct_multistream_newmix3_1_lns2.txt \ 12 | -a 'CUDA-MS' \ 13 | -i resnet18_inception_v3_prop_mps_newmix3_4_lns2.txt \ 14 | -a 'MPS' \ 15 | -i resnet18_inception_v3_prop_singlestream_newmix3_no_mnist_fifo_2_lns2.txt \ 16 | -a 'Paella-SS' \ 17 | -i resnet18_inception_v3_prop_newmix3_fifo_1_lns2.txt \ 18 | -a 'Paella-MS-jbj' \ 19 | -i resnet18_inception_v3_prop_newmix3_fifo2_1_lns2.txt \ 20 | -a 'Paella-MS-kbk' \ 21 | -i resnet18_inception_v3_prop_newmix3_full3_1_lns2.txt \ 22 | -a 'Paella' \ 23 | -m 0 -n All \ 24 | -m 1 -n ResNet-18 \ 25 | -m 2 -n InceptionV3 \ 26 | --xaxis throughput \ 27 | --yaxis p99 \ 28 | --subplotx 1 \ 29 | --subploty 3 \ 30 | --no-xlabel \ 31 | --legend_subplot 2 \ 32 | --ylim 1000 \ 33 | --height 1.4 34 | #--height 1.6 35 | 36 | python3 $script_path/plot_latency_throughput_subplots.py \ 37 | -o resnet18_inception_v3_prop_newmix3_lns1.5.pdf \ 38 | -i resnet18_inception_v3_prop_direct_singlestream_newmix3_3_lns1.5.txt \ 39 | -a 'CUDA-SS' \ 40 | -i resnet18_inception_v3_prop_direct_multistream_newmix3_1_lns1.5.txt \ 41 | -a 'CUDA-MS' \ 
42 | -i resnet18_inception_v3_prop_mps_newmix3_4_lns1.5.txt \ 43 | -a 'MPS' \ 44 | -i resnet18_inception_v3_prop_singlestream_newmix3_no_mnist_fifo_2_lns1.5.txt \ 45 | -a 'Paella-SS' \ 46 | -i resnet18_inception_v3_prop_newmix3_fifo_1_lns1.5.txt \ 47 | -a 'Paella-MS-jbj' \ 48 | -i resnet18_inception_v3_prop_newmix3_fifo2_1_lns1.5.txt \ 49 | -a 'Paella-MS-kbk' \ 50 | -i resnet18_inception_v3_prop_newmix3_full3_1_lns1.5.txt \ 51 | -a 'Paella' \ 52 | -m 0 -n All \ 53 | -m 1 -n ResNet-18 \ 54 | -m 2 -n InceptionV3 \ 55 | --xaxis throughput \ 56 | --yaxis p99 \ 57 | --subplotx 1 \ 58 | --subploty 3 \ 59 | --no-title \ 60 | --no-legend \ 61 | --ylim 1000 \ 62 | --height 1.4 63 | #--height 1.6 64 | 65 | -------------------------------------------------------------------------------- /tools/run_all_direct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | model_path=$2 5 | ln_sigma=$3 6 | suffix=$4 7 | 8 | cd "$(dirname "$0")"/.. 9 | abs_path="`pwd`" 10 | 11 | cd release/tests/simple 12 | 13 | echo "**** Running all-direct with ln_sigma=$ln_sigma, suffix=$suffix" 14 | 15 | for seed in {1,}; do 16 | #for i in {3000,6000,8000,10000,12000,14000,16000,18000,20000,22000,50000,100000,200000,500000}; do 17 | #for i in {3000,}; do 18 | #for i in {25000,50000,100000,200000,500000}; do 19 | #for i in {0,}; do 20 | for i in {143,154,167,182,200,222,250,286,333,400,500,667,1000,1053,1111,1176,1250,1333,1429,1538,1667,1818,2000}; do 21 | #ncu -f --set full --profile-from-start on -o "${res_dir}/all_equal_direct${suffix}.ncu" \ 22 | #nsys profile -o "${res_dir}/all_equal_direct${suffix}.nsys" \ 23 | ./test_tvm_direct_concurrent \ 24 | --iat $i \ 25 | --ln_sigma $ln_sigma \ 26 | --seed $seed \ 27 | --output_path "${res_dir}/all_equal_direct${suffix}.txt" \ 28 | --num_jobs 15000 \ 29 | --concurrency 60 \ 30 | "${model_path}/mnist-8-cuda-pack.so" 0.759 \ 31 | "${model_path}/mobilenetv2-7-cuda-pack.so" 0.0636 \ 32 | "${model_path}/densenet-9-cuda-pack.so" 0.024 \ 33 | "${model_path}/googlenet-9-cuda-pack.so" 0.00289 \ 34 | "${model_path}/inception_v3-cuda-pack.so" 0.00383 \ 35 | "${model_path}/resnet18-v2-7-cuda-pack.so" 0.0657 \ 36 | "${model_path}/resnet34-v2-7-cuda-pack.so" 0.0382 \ 37 | "${model_path}/resnet50-v2-7-cuda-pack.so" 0.0187 \ 38 | "${model_path}/squeezenet1.1-7-cuda-pack.so" 0.02408 39 | #--num_jobs 500 \ 40 | #--concurrency 15 \ 41 | #"${model_path}/mnist-8-cuda-pack.so" 0.112 \ 42 | #"${model_path}/mobilenetv2-7-cuda-pack.so" 0.111 \ 43 | #"${model_path}/densenet-9-cuda-pack.so" 0.111 \ 44 | #"${model_path}/googlenet-9-cuda-pack.so" 0.111 \ 45 | #"${model_path}/inception_v3-cuda-pack.so" 0.111 \ 46 | #"${model_path}/resnet18-v2-7-cuda-pack.so" 0.111 \ 47 | #"${model_path}/resnet34-v2-7-cuda-pack.so" 0.111 \ 48 | #"${model_path}/resnet50-v2-7-cuda-pack.so" 0.111 \ 49 | #"${model_path}/squeezenet1.1-7-cuda-pack.so" 0.111 50 | done 51 | done 52 | 53 | -------------------------------------------------------------------------------- /tools/run_fairness_dummy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 
8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | for seed in {1,}; do 17 | for i in {0,}; do 18 | #for f in {0,1,2,3,4,5,10,15,20,1000000}; do 19 | #for f in {100,1000,10000}; do 20 | for f in {2000,4000,6000,8000}; do 21 | taskset -c 4 ./server server $f 1 & 22 | SERVER_PID=$! 23 | sleep 5 24 | 25 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 26 | --server_name server \ 27 | --iat $i \ 28 | --ln_sigma $ln_sigma \ 29 | --concurrency 100 \ 30 | --num_jobs 3000 \ 31 | --start_record_num 0 \ 32 | --seed $seed \ 33 | --prefix "${res_dir}/dummy_fairness${suffix}" \ 34 | --fairness $f \ 35 | --iat_n \ 36 | --ln_sigma_n \ 37 | --fairness_n \ 38 | --fairness_g \ 39 | "${abs_path}/release/jobs/dummy_short/libjob_dummy_short.so" 0.7 50 \ 40 | "${abs_path}/release/jobs/dummy_long/libjob_dummy_long.so" 0.3 50 41 | wait 42 | done 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /tools/run_fairness_mnist_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | for seed in {1,}; do 17 | #for i in {8000,}; do 18 | for i in {0,}; do 19 | #for f in {0.003,0.03,0.3,3,10,15,20,25,30,60,300}; do 20 | #for f in {0.00003,0.0003}; do 21 | #for f in {0.03,0.3,3,30,300,3000,30000}; do 22 | #for f in {0.03,0.3,3,5,10,15,20,25,30,300,3000,10000,15000,20000,30000}; do 23 | for f in {0.03,30000,}; do 24 | #for f in {300,3000,30000}; do 25 | #for f in {10000,15000,20000}; do 26 | #for f in {0.03,}; do 27 | taskset -c 4 ./server server $f 1 & 28 | SERVER_PID=$! 29 | sleep 5 30 | 31 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 100 3000 0 mnist_inception_v3_0.7_0.3_i${i}_fairness${suffix}.txt tmp2.txt mnist_inception_v3_0.7_0.3_fair${f}${suffix}_profile_$i mnist_inception_v3_0.7_0.3_fair${f}${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.7 50 "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.3 50 32 | wait 33 | done 34 | done 35 | done 36 | -------------------------------------------------------------------------------- /tools/run_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | for seed in {1,}; do 17 | #for i in {2000,2200,2400,2600,2800,3000,3200,3400,3600,3800,4000}; do 18 | #for i in {30000,}; do 19 | #for i in {0,}; do 20 | #for i in {30000,40000,50000,60000,80000,100000}; do 21 | for i in {120000,140000}; do 22 | taskset -c 4 ./server server 1000000 1 & 23 | SERVER_PID=$! 
24 | sleep 5 25 | 26 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 50 3000 0 inception_v3${suffix}.txt tmp2.txt inception_v3${suffix}_profile_$i inception_v3${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 1 50 27 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 28 | # --server_name server \ 29 | # --iat $i \ 30 | # --ln_sigma $ln_sigma \ 31 | # --concurrency 50 \ 32 | # --num_jobs 3000 \ 33 | # --start_record_num 0 \ 34 | # --seed $seed \ 35 | # --prefix inception_v3${suffix} \ 36 | # --fairness 1000000 \ 37 | # --iat_n \ 38 | # --iat_g \ 39 | # --ln_sigma_n \ 40 | # "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 1 50 41 | 42 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 1 3000 0 inception_v3${suffix}.txt tmp2.txt inception_v3${suffix}_profile_$i inception_v3${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 1 1 43 | 44 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 45 | --server_name server \ 46 | --iat $i \ 47 | --ln_sigma $ln_sigma \ 48 | --concurrency 1 \ 49 | --num_jobs 3000 \ 50 | --start_record_num 0 \ 51 | --seed $seed \ 52 | --prefix "${res_dir}/inception_v3${suffix}" \ 53 | --fairness 1000000 \ 54 | --iat_n \ 55 | --iat_g \ 56 | --ln_sigma_n \ 57 | --concurrency_n \ 58 | "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 1 1 59 | wait 60 | done 61 | done 62 | -------------------------------------------------------------------------------- /tools/run_mnist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 
8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running mnist with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | #for i in {2000,2200,2400,2600,2800,3000,3200,3400,3600,3800,4000}; do 20 | #for i in {200,}; do 21 | for i in {0,}; do 22 | taskset -c 4 ./server server 1000000 1 & 23 | sleep 5 24 | 25 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 50 3000 0 mnist${suffix}.txt tmp2.txt mnist${suffix}_profile_$i mnist${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 50 26 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 3 3000 0 mnist${suffix}.txt tmp2.txt mnist${suffix}_profile_$i mnist${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 3 27 | 28 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 29 | --server_name server \ 30 | --iat $i \ 31 | --ln_sigma $ln_sigma \ 32 | --start_record_num 0 \ 33 | --seed $seed \ 34 | --prefix "${res_dir}/mnist${suffix}" \ 35 | --fairness 1000000 \ 36 | --iat_n \ 37 | --iat_g \ 38 | --ln_sigma_n \ 39 | --num_jobs 15000 \ 40 | --concurrency 641 \ 41 | "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 641 42 | #--concurrency 1 \ 43 | #"${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 1 44 | wait 45 | done 46 | done 47 | -------------------------------------------------------------------------------- /tools/run_mnist_googlenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running all with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | #for i in {3000,6000,8000,10000,12000,14000,16000,18000,20000,22000,50000,100000,200000,500000}; do 20 | for i in {40000,50000,100000,250000,500000}; do 21 | taskset -c 4 ./server server 1000000 1 & 22 | SERVER_PID=$! 
23 | sleep 5 24 | 25 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 15 3000 0 all_equal${suffix}.txt tmp2.txt all_equal${suffix}_profile_$i all_equal${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.125 15 "${abs_path}/release/jobs/tvm_ultraface320/libjob_tvm_ultraface320.so" 0.125 15 "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 0.125 15 "${abs_path}/release/jobs/tvm_densenet121/libjob_tvm_densenet121.so" 0.125 15 "${abs_path}/release/jobs/tvm_resnet50/libjob_tvm_resnet50.so" 0.125 15 "${abs_path}/release/jobs/tvm_googlenet/libjob_tvm_googlenet.so" 0.125 15 "${abs_path}/release/jobs/tvm_arcfaceresnet100/libjob_tvm_arcfaceresnet100.so" 0.125 15 "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.125 15 26 | 27 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 28 | --server_name server \ 29 | --iat $i \ 30 | --ln_sigma $ln_sigma \ 31 | --concurrency 1 \ 32 | --num_jobs 1000 \ 33 | --start_record_num 0 \ 34 | --seed $seed \ 35 | --prefix "${res_dir}/mnist_googlenet_0.7_0.3${suffix}" \ 36 | --fairness 1000000 \ 37 | --iat_n \ 38 | --iat_g \ 39 | --ln_sigma_n \ 40 | "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.7 1 \ 41 | "${abs_path}/release/jobs/tvm_googlenet/libjob_tvm_googlenet.so" 0.3 1 42 | wait 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /tools/run_mnist_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd .. 8 | abs_path="`pwd`" 9 | cd - 10 | 11 | cd ../release/src/server 12 | 13 | SERVER_PID=0 14 | 15 | trap "kill $SERVER_PID; exit" INT 16 | 17 | echo "**** Running mnist_inception_v3 with ln_sigma=$ln_sigma, suffix=$suffix" 18 | 19 | for seed in {1,}; do 20 | #for i in {8000,10000,12000,14000,16000,18000,20000}; do 21 | for i in {25000,33000,50000,100000}; do 22 | taskset -c 4 ./server server 1000000 1 & 23 | SERVER_PID=$! 24 | sleep 5 25 | 26 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 50 3000 0 mnist_inception_v3_0.7_0.3${suffix}.txt tmp2.txt mnist_inception_v3_0.7_0.3${suffix}_profile_$i mnist_inception_v3_0.7_0.3${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.7 50 "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.3 50 27 | wait 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /tools/run_mnist_resnet50.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running all with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | #for i in {250,500,1000,1400,1600,1800,2000,2500,3000}; do 20 | for i in {2000,2500,3000}; do 21 | taskset -c 4 ./server server 1000000 1 & 22 | SERVER_PID=$! 
23 | sleep 5 24 | 25 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 15 3000 0 all_equal${suffix}.txt tmp2.txt all_equal${suffix}_profile_$i all_equal${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.125 15 "${abs_path}/release/jobs/tvm_ultraface320/libjob_tvm_ultraface320.so" 0.125 15 "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 0.125 15 "${abs_path}/release/jobs/tvm_densenet121/libjob_tvm_densenet121.so" 0.125 15 "${abs_path}/release/jobs/tvm_resnet50/libjob_tvm_resnet50.so" 0.125 15 "${abs_path}/release/jobs/tvm_googlenet/libjob_tvm_googlenet.so" 0.125 15 "${abs_path}/release/jobs/tvm_arcfaceresnet100/libjob_tvm_arcfaceresnet100.so" 0.125 15 "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.125 15 26 | 27 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 28 | --server_name server \ 29 | --iat $i \ 30 | --ln_sigma $ln_sigma \ 31 | --concurrency 50 \ 32 | --num_jobs 3000 \ 33 | --start_record_num 0 \ 34 | --seed $seed \ 35 | --prefix "${res_dir}/mnist_resnet50_0.7_0.3${suffix}" \ 36 | --fairness 1000000 \ 37 | --iat_n \ 38 | --iat_g \ 39 | --ln_sigma_n \ 40 | "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.7 50 \ 41 | "${abs_path}/release/jobs/tvm_resnet50/libjob_tvm_resnet50.so" 0.3 50 42 | wait 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /tools/run_mnist_sched_sleep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running mnist with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | for i in {0,}; do 20 | #for s in {0,10,100,1000,10000,}; do # 0,0.05,0.4,3,30 us 21 | #for s in {100000,1000000}; do # ? us 22 | for s in {0,}; do 23 | taskset -c 4 ./server server 1000000 1 $s & 24 | SERVER_PID=$! 25 | sleep 5 26 | 27 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 1 3000 0 mnist_sched_sleep${suffix}.txt tmp2.txt mnist_sched_sleep${s}${suffix}_profile_$i mnist_sched_sleep${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 1 28 | 29 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 30 | --server_name server \ 31 | --iat $i \ 32 | --ln_sigma $ln_sigma \ 33 | --concurrency 1 \ 34 | --num_jobs 3000 \ 35 | --start_record_num 0 \ 36 | --seed $seed \ 37 | --prefix "${res_dir}/mnist_sched_sleep${suffix}" \ 38 | --fairness 1000000 \ 39 | --sched_sleep $s \ 40 | --sched_sleep_n \ 41 | --sched_sleep_g \ 42 | "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 1 43 | wait 44 | done 45 | done 46 | done 47 | -------------------------------------------------------------------------------- /tools/run_mobilenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 
8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | for seed in {1,}; do 17 | #for i in {2000,2200,2400,2600,2800,3000,3200,3400,3600,3800,4000}; do 18 | for i in {0,}; do 19 | #taskset -c 4 \ 20 | # ncu -f --set full --profile-from-start off -o "${res_dir}/mobilenet${suffix}_lns${ln_sigma}_con1.ncu" \ 21 | ./server server 1000000 1 & 22 | SERVER_PID=$! 23 | sleep 5 24 | 25 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i 50 3000 1000 mobilenet${suffix}.txt tmp2.txt mobilenet${suffix}_profile_$i mobilenet${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 1 50 26 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 27 | --server_name server \ 28 | --iat $i \ 29 | --ln_sigma $ln_sigma \ 30 | --concurrency 1 \ 31 | --num_jobs 3000 \ 32 | --start_record_num 0 \ 33 | --seed $seed \ 34 | --prefix "${res_dir}/mobilenet${suffix}" \ 35 | --fairness 1000000 \ 36 | --iat_n \ 37 | --iat_g \ 38 | --ln_sigma_n \ 39 | --concurrency_n \ 40 | "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 1 1 41 | wait 42 | done 43 | done 44 | -------------------------------------------------------------------------------- /tools/run_mobilenet_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running mobilenet and inception_v3 with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | #for i in {3000,6000,8000,10000,12000,14000,16000,18000,20000,22000,50000,100000,200000,500000}; do 20 | for i in {3000,10000,18000,22000,50000,100000,500000}; do 21 | taskset -c 4 ./server server 1000000 1 & 22 | SERVER_PID=$! 23 | sleep 5 24 | 25 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 26 | --server_name server \ 27 | --iat $i \ 28 | --ln_sigma $ln_sigma \ 29 | --concurrency 1 \ 30 | --num_jobs 1000 \ 31 | --start_record_num 0 \ 32 | --seed $seed \ 33 | --prefix "${res_dir}/mobilenet_inception_v3_0.7_0.3${suffix}" \ 34 | --fairness 1000000 \ 35 | --iat_n \ 36 | --iat_g \ 37 | --ln_sigma_n \ 38 | "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 0.7 1 \ 39 | "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.3 1 40 | wait 41 | done 42 | done 43 | -------------------------------------------------------------------------------- /tools/run_ultraface_arcface.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd .. 8 | abs_path="`pwd`" 9 | cd - 10 | 11 | cd ../release/src/server 12 | 13 | SERVER_PID=0 14 | 15 | trap "kill $SERVER_PID; exit" INT 16 | 17 | echo "**** Running ultraface_arcface with ln_sigma=$ln_sigma, suffix=$suffix" 18 | 19 | for seed in {1,}; do 20 | #for i in {2000,2200,2400,2600,2800,3000,3200,3400,3600,3800,4000,4200,4400,4600,4800,5000,5200,5400,5600}; do 21 | #for i in {17000,20000,25000,33000,50000,100000}; do 22 | for i in {25000,50000,100000}; do 23 | taskset -c 4 ./server server 1000000 1 & 24 | SERVER_PID=$! 
25 | sleep 5 26 | 27 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 30 3000 0 ultraface_arcface_0.7_0.3${suffix}.txt tmp2.txt ultraface_arcface_0.7_0.3${suffix}_profile_$i ultraface_arcface_0.7_0.3${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_ultraface320/libjob_tvm_ultraface320.so" 0.7 30 "${abs_path}/release/jobs/tvm_arcfaceresnet100/libjob_tvm_arcfaceresnet100.so" 0.3 30 28 | wait 29 | done 30 | done 31 | --------------------------------------------------------------------------------
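A closing note on the metric behind tools/calculate_jains_fairness_index.py listed earlier: given per-model slowdown factors s_1, ..., s_n (each model's measured latency divided by its isolated-baseline latency), the script computes Jain's fairness index

J(s_1, \ldots, s_n) = \frac{\left( \sum_{i=1}^{n} s_i \right)^{2}}{n \sum_{i=1}^{n} s_i^{2}}, \qquad \frac{1}{n} \le J \le 1,

which equals 1 when every model is slowed down equally and tends toward 1/n as the slowdown concentrates on a single model.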