├── CMakeLists.txt ├── LICENSE ├── README.md ├── app ├── CMakeLists.txt └── client.cpp ├── experiments ├── fig2 │ ├── Makefile │ ├── README.md │ ├── hol.cu │ ├── plot.py │ ├── plot_qlen.py │ ├── plot_ts.py │ └── run.py └── triton │ ├── README.md │ └── server.sh ├── include └── llis │ ├── client │ ├── client.h │ ├── io_shm_entry.h │ ├── job_instance_ref.h │ ├── job_ref.h │ └── profiler_client.h │ ├── ipc │ ├── atomic_lock.h │ ├── atomic_wrapper.h │ ├── defs.h │ ├── name_format.h │ ├── shm_channel.h │ ├── shm_channel_impl.h │ ├── shm_primitive_channel.h │ ├── shm_primitive_channel_impl.h │ ├── threadfence_wrapper.h │ └── unix_datagram_socket.h │ ├── job │ ├── context.h │ ├── coroutine_job.h │ ├── finished_block_notifier.h │ ├── instrument.h │ ├── instrument_info.h │ ├── job.h │ └── utils.h │ ├── server │ ├── client_connection.h │ ├── gpu_resources.h │ ├── profiler.h │ ├── registered_job.h │ ├── scheduler.h │ ├── scheduler_fifo.h │ ├── scheduler_fifo2.h │ ├── scheduler_full.h │ ├── scheduler_full2.h │ ├── scheduler_full3.h │ ├── server.h │ └── sm_resources.h │ └── utils │ ├── align.h │ ├── error.h │ ├── gpu.h │ ├── logging.hh │ ├── ops.h │ ├── path.h │ └── time.hh ├── jobs ├── CMakeLists.txt ├── cnn │ ├── CMakeLists.txt │ ├── layer.cu │ ├── layer.h │ ├── main.cu │ └── mnist.h ├── dummy_10 │ ├── CMakeLists.txt │ └── dummy.cu ├── dummy_11 │ ├── CMakeLists.txt │ └── dummy.cu ├── dummy_20 │ ├── CMakeLists.txt │ └── dummy.cu ├── dummy_21 │ ├── CMakeLists.txt │ └── dummy.cu ├── dummy_long │ ├── CMakeLists.txt │ └── dummy_long.cu ├── dummy_short │ ├── CMakeLists.txt │ └── dummy_short.cu ├── helloworld │ ├── CMakeLists.txt │ └── helloworld.cu ├── helloworld_coroutine │ ├── CMakeLists.txt │ └── helloworld_coroutine.cu ├── run_forever │ ├── CMakeLists.txt │ └── run_forever.cu ├── tvm_arcfaceresnet100 │ ├── CMakeLists.txt │ └── tvm_arcfaceresnet100.cpp ├── tvm_densenet121 │ ├── CMakeLists.txt │ └── tvm_densenet121.cpp ├── tvm_googlenet │ ├── CMakeLists.txt │ └── tvm_googlenet.cpp ├── tvm_inception_v3 │ ├── CMakeLists.txt │ └── tvm_inception_v3.cpp ├── tvm_mnist │ ├── CMakeLists.txt │ └── tvm_mnist.cpp ├── tvm_mobilenet │ ├── CMakeLists.txt │ └── tvm_mobilenet.cpp ├── tvm_resnet18 │ ├── CMakeLists.txt │ └── tvm_resnet18.cpp ├── tvm_resnet34 │ ├── CMakeLists.txt │ └── tvm_resnet34.cpp ├── tvm_resnet50 │ ├── CMakeLists.txt │ └── tvm_resnet50.cpp ├── tvm_squeezenet1_1 │ ├── CMakeLists.txt │ └── tvm_squeezenet1_1.cpp ├── tvm_ultraface320 │ ├── CMakeLists.txt │ └── tvm_ultraface320.cpp └── vec_add_coroutine │ ├── CMakeLists.txt │ └── vec_add_coroutine.cu ├── sosp23_artifact ├── README.md ├── gen_data_fig11_cuda.sh ├── gen_data_fig11_paella.sh ├── gen_data_fig11_triton.sh ├── gen_data_fig12_cuda.sh ├── gen_data_fig12_mps.sh ├── gen_data_fig12_paella.sh ├── gen_data_fig13.sh ├── plot_fig11.sh ├── plot_fig12.sh ├── plot_fig13.sh ├── setup │ ├── README.md │ ├── build_triton_docker.sh │ ├── dso_to_tf.py │ ├── install_dependencies.sh │ ├── install_llis_tvm.sh │ ├── install_triton_client.sh │ ├── onnx2tvm.py │ ├── onnx2tvm_all.sh │ ├── reset_all.sh │ ├── reset_llis_tvm.sh │ └── triton_docker │ │ ├── Dockerfile │ │ ├── build_tvm_tf.sh │ │ ├── convert_tvm_to_tf.sh │ │ ├── run_on_tf_docker.sh │ │ └── setup.sh ├── tf_models_config │ ├── densenet-9 │ │ └── config.pbtxt │ ├── googlenet-9 │ │ └── config.pbtxt │ ├── inception_v3 │ │ └── config.pbtxt │ ├── mobilenetv2-7 │ │ └── config.pbtxt │ ├── resnet18-v2-7 │ │ └── config.pbtxt │ ├── resnet34-v2-7 │ │ └── config.pbtxt │ ├── resnet50-v2-7 │ │ └── config.pbtxt │ └── 
squeezenet1.1-7 │ │ └── config.pbtxt ├── tools │ ├── merge_mps_results.py │ ├── parse_input_kelvin.py │ ├── parse_triton.py │ ├── plot_latency_fairness_threshold.py │ └── plot_latency_throughput_subplots.py ├── triton_server_launch.sh └── tvm_models_dim │ ├── densenet-9-cuda-pack.so.dim │ ├── googlenet-9-cuda-pack.so.dim │ ├── inception_v3-cuda-pack.so.dim │ ├── mnist-8-cuda-pack.so.dim │ ├── mobilenetv2-7-cuda-pack.so.dim │ ├── resnet18-v2-7-cuda-pack.so.dim │ ├── resnet34-v2-7-cuda-pack.so.dim │ ├── resnet50-v2-7-cuda-pack.so.dim │ └── squeezenet1.1-7-cuda-pack.so.dim ├── src ├── CMakeLists.txt ├── client │ ├── CMakeLists.txt │ ├── client.cpp │ ├── job_instance_ref.cpp │ ├── job_ref.cpp │ └── profiler_client.cpp ├── ipc │ ├── CMakeLists.txt │ ├── name_format.cpp │ ├── shm_channel.cu │ ├── shm_primitive_channel.cu │ └── unix_datagram_socket.cpp ├── job │ ├── CMakeLists.txt │ ├── context.cpp │ ├── finished_block_notifier.cu │ └── utils.cu └── server │ ├── CMakeLists.txt │ ├── client_connection.cpp │ ├── gpu_resources.cpp │ ├── profiler.cpp │ ├── registered_job.cpp │ ├── scheduler.cpp │ ├── scheduler_fifo.cpp │ ├── scheduler_fifo2.cpp │ ├── scheduler_full.cpp │ ├── scheduler_full2.cpp │ ├── scheduler_full3.cpp │ ├── server.cpp │ └── sm_resources.cpp ├── tests ├── CMakeLists.txt ├── client │ ├── CMakeLists.txt │ ├── client.cpp │ ├── client_concurrent_run_latencies.cpp │ ├── client_concurrent_run_latencies_set_load.cpp │ ├── client_concurrent_run_latencies_set_load_multi.cpp │ ├── client_concurrent_runs.cpp │ ├── client_single_latency.cpp │ └── raw_kernel_launch.cu ├── ipc │ ├── CMakeLists.txt │ └── shm_channel │ │ ├── CMakeLists.txt │ │ ├── latency │ │ ├── CMakeLists.txt │ │ ├── shmc_latency_read.cpp │ │ ├── shmc_latency_read_bare_atomic.cpp │ │ ├── shmc_latency_read_bare_atomic_loop.cpp │ │ ├── shmc_latency_read_loop.cpp │ │ ├── shmc_latency_write.cpp │ │ ├── shmc_latency_write_bare_atomic.cpp │ │ ├── shmc_latency_write_bare_atomic_loop.cpp │ │ ├── shmc_latency_write_loop.cpp │ │ └── shmpc_latency_gpu.cu │ │ ├── shmc_read.cpp │ │ ├── shmc_read_write.cpp │ │ ├── shmc_read_write_cpu_gpu.cu │ │ ├── shmc_read_write_same_proc.cpp │ │ └── shmc_write.cpp ├── simple │ ├── CMakeLists.txt │ ├── cuda_callback_benchmark.cu │ ├── cuda_sync_benchmark.cu │ ├── mmap_mlock_limit.cpp │ ├── tvm_direct_concurrent.cpp │ └── tvm_direct_multistream.cpp └── utils │ ├── CMakeLists.txt │ └── workload_pregen.cpp └── tools ├── calculate_jains_fairness_index.py ├── calculate_overhead_stack.py ├── cloudlab_setup.sh ├── parse_clockwork.py ├── parse_input_kelvin.py ├── parse_triton.py ├── plot_all_no_mnist.sh ├── plot_block_exec_times_cdf.py ├── plot_latency_fairness_threshold.py ├── plot_latency_throughput.py ├── plot_latency_throughput_subplots.py ├── plot_overhead_stack_graph.py ├── plot_resnet18_inception_v3.sh ├── run_all.sh ├── run_all_direct.sh ├── run_all_direct_multistream.sh ├── run_fairness_dummy.sh ├── run_fairness_mnist_inception_v3.sh ├── run_inception_v3.sh ├── run_mnist.sh ├── run_mnist_googlenet.sh ├── run_mnist_inception_v3.sh ├── run_mnist_resnet50.sh ├── run_mnist_sched_sleep.sh ├── run_mobilenet.sh ├── run_mobilenet_inception_v3.sh └── run_ultraface_arcface.sh /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(llis LANGUAGES CXX CUDA) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | cmake_policy(SET CMP0105 NEW) 8 | 9 | find_package(CUDAToolkit) 10 | 11 | include_directories(${CUDAToolkit_INCLUDE_DIRS}) 12 
| 13 | find_package(Boost REQUIRED system context program_options) 14 | 15 | find_package(spdlog REQUIRED) 16 | 17 | option(TVM_INCLUDE_DIR "Headers directory of TVM") 18 | 19 | find_package(tvm) 20 | if(tvm_FOUND) 21 | if(TVM_INCLUDE_DIR) 22 | include_directories(PUBLIC ${TVM_INCLUDE_DIR}) 23 | endif(TVM_INCLUDE_DIR) 24 | endif(tvm_FOUND) 25 | 26 | option(MEASURE_BLOCK_TIME "Enable measurement of block time" OFF) 27 | if(MEASURE_BLOCK_TIME) 28 | add_definitions(-DLLIS_MEASURE_BLOCK_TIME) 29 | endif(MEASURE_BLOCK_TIME) 30 | 31 | option(FINISHED_BLOCK_NOTIFICATION_AGG "Aggregate block notifications" ON) 32 | if(FINISHED_BLOCK_NOTIFICATION_AGG) 33 | add_definitions(-DLLIS_FINISHED_BLOCK_NOTIFICATION_AGG) 34 | endif(FINISHED_BLOCK_NOTIFICATION_AGG) 35 | 36 | option(ENABLE_PROFILER "Enable profiler" OFF) 37 | if(ENABLE_PROFILER) 38 | add_definitions(-DLLIS_ENABLE_PROFILER) 39 | endif(ENABLE_PROFILER) 40 | 41 | option(PRINT_LAUNCH_JOB_IPC_LATENCY "Print launch job IPC latency" OFF) 42 | if(PRINT_LAUNCH_JOB_IPC_LATENCY) 43 | add_definitions(-DPRINT_LAUNCH_JOB_IPC_LATENCY) 44 | endif(PRINT_LAUNCH_JOB_IPC_LATENCY) 45 | 46 | set(CMAKE_INSTALL_RPATH $ORIGIN $ORIGIN/../lib) 47 | 48 | include_directories(include) 49 | 50 | add_subdirectory(src) 51 | add_subdirectory(app) 52 | add_subdirectory(tests) 53 | add_subdirectory(jobs) 54 | 55 | install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/ DESTINATION include) 56 | 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 University of Pennsylvania | Distributed Systems Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Paella / LLIS 2 | 3 | This project was called LLIS at the very beginning, and so this name is used in the codebase. 4 | 5 | ## SOSP 2023 Artifact Evaluation 6 | 7 | Please refer to the [instructions](sosp23_artifact/README.md) in the `sosp23_artifact/` directory. 8 | 9 | ## Dependencies 10 | 11 | 1. Linux (tested on Ubuntu 22.04) 12 | 1. NVIDIA driver (tested on 535.54.03) 13 | 1. CUDA runtime (tested on 12.2.0) 14 | 1. GCC (tested on 11.3.0) 15 | 1. CMake (tested on 3.22.1) 16 | 1. Boost (tested on 1.82.0) 17 | 1. LLVM / Clang (tested on 14) 18 | 1. 
spdlog (tested on 1.11.0; 1.12.0 is known not to work)
19 | 1. [**tvm-llis**](https://github.com/eniac/tvm-llis) (Custom version of TVM modified to work with Paella)
20 | 
21 | ## Installation
22 | 
23 | ### Paella/LLIS server and libraries
24 | 
25 | ```
26 | mkdir build
27 | cd build
28 | cmake -DCMAKE_BUILD_TYPE=<build_type> -DCMAKE_CUDA_ARCHITECTURES=<cuda_arch> .. # e.g., Release; cuda_arch is 60 for 6.0, 75 for 7.5, etc
29 | make -j$(nproc) install
30 | ```
31 | 
32 | ### Custom TVM (tvm-llis)
33 | 
34 | The custom TVM depends on the Paella/LLIS libraries, so it can only be built after completing the previous step.
35 | 
36 | Please refer to [README-llis.md](https://github.com/eniac/tvm-llis/blob/v0.10.0-llis/README-llis.md) of [tvm-llis](https://github.com/eniac/tvm-llis) for instructions.
37 | 
38 | ### Paella/LLIS applications (e.g., client) and job adapters
39 | 
40 | Applications and job adapters depend on the custom TVM, so they can only be built after completing the previous step.
41 | 
42 | ```
43 | cmake .. -Utvm_FOUND # Find TVM again after we have installed it
44 | make -j$(nproc) install
45 | ```
46 | 
47 | 
-------------------------------------------------------------------------------- /app/CMakeLists.txt: --------------------------------------------------------------------------------
1 | add_executable(llis_app_client client.cpp $ $)
2 | target_link_libraries(llis_app_client spdlog::spdlog dl rt)
3 | install(TARGETS llis_app_client DESTINATION bin)
4 | 
-------------------------------------------------------------------------------- /experiments/fig2/Makefile: --------------------------------------------------------------------------------
1 | all:
2 | 	nvcc hol.cu -o fig2 -O3 -arch=sm_75 -Xptxas=-v
3 | 
4 | reveng: reverse_eng.cu
5 | 	nvcc reverse_eng.cu -o reveng -O3 -arch=sm_75 -Xptxas=-v
6 | 
-------------------------------------------------------------------------------- /experiments/fig2/plot_qlen.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | import os
3 | import re
4 | import numpy as np
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | import argparse
8 | 
9 | expected_jct = 316 * 8 # in ns
10 | 
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('exp_labels', nargs='+', type=str)
13 | args = parser.parse_args()
14 | 
15 | for exp_label in args.exp_labels:
16 |     print(f'Plotting data for {exp_label}')
17 |     re_string = rf"^{exp_label}-\d+-qlen-results\.csv$"  # raw string so \d is a regex class, not a string escape
18 |     valid = re.compile(re_string)
19 |     result_files = [f for f in os.listdir(os.curdir) if os.path.isfile(f) and re.match(valid, f)]
20 |     print(result_files)
21 |     for f in result_files:
22 |         df = pd.read_csv(f, delimiter='\t')
23 |         plt.plot(df.TIME, df.QLEN, label=f.split('qlen')[0].split('-')[-2])
24 | 
25 | 
26 | plt.legend()
27 | plt.xlabel('Sending time (seconds)')
28 | plt.ylabel('Queue length')
29 | 
30 | fname = f"{'-'.join(args.exp_labels)}-qlen.pdf"
31 | print(f'Storing plot in {fname}')
32 | plt.savefig(fname)
33 | 
-------------------------------------------------------------------------------- /experiments/fig2/plot_ts.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | 
3 | import os
4 | import re
5 | import numpy as np
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 | import argparse
9 | 
10 | pd.set_option('display.max_rows', 500)
11 | 
12 | expected_jct = 316 * 8 # in us
13 | 
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('csvfile', type=str)
16 | parser.add_argument('-i', '--ideal-jct', type=int, help='Ideal JCT in us', default=expected_jct)
17 | args = parser.parse_args()
18 | 
19 | df = pd.read_csv(args.csvfile, delimiter='\t')
20 | print(df)
21 | 
22 | plt.plot(df.index, df.JCT, '.', label='jobs latency')
23 | 
24 | if args.ideal_jct > 0:
25 |     plt.plot(df.index, [args.ideal_jct for i in range(df.shape[0])], label='ideal job latency')
26 | 
27 | plt.legend()
28 | plt.ylim(0)
29 | plt.xlabel('Job index')
30 | plt.ylabel('Latency (us)')
31 | fname = f'{args.csvfile}.pdf'
32 | print(f'Storing plot in {fname}')
33 | plt.savefig(fname)
34 | 
-------------------------------------------------------------------------------- /experiments/fig2/run.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | 
3 | import os
4 | import subprocess
5 | import argparse
6 | 
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('exp_label', type=str)
9 | parser.add_argument('mode', type=str)
10 | parser.add_argument('--num_hwq', type=int, default=32)
11 | parser.add_argument('--iterate-hwq', action='store_true', default=False)
12 | args = parser.parse_args()
13 | 
14 | def run_over_load(n_hwq: int):
15 |     for i in range(100, 2000, 200):
16 |         label = f'{args.exp_label}-{n_hwq}hwq'
17 |         cmd_args = ['./fig2', args.mode, label, str(1e9/i)] # interval in ns
18 |         p = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=dict(os.environ, CUDA_DEVICE_MAX_CONNECTIONS=str(n_hwq)))
19 |         while True:
20 |             line = p.stdout.readline()
21 |             print(line.decode('ascii'))
22 |             if not line:
23 |                 break
24 |         p.wait()
25 | 
26 | if args.iterate_hwq:
27 |     n_hwq = 1
28 |     while n_hwq <= 32:
29 |         run_over_load(n_hwq)
30 |         n_hwq *= 2
31 | else:
32 |     run_over_load(args.num_hwq)
33 | 
-------------------------------------------------------------------------------- /experiments/triton/README.md: --------------------------------------------------------------------------------
1 | # Overview
2 | 
3 | We use Triton 23.03 to run our experiments. We configure it to use our TVM, which our models depend on.
4 | We modify Triton's existing sample clients to use the same load-generation logic we use across the paper.
5 | 
6 | Dependencies
7 | ---
8 | - Triton and our models need the same TVM
9 | - Triton and TVM need the same TensorFlow
10 | 
11 | # Server
12 | 
13 | Run `./server.sh [models path] [TVM lib path] [CMake path] [LLIS lib path] [Boost lib path]`. This is the command we ran for our experiments:
14 | ```
15 | ./server.sh /home/maxdml/allis/models /home/maxdml/tvm_tf/ /home/kelvin/opt/cmake-3.22.3/ /home/kelvin/llis/ /home/kelvin/opt/boost-1.74.0/
16 | ```
17 | 
18 | In the container, run:
19 | ```
20 | LD_PRELOAD="/opt/tvm/build/libtvm_dso_op.so /opt/tritonserver/backends/tensorflow2/libtensorflow_cc.so /opt/tritonserver/backends/tensorflow2/libtensorflow_framework.so" tritonserver --model-repository=/models/newmix3/tensorflow --backend-config=tensorflow,version=2 --min-supported-compute-capability=7.5 --allow-grpc=true --backend-config=default-max-batch-size=0
21 | ```
22 | 
23 | # Client
24 | 
25 | We use a custom client built on Triton's client framework.
26 | 
27 | Build
28 | ---
29 | ```
30 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=`pwd`/install -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_GPU=ON -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON ..
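# The cmake invocation above only configures the client build. A minimal
# follow-up sketch (assumption: the standard CMake superbuild workflow of the
# Triton client repository; exact targets vary across releases):
#   make -j$(nproc)
# Built example clients then land under the configured CMAKE_INSTALL_PREFIX
# (./install here).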
31 | ``` 32 | 33 | Running 34 | --- 35 | - We use run.py to run experiments 36 | - run.py takes a config file describing the workload 37 | -------------------------------------------------------------------------------- /experiments/triton/server.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash -ex 2 | 3 | MODELS_PATH=$1 # path to TF-wrapped models 4 | LIBTVM_PATH=$2 # path to our TVM 5 | CMAKE_PATH=$3 # path to our cmake 6 | LLIS_PATH=$4 # path to ALLIS libraries 7 | LIBBOOST_PATH=$5 8 | 9 | docker run -it --gpus=1 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p8000:8000 -p8001:8001 -p8002:8002 -v${MODELS_PATH}:/models -v${LIBTVM_PATH}:/opt/tvm -v${CMAKE_PATH}:/opt/cmake -v${LLIS_PATH}:/opt/allis -v${LIBBOOST_PATH}:/opt/boost nvcr.io/nvidia/tritonserver:latest 10 | -------------------------------------------------------------------------------- /include/llis/client/client.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace llis { 14 | namespace client { 15 | 16 | class Client { 17 | public: 18 | Client(std::string server_name); 19 | ~Client(); 20 | 21 | JobRef register_job(std::string path); 22 | 23 | ClientId get_client_id() const { 24 | return client_id_; 25 | } 26 | 27 | ipc::ShmChannelCpuWriter* get_c2s_channel() { 28 | return &c2s_channel_; 29 | } 30 | 31 | ipc::ShmChannelCpuReader* get_s2c_channel() { 32 | return &s2c_channel_; 33 | } 34 | 35 | void cuda_profiler_start(); 36 | void cuda_profiler_stop(); 37 | 38 | JobInstanceRef* add_job_instance_ref(JobInstanceRef job_instance_ref); 39 | void release_job_instance_ref(JobInstanceRef* job_instance_ref); 40 | 41 | JobInstanceRef* wait(); 42 | 43 | ProfilerClient* get_profiler_client() { 44 | return &profiler_client_; 45 | } 46 | 47 | void kill_server(); 48 | 49 | private: 50 | void generate_client_id(); 51 | void create_s2c_channel(); 52 | void reconnect_s2c_channel(); 53 | void register_client(); 54 | void connect_s2c_socket(); 55 | 56 | std::string server_name_; 57 | 58 | ClientId client_id_; 59 | 60 | std::string s2c_socket_prefix_; 61 | 62 | ipc::ShmChannelCpuWriter c2s_channel_; 63 | ipc::ShmChannelCpuReader s2c_channel_; 64 | ipc::UnixDatagramSocket s2c_socket_; 65 | 66 | std::vector> job_instance_refs_; 67 | std::vector unused_job_instance_refs_; 68 | 69 | std::mutex mtx_; 70 | 71 | ProfilerClient profiler_client_; 72 | }; 73 | 74 | } 75 | } 76 | 77 | -------------------------------------------------------------------------------- /include/llis/client/io_shm_entry.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace llis { 6 | namespace client { 7 | 8 | /* 9 | * ptr: the pointer to the memory in the local address space 10 | * id: identifier that identify the instance of mmap that the piece of memory belongs to 11 | * offset: one mmap can involve multiple MemoryEntry, and the offset denotes the part of the mmap. 
Offset is in bytes 12 | * (id, offset) <=> ptr 13 | */ 14 | struct IoShmEntry { 15 | void* ptr; 16 | int id; 17 | size_t offset; 18 | }; 19 | 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /include/llis/client/job_instance_ref.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace llis { 8 | namespace client { 9 | 10 | class JobRef; 11 | 12 | class JobInstanceRef { 13 | public: 14 | JobInstanceRef(JobRef* job_ref, IoShmEntry io_shm_entry); 15 | ~JobInstanceRef(); 16 | 17 | void launch(); 18 | void release(); 19 | 20 | void* get_input_ptr(); 21 | void* get_output_ptr(); 22 | 23 | void set_id(JobInstanceRefId id); 24 | JobInstanceRefId get_id() const; 25 | 26 | JobRefId get_job_ref_id() const; 27 | 28 | void set_start_time(double time_point); 29 | double get_start_time() const; 30 | 31 | private: 32 | JobRef* job_ref_; 33 | IoShmEntry io_shm_entry_; 34 | 35 | JobInstanceRefId id_; 36 | 37 | ipc::ShmChannelCpuWriter* c2s_channel_; 38 | 39 | double start_time_; 40 | }; 41 | 42 | } 43 | } 44 | 45 | -------------------------------------------------------------------------------- /include/llis/client/job_ref.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace llis { 12 | namespace client { 13 | 14 | class Client; 15 | 16 | class JobRef { 17 | public: 18 | JobRef(std::unique_ptr job, Client* client, std::string path); 19 | ~JobRef(); 20 | 21 | JobRef(const JobRef&) = delete; 22 | JobRef(JobRef&&) = default; 23 | JobRef& operator=(const JobRef&) = delete; 24 | JobRef& operator=(JobRef&&) = default; 25 | 26 | JobInstanceRef* create_instance(); 27 | void release_io_shm_entry(IoShmEntry io_shm_entry); 28 | 29 | job::Job* get_job() { 30 | return job_.get(); 31 | } 32 | 33 | Client* get_client() { 34 | return client_; 35 | } 36 | 37 | ClientId get_client_id() const { 38 | return client_id_; 39 | } 40 | 41 | JobRefId get_job_ref_id() const { 42 | return job_ref_id_; 43 | } 44 | 45 | ipc::ShmChannelCpuReader* get_s2c_channel() { 46 | return s2c_channel_; 47 | } 48 | 49 | ipc::ShmChannelCpuWriter* get_c2s_channel() { 50 | return c2s_channel_; 51 | } 52 | 53 | private: 54 | void register_job(); 55 | 56 | void grow_pool(size_t least_num_new_entries); 57 | void grow_pool(); 58 | 59 | std::unique_ptr job_; 60 | Client* client_; 61 | std::string model_path_; 62 | 63 | ipc::ShmChannelCpuReader* s2c_channel_; 64 | ipc::ShmChannelCpuWriter* c2s_channel_; 65 | ClientId client_id_; 66 | 67 | size_t pinned_mem_size_; 68 | size_t param_size_; 69 | 70 | size_t pool_size_ = 0; // number of concurrent instances that can be supported 71 | size_t pool_size_in_bytes_ = 0; // number of bytes of the pool 72 | 73 | std::vector pinned_mem_list_; 74 | std::vector pinned_mem_free_list_; 75 | 76 | std::vector param_mem_list_; 77 | std::vector param_mem_free_list_; 78 | 79 | std::string shm_name_; 80 | int shm_fd_; 81 | 82 | JobRefId job_ref_id_; 83 | }; 84 | 85 | } 86 | } 87 | 88 | -------------------------------------------------------------------------------- /include/llis/client/profiler_client.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace llis { 8 | namespace client { 9 | 10 | class ProfilerClient { 11 | 
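// Reading of the code (not authoritative documentation): each set_*/unset_*
// method below sends the matching ProfilerMsgType command from llis/ipc/defs.h
// to the server through the client-to-server channel, and save() asks the
// server-side profiler to write its collected records to the given path.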
public: 12 | ProfilerClient(ipc::ShmChannelCpuWriter* c2s_channel) : c2s_channel_(c2s_channel) {} 13 | 14 | void set_record_kernel_info(); 15 | void unset_record_kernel_info(); 16 | 17 | void set_record_block_exec_time(); 18 | void unset_record_block_exec_time(); 19 | 20 | void set_record_kernel_block_mis_alloc(); 21 | void unset_record_kernel_block_mis_alloc(); 22 | 23 | void set_record_run_next_times(); 24 | void unset_record_run_next_times(); 25 | 26 | void set_record_job_events(); 27 | void unset_record_job_events(); 28 | 29 | void set_record_resource_events(); 30 | void unset_record_resource_events(); 31 | 32 | void save(const std::string& path); 33 | 34 | private: 35 | ipc::ShmChannelCpuWriter* c2s_channel_; 36 | }; 37 | 38 | } 39 | } 40 | 41 | -------------------------------------------------------------------------------- /include/llis/ipc/atomic_lock.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | template 8 | class AtomicLock {}; 9 | 10 | template <> 11 | class AtomicLock { 12 | public: 13 | inline void acquire() { 14 | while (val_.test_and_set(std::memory_order_acquire)); 15 | } 16 | 17 | inline void release() { 18 | val_.clear(std::memory_order_release); 19 | } 20 | 21 | inline void init() { 22 | release(); 23 | } 24 | 25 | private: 26 | std::atomic_flag val_; 27 | }; 28 | 29 | template <> 30 | class AtomicLock { 31 | public: 32 | CUDA_HOSTDEV inline void acquire() { 33 | #ifdef __CUDA_ARCH__ 34 | while (atomicOr(&val_gpu_, 1)); 35 | #else 36 | while (val_cpu_.test_and_set(std::memory_order_acquire)); 37 | #endif 38 | } 39 | 40 | CUDA_HOSTDEV inline void release() { 41 | #ifdef __CUDA_ARCH__ 42 | val_gpu_ = 0; 43 | #else 44 | val_cpu_.clear(std::memory_order_release); 45 | #endif 46 | } 47 | 48 | inline void init() { 49 | *reinterpret_cast(&val_gpu_) = 0; 50 | } 51 | 52 | private: 53 | union { 54 | unsigned int val_gpu_; 55 | std::atomic_flag val_cpu_; 56 | }; 57 | }; 58 | 59 | 60 | -------------------------------------------------------------------------------- /include/llis/ipc/atomic_wrapper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | template 8 | class AtomicWrapper {}; 9 | 10 | template 11 | class AtomicWrapper { 12 | public: 13 | inline T load() const { 14 | return val_.load(std::memory_order_relaxed); 15 | } 16 | 17 | inline void store(T desired) { 18 | val_.store(desired, std::memory_order_relaxed); 19 | } 20 | 21 | inline void add(T val) { 22 | val_.fetch_add(val, std::memory_order_relaxed); 23 | } 24 | 25 | private: 26 | std::atomic val_; 27 | }; 28 | 29 | template 30 | class AtomicWrapper { 31 | public: 32 | CUDA_HOSTDEV inline T load() const { 33 | #ifdef __CUDA_ARCH__ 34 | return val_; 35 | #else 36 | static_assert(sizeof(std::atomic) == sizeof(T)); 37 | 38 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 39 | return tmp->load(std::memory_order_relaxed); 40 | #endif 41 | } 42 | 43 | CUDA_HOSTDEV inline void store(T desired) { 44 | #ifdef __CUDA_ARCH__ 45 | val_ = desired; 46 | #else 47 | static_assert(sizeof(std::atomic) == sizeof(T)); 48 | 49 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 50 | tmp->store(desired, std::memory_order_relaxed); 51 | #endif 52 | } 53 | 54 | CUDA_HOSTDEV inline void add(T val) { 55 | #ifdef __CUDA_ARCH__ 56 | // TODO: _system is necessary if both CPU and GPU are writing, but not sure if it is necessary if only GPU is 
writing and CPU is reading 57 | atomicAdd(const_cast(&val_), val); 58 | #else 59 | static_assert(sizeof(std::atomic) == sizeof(T)); 60 | 61 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 62 | tmp->fetch_add(val, std::memory_order_relaxed); 63 | #endif 64 | } 65 | 66 | CUDA_HOSTDEV inline T inc(T compare) { 67 | #ifdef __CUDA_ARCH__ 68 | return atomicInc(const_cast(&val_), compare); 69 | #else 70 | // FIXME: make it actually atomic 71 | T old = val_; 72 | val_ = (old >= compare) ? 0 : (old + 1); 73 | return old; 74 | #endif 75 | } 76 | 77 | CUDA_HOSTDEV inline T cas(T compare, T val) { 78 | #ifdef __CUDA_ARCH__ 79 | return atomicCAS(const_cast(&val_), compare, val); 80 | #else 81 | // TODO 82 | #endif 83 | } 84 | 85 | private: 86 | volatile T val_; 87 | }; 88 | 89 | -------------------------------------------------------------------------------- /include/llis/ipc/defs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace llis { 6 | 7 | using ClientId = uint32_t; 8 | using JobRefId = uint32_t; 9 | using JobInstanceRefId = uint32_t; 10 | using JobId = uint32_t; 11 | 12 | enum class MsgType : uint32_t { 13 | REGISTER_CLIENT, 14 | REGISTER_JOB, 15 | LAUNCH_JOB, 16 | GROW_POOL, 17 | CUDA_PROFILER_START, 18 | CUDA_PROFILER_STOP, 19 | PROFILER_CMD, 20 | EXIT_CMD 21 | }; 22 | 23 | enum class ProfilerMsgType : uint32_t { 24 | SET_RECORD_KERNEL_INFO, 25 | UNSET_RECORD_KERNEL_INFO, 26 | SET_RECORD_BLOCK_EXEC_TIME, 27 | UNSET_RECORD_BLOCK_EXEC_TIME, 28 | SET_RECORD_KERNEL_BLOCK_MIS_ALLOC, 29 | UNSET_RECORD_KERNEL_BLOCK_MIS_ALLOC, 30 | SET_RECORD_RUN_NEXT_TIMES, 31 | UNSET_RECORD_RUN_NEXT_TIMES, 32 | SET_RECORD_JOB_EVENTS, 33 | UNSET_RECORD_JOB_EVENTS, 34 | SET_RECORD_RESOURCE_EVENTS, 35 | UNSET_RECORD_RESOURCE_EVENTS, 36 | SAVE 37 | }; 38 | 39 | } 40 | 41 | -------------------------------------------------------------------------------- /include/llis/ipc/name_format.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace llis { 8 | namespace ipc { 9 | 10 | std::string s2c_socket_name(const std::string& server_name, ClientId client_id); 11 | std::string s2c_channel_name(const std::string& server_name, ClientId client_id); 12 | std::string c2s_channel_name(const std::string& server_name); 13 | 14 | } 15 | } 16 | 17 | -------------------------------------------------------------------------------- /include/llis/ipc/shm_primitive_channel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace llis { 9 | namespace ipc { 10 | 11 | template 12 | class ShmPrimitiveChannelBase { 13 | public: 14 | ShmPrimitiveChannelBase() : shm_(nullptr) {} 15 | ShmPrimitiveChannelBase(std::string name, size_t count = 0); 16 | ShmPrimitiveChannelBase(ShmPrimitiveChannelBase* channel) { 17 | connect(channel); 18 | } 19 | ShmPrimitiveChannelBase(size_t count) : ShmPrimitiveChannelBase("", count) {} 20 | ~ShmPrimitiveChannelBase(); 21 | 22 | ShmPrimitiveChannelBase(const ShmPrimitiveChannelBase&) = delete; 23 | ShmPrimitiveChannelBase& operator=(const ShmPrimitiveChannelBase&) = delete; 24 | 25 | ShmPrimitiveChannelBase(ShmPrimitiveChannelBase&&); 26 | ShmPrimitiveChannelBase& operator=(ShmPrimitiveChannelBase&&); 27 | 28 | void connect(std::string name, size_t count = 0); 29 | void connect(ShmPrimitiveChannelBase* channel); 30 | 31 | 
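// fork() below hands out another handle onto the same shared ring buffer by
// reusing connect(this); job::Context relies on exactly this to keep its own
// copy of the Gpu2SchedChannel (see context.h).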
ShmPrimitiveChannelBase fork() { 32 | ShmPrimitiveChannelBase res; 33 | res.connect(this); 34 | return res; 35 | } 36 | 37 | void disconnect(); 38 | bool is_connected(); 39 | 40 | template 41 | CUDA_HOSTDEV U read(); 42 | template 43 | CUDA_HOSTDEV void write(U val); 44 | 45 | template 46 | CUDA_HOSTDEV bool can_read(); 47 | 48 | private: 49 | int fd_; 50 | char* shm_; 51 | T* ring_buf_; 52 | size_t count_; 53 | size_t total_size_; 54 | bool is_create_; 55 | std::string name_with_prefix_; 56 | 57 | unsigned read_pos_; 58 | AtomicWrapper* write_pos_; 59 | 60 | T cached_head_; 61 | }; 62 | 63 | template 64 | using ShmPrimitiveChannel = ShmPrimitiveChannelBase; 65 | template 66 | using ShmPrimitiveChannelGpu = ShmPrimitiveChannelBase; 67 | 68 | using Gpu2SchedChannel = ShmPrimitiveChannelGpu; 69 | 70 | } 71 | } 72 | 73 | #include "shm_primitive_channel_impl.h" 74 | 75 | -------------------------------------------------------------------------------- /include/llis/ipc/shm_primitive_channel_impl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | namespace llis { 16 | namespace ipc { 17 | 18 | template 19 | template 20 | CUDA_HOSTDEV U ShmPrimitiveChannelBase::read() { 21 | static_assert(sizeof(T) == sizeof(U), "The type being read must be of the same size as the type of the channel"); 22 | 23 | U* ptr = reinterpret_cast(ring_buf_ + read_pos_); 24 | U* cached_head_u_ptr = reinterpret_cast(&cached_head_); 25 | AtomicWrapper* ptr_atomic = reinterpret_cast*>(ptr); 26 | 27 | while (!cached_head_u_ptr->can_read()) { 28 | cached_head_ = ptr_atomic->load(); 29 | } 30 | 31 | cached_head_u_ptr->set_can_write(); 32 | ptr_atomic->store(cached_head_); 33 | 34 | if (read_pos_ == count_ - 1) { 35 | read_pos_ = 0; 36 | } else { 37 | ++read_pos_; 38 | } 39 | 40 | return *cached_head_u_ptr; 41 | } 42 | 43 | template 44 | template 45 | CUDA_HOSTDEV void ShmPrimitiveChannelBase::write(U val) { 46 | // TODO: it is probably possible to remove the critical session between acquire and store 47 | // Not sure which one has better performance 48 | 49 | static_assert(sizeof(T) == sizeof(U), "The type being written must be of the same size as the type of the channel"); 50 | 51 | size_t write_pos = write_pos_->inc(count_ - 1); 52 | U* ptr = reinterpret_cast(ring_buf_ + write_pos); 53 | 54 | reinterpret_cast*>(ptr)->store(*reinterpret_cast(&val)); 55 | } 56 | 57 | template 58 | template 59 | CUDA_HOSTDEV bool ShmPrimitiveChannelBase::can_read() { 60 | U* ptr = reinterpret_cast(ring_buf_ + read_pos_); 61 | cached_head_ = reinterpret_cast*>(ptr)->load(); 62 | 63 | return reinterpret_cast(&cached_head_)->can_read(); 64 | } 65 | 66 | 67 | } 68 | } 69 | 70 | -------------------------------------------------------------------------------- /include/llis/ipc/threadfence_wrapper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | template 8 | class ThreadfenceWrapper {}; 9 | 10 | template 11 | class ThreadfenceWrapper { 12 | public: 13 | inline T load() const { 14 | return val_.load(std::memory_order_acquire); 15 | } 16 | 17 | inline void store(T desired) { 18 | val_.store(desired, std::memory_order_release); 19 | } 20 | 21 | private: 22 | std::atomic val_; 23 | }; 24 | 25 | template 26 | class ThreadfenceWrapper { 27 | public: 28 | CUDA_HOSTDEV inline T 
load() const { 29 | #ifdef __CUDA_ARCH__ 30 | T val = val_; 31 | __threadfence_system(); 32 | return val; 33 | #else 34 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 35 | return tmp->load(std::memory_order_acquire); 36 | #endif 37 | } 38 | 39 | CUDA_HOSTDEV inline void store(T desired) { 40 | #ifdef __CUDA_ARCH__ 41 | __threadfence_system(); 42 | val_ = desired; 43 | #else 44 | std::atomic* tmp = reinterpret_cast*>(const_cast(&val_)); 45 | tmp->store(desired, std::memory_order_release); 46 | #endif 47 | } 48 | 49 | private: 50 | volatile T val_; 51 | }; 52 | 53 | -------------------------------------------------------------------------------- /include/llis/ipc/unix_datagram_socket.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace llis { 8 | namespace ipc { 9 | 10 | class UnixDatagramSocket { 11 | public: 12 | UnixDatagramSocket(); 13 | UnixDatagramSocket(const std::string& name); 14 | 15 | UnixDatagramSocket(UnixDatagramSocket&&); 16 | UnixDatagramSocket& operator=(UnixDatagramSocket&&); 17 | 18 | ~UnixDatagramSocket(); 19 | 20 | void bind(const std::string& name); 21 | UnixDatagramSocket connect(const std::string& name); 22 | 23 | ssize_t write(const void* buf, size_t count); 24 | ssize_t read(void* buf, size_t count); 25 | 26 | private: 27 | UnixDatagramSocket(int socket); 28 | 29 | int socket_; 30 | bool is_owner_; 31 | 32 | sockaddr_un remote_addr_; 33 | }; 34 | 35 | } 36 | } 37 | 38 | -------------------------------------------------------------------------------- /include/llis/job/context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace llis { 8 | namespace job { 9 | 10 | class Context { 11 | public: 12 | static Job* get_current_job() { 13 | return current_job_; 14 | } 15 | 16 | static void set_current_job(Job* job) { 17 | current_job_ = job; 18 | } 19 | 20 | static void set_gpu2sched_channel(ipc::Gpu2SchedChannel* gpu2sched_channel) { 21 | gpu2sched_channel_ = gpu2sched_channel->fork(); 22 | } 23 | 24 | static ipc::Gpu2SchedChannel* get_gpu2sched_channel() { 25 | return &gpu2sched_channel_; 26 | } 27 | 28 | #ifdef LLIS_MEASURE_BLOCK_TIME 29 | static void set_gpu2sched_block_time_channel(ipc::Gpu2SchedChannel* gpu2sched_block_time_channel) { 30 | gpu2sched_block_time_channel_ = gpu2sched_block_time_channel->fork(); 31 | } 32 | 33 | static ipc::Gpu2SchedChannel* get_gpu2sched_block_time_channel() { 34 | return &gpu2sched_block_time_channel_; 35 | } 36 | #endif 37 | 38 | static void set_mem2sched_channel(ipc::ShmChannelCpuReader* mem2sched_channel) { 39 | mem2sched_channel_ = mem2sched_channel->fork(); 40 | } 41 | 42 | static ipc::ShmChannelCpuWriter* get_mem2sched_channel() { 43 | return &mem2sched_channel_; 44 | } 45 | 46 | private: 47 | static Job* current_job_; 48 | static ipc::Gpu2SchedChannel gpu2sched_channel_; 49 | #ifdef LLIS_MEASURE_BLOCK_TIME 50 | static ipc::Gpu2SchedChannel gpu2sched_block_time_channel_; 51 | #endif 52 | static ipc::ShmChannelCpuWriter mem2sched_channel_; 53 | }; 54 | 55 | } 56 | } 57 | 58 | -------------------------------------------------------------------------------- /include/llis/job/coroutine_job.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace llis { 7 | namespace job { 8 | 9 | class CoroutineJob : public Job { 10 | public: 
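// Lifecycle, as implemented below: full_init() runs one_time_init() and then
// init(), which starts the coroutine immediately so that body() can set up the
// resource requirements of its first kernel; afterwards the job is driven
// through has_next()/run_next(), and body() calls yield() to hand control back
// after each stage.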
11 |     void full_init(void* io_ptr) final {
12 |         one_time_init();
13 |         init(io_ptr);
14 |     }
15 | 
16 |     void init(void* io_ptr) final {
17 |         // Note that this will run the coroutine immediately. This is necessary because we need the coroutine to set up the resource requirements of the first kernel
18 | 
19 |         coroutine_pull_ = std::make_unique<boost::coroutines2::coroutine<void>::pull_type>([this, io_ptr](boost::coroutines2::coroutine<void>::push_type& coroutine_push) {
20 |             coroutine_push_ = &coroutine_push;
21 |             body(io_ptr);
22 |         });
23 |     }
24 | 
25 |     void run_next() final {
26 |         (*coroutine_pull_)();
27 |     }
28 | 
29 |     virtual void one_time_init() = 0;
30 | 
31 |     virtual void body(void* io_ptr) = 0;
32 | 
33 |     bool has_next() const final {
34 |         return (bool)(*coroutine_pull_);
35 |     }
36 | 
37 |     void yield() {
38 |         (*coroutine_push_)();
39 |     }
40 | 
41 | private:
42 |     std::unique_ptr<boost::coroutines2::coroutine<void>::pull_type> coroutine_pull_;
43 |     boost::coroutines2::coroutine<void>::push_type* coroutine_push_;
44 | };
45 | 
46 | }
47 | }
48 | 
-------------------------------------------------------------------------------- /include/llis/job/instrument.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | #include 
5 | 
6 | namespace llis {
7 | namespace job {
8 | 
9 | __device__ inline void kernel_start(JobId job_id, ipc::Gpu2SchedChannel* gpu2sched_channel
10 | #ifdef LLIS_MEASURE_BLOCK_TIME
11 |         , BlockStartEndTime* start_end_time
12 | #endif
13 |         ) {
14 |     if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
15 | #ifdef LLIS_MEASURE_BLOCK_TIME
16 |         unsigned clock_val = clock64() >> 8;
17 |         clock_val &= 0xFFFFFF;
18 |         start_end_time->data[0] = clock_val >> 8;
19 |         start_end_time->data[1] = (clock_val & 0xFF) << 8;
20 | #endif
21 | 
22 |         unsigned smid;
23 |         asm("mov.u32 %0, %%smid;" : "=r"(smid)); // %%smid: inline PTX requires the special register's % to be escaped
24 | 
25 |         InstrumentInfo info;
26 |         info.is_start = 1;
27 |         info.smid = smid;
28 |         info.job_id = job_id;
29 | 
30 |         gpu2sched_channel->write(info);
31 |     }
32 | }
33 | 
34 | __device__ inline void kernel_end(JobId job_id, ipc::Gpu2SchedChannel* gpu2sched_channel
35 | #ifdef LLIS_MEASURE_BLOCK_TIME
36 |         , ipc::Gpu2SchedChannel* gpu2sched_block_time_channel
37 |         , BlockStartEndTime* start_end_time
38 | #endif
39 |         ) {
40 |     __syncthreads();
41 | 
42 |     if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
43 | #ifdef LLIS_MEASURE_BLOCK_TIME
44 |         unsigned clock_val = clock64() >> 8;
45 |         clock_val &= 0xFFFFFF;
46 |         start_end_time->data[1] |= clock_val >> 16;
47 |         start_end_time->data[2] = clock_val & 0xFFFF;
48 | #endif
49 | 
50 |         unsigned smid;
51 |         asm("mov.u32 %0, %%smid;" : "=r"(smid));
52 | 
53 |         InstrumentInfo info;
54 |         info.is_start = 0;
55 |         info.smid = smid;
56 |         info.job_id = job_id;
57 | 
58 |         gpu2sched_channel->write(info);
59 | #ifdef LLIS_MEASURE_BLOCK_TIME
60 |         gpu2sched_block_time_channel->write(*start_end_time);
61 | #endif
62 |     }
63 | }
64 | 
65 | }
66 | }
67 | 
-------------------------------------------------------------------------------- /include/llis/job/instrument_info.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | #include 
5 | 
6 | namespace llis {
7 | namespace job {
8 | 
9 | class InstrumentInfo {
10 | public:
11 |     uint8_t is_start;
12 |     uint8_t smid;
13 |     uint8_t num;
14 | private:
15 |     uint8_t status_;
16 | public:
17 |     JobId job_id;
18 | 
19 |     CUDA_HOSTDEV InstrumentInfo() {
20 |         set_can_read();
21 |     }
22 | 
23 |     CUDA_HOSTDEV bool can_read() const {
24 |         return status_ == 1;
25 |     }
26 | 
27 |     CUDA_HOSTDEV bool
can_write() const { 28 | return status_ == 0; 29 | } 30 | 31 | CUDA_HOSTDEV void set_can_read() { 32 | status_ = 1; 33 | } 34 | 35 | CUDA_HOSTDEV void set_can_write() { 36 | status_ = 0; 37 | } 38 | }; 39 | 40 | #ifdef LLIS_MEASURE_BLOCK_TIME 41 | 42 | class BlockStartEndTime { 43 | private: 44 | uint8_t status_; 45 | uint8_t dummy_; 46 | public: 47 | uint16_t data[3]; 48 | 49 | CUDA_HOSTDEV BlockStartEndTime() { 50 | set_can_read(); 51 | } 52 | 53 | CUDA_HOSTDEV bool can_read() const { 54 | return status_ == 1; 55 | } 56 | 57 | CUDA_HOSTDEV bool can_write() const { 58 | return status_ == 0; 59 | } 60 | 61 | CUDA_HOSTDEV void set_can_read() { 62 | status_ = 1; 63 | } 64 | 65 | CUDA_HOSTDEV void set_can_write() { 66 | status_ = 0; 67 | } 68 | }; 69 | 70 | #endif 71 | 72 | } 73 | } 74 | 75 | -------------------------------------------------------------------------------- /include/llis/job/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace llis { 6 | namespace job { 7 | 8 | void memset_res(size_t count, Job* job); 9 | void memset(void* ptr, int val, size_t count, Job* job, ipc::ShmChannelGpu* gpu2sched_channel); 10 | 11 | } 12 | } 13 | 14 | -------------------------------------------------------------------------------- /include/llis/server/client_connection.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace llis { 8 | namespace server { 9 | 10 | class ClientConnection { 11 | public: 12 | ClientConnection(ClientId client_id) : client_id_(client_id) {} 13 | 14 | void use_s2c_channel(ipc::ShmChannelCpuWriter&& s2c_channel) { 15 | s2c_channel_ = std::move(s2c_channel); 16 | } 17 | 18 | void use_s2c_socket(ipc::UnixDatagramSocket&& sock) { 19 | s2c_socket_ = std::move(sock); 20 | } 21 | 22 | ipc::ShmChannelCpuWriter* get_s2c_channel() { 23 | return &s2c_channel_; 24 | } 25 | 26 | ipc::UnixDatagramSocket* get_s2c_socket() { 27 | return &s2c_socket_; 28 | } 29 | 30 | ClientId get_client_id() const { 31 | return client_id_; 32 | } 33 | 34 | private: 35 | ClientId client_id_; 36 | 37 | ipc::ShmChannelCpuWriter s2c_channel_; 38 | ipc::UnixDatagramSocket s2c_socket_; 39 | }; 40 | 41 | } 42 | } 43 | 44 | -------------------------------------------------------------------------------- /include/llis/server/gpu_resources.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace llis { 8 | namespace server { 9 | 10 | class GpuResources { 11 | public: 12 | GpuResources(); 13 | 14 | void acquire(int smid, job::Job* job, int num); 15 | void release(int smid, job::Job* job, int num); 16 | 17 | bool job_fits(job::Job* job) const; 18 | 19 | unsigned get_num_sms() const { 20 | return sms_resources_.size(); 21 | } 22 | 23 | bool is_full() const { 24 | return num_full_sms_ >= sms_resources_.size(); 25 | } 26 | 27 | void choose_sms(job::Job* job); 28 | 29 | double dot(job::Job* job) const; 30 | double dot_normalized(job::Job* job) const; 31 | float normalize_resources(job::Job* job) const; 32 | 33 | #ifdef LLIS_ENABLE_PROFILER 34 | void set_profiler(Profiler* profiler) { 35 | for (auto& sm_resources : sms_resources_) { 36 | sm_resources.set_profiler(profiler); 37 | } 38 | } 39 | #endif 40 | 41 | private: 42 | std::vector sms_resources_; 43 | SmResources total_resources_; 44 | unsigned num_full_sms_ = 0; 45 | 46 | 
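// The members below track per-GPC (graphics processing cluster) block counts.
// The hard-coded gpc_sms_ table assumes a 5-GPC x 8-SM (40-SM) layout; per the
// TODO below, the real SM-to-GPC allocation is not detected at runtime.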
std::vector gpc_num_blocks_; 47 | std::vector gpc_next_sms_; 48 | // TODO: detect the actual allocation 49 | constexpr static unsigned gpc_sms_[5][8] = {{0, 10, 20, 30, 1, 11, 21, 31}, {2, 12, 22, 32, 3, 13, 23, 33}, {4, 14, 24, 34, 5, 15, 25, 35}, {6, 16, 26, 36, 7, 17, 27, 37}, {8, 18, 28, 38, 9, 19, 29, 39}}; 50 | }; 51 | 52 | } 53 | } 54 | 55 | -------------------------------------------------------------------------------- /include/llis/server/registered_job.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace llis { 11 | namespace server { 12 | 13 | class RegisteredJob { 14 | public: 15 | RegisteredJob(JobRefId registered_job_id, 16 | ipc::ShmChannelCpuReader* c2s_channel_, 17 | ClientConnection* client_connection); 18 | RegisteredJob(RegisteredJob&&) = default; 19 | 20 | void init(ipc::ShmChannelCpuReader* c2s_channel, 21 | ClientConnection* client_connection); 22 | 23 | std::unique_ptr create_instance(); 24 | void grow_pool(); 25 | std::unique_ptr init_job(); 26 | void release_instance(std::unique_ptr job); 27 | 28 | void update_stage_length(unsigned stage_id, double len); 29 | void set_stage_resource(unsigned stage_id, float res); 30 | bool has_stage_resource(unsigned stage_id); 31 | double get_stage_length(unsigned stage_id) const; 32 | float get_stage_resource(unsigned stage_id) const; 33 | double get_remaining_length(unsigned from_stage) const; 34 | double get_remaining_rl(unsigned from_stage) const; 35 | 36 | const std::vector& get_stage_lengths() const { 37 | return stage_lengths_; 38 | } 39 | 40 | const std::vector& get_stage_resources() const { 41 | return stage_resources_; 42 | } 43 | 44 | private: 45 | typedef job::Job* (*init_job_t)(); 46 | 47 | JobRefId registered_job_id_; 48 | ipc::ShmChannelCpuReader* c2s_channel_; 49 | ClientConnection* client_connection_; 50 | 51 | ipc::ShmChannelCpuWriter* s2c_channel_; 52 | init_job_t init_job_; 53 | job::Job* job_; 54 | std::string shm_name_; 55 | int shm_fd_; 56 | 57 | size_t pool_size_in_bytes_; 58 | std::vector mapped_mem_; 59 | 60 | std::vector> unused_job_instances_; 61 | 62 | std::vector stage_lengths_; 63 | #ifdef PRINT_STAGE_LENGTH_STDDEV 64 | std::vector stage_lengths_sum_; 65 | std::vector stage_lengths_sum_sqr_; 66 | std::vector stage_lengths_num_; 67 | #endif 68 | std::vector stage_resources_; 69 | }; 70 | 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /include/llis/server/scheduler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace llis { 13 | 14 | namespace po = boost::program_options; 15 | 16 | namespace server { 17 | 18 | class Scheduler { 19 | public: 20 | virtual void set_server(Server* server) { 21 | server_ = server; 22 | profiler_ = server_->get_profiler(); 23 | } 24 | 25 | virtual void try_handle_block_start_finish() = 0; 26 | virtual void handle_new_job(std::unique_ptr job) = 0; 27 | 28 | protected: 29 | Server* server_; 30 | Profiler* profiler_; 31 | }; 32 | 33 | class SchedulerFactory { 34 | public: 35 | using RegisterFunc = std::function(const po::variables_map&)>; 36 | 37 | static bool register_scheduler(std::string name, RegisterFunc func); 38 | static std::unique_ptr create(std::string name, const po::variables_map& args); 39 | 40 | private: 41 | 
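// Registry behind the self-registration pattern: each scheduler's .cpp file
// expands LLIS_SCHEDULER_REGISTER (defined below) at namespace scope, so its
// factory function is added to this map during static initialization, before
// main() runs; create() then instantiates a scheduler by name.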
static std::unordered_map registered_schedulers_; 42 | }; 43 | 44 | #define LLIS_SCHEDULER_REGISTER(name, args) \ 45 | static bool __scheduler_register_ = llis::server::SchedulerFactory::register_scheduler(name, args); 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /include/llis/server/scheduler_fifo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define GPU2SCHED_CHAN_SIZE 1024000 20 | #define GPU2SCHED_CHAN_SIZE_TIME 10240000 21 | 22 | namespace llis { 23 | namespace server { 24 | 25 | class SchedulerFifo : public Scheduler { 26 | public: 27 | SchedulerFifo(unsigned num_streams, unsigned sched_sleep); 28 | 29 | void handle_new_job(std::unique_ptr job) override; 30 | void try_handle_block_start_finish() override; 31 | 32 | private: 33 | void handle_block_start_finish(); 34 | #ifdef LLIS_MEASURE_BLOCK_TIME 35 | void handle_block_start_end_time(); 36 | #endif 37 | void handle_block_start(const job::InstrumentInfo& info); 38 | void handle_block_finish(const job::InstrumentInfo& info); 39 | void handle_mem_finish(); 40 | 41 | void schedule_job(); 42 | 43 | static void mem_notification_callback(void* job); 44 | 45 | ipc::ShmPrimitiveChannelGpu gpu2sched_channel_; 46 | #ifdef LLIS_MEASURE_BLOCK_TIME 47 | ipc::ShmPrimitiveChannelGpu gpu2sched_block_time_channel_; 48 | #endif 49 | ipc::ShmChannelCpuReader mem2sched_channel_; 50 | 51 | std::vector cuda_streams_; 52 | job::FinishedBlockNotifier* finished_block_notifiers_raw_; 53 | std::vector finished_block_notifiers_; 54 | 55 | std::queue job_queue_; 56 | 57 | std::vector> job_id_to_job_map_; 58 | std::vector unused_job_id_; 59 | 60 | std::vector remaining_num_blocks_; 61 | std::vector pre_notify_blocks_; 62 | std::vector pre_notify_sent_; 63 | 64 | unsigned num_jobs_ = 0; 65 | 66 | #ifdef PRINT_NUM_RUNNING_JOBS 67 | unsigned num_running_jobs_ = 0; 68 | #endif 69 | #ifdef PRINT_NUM_RUNNING_BLOCKS 70 | unsigned num_running_blocks_ = 0; 71 | #endif 72 | }; 73 | 74 | } 75 | } 76 | 77 | -------------------------------------------------------------------------------- /include/llis/server/scheduler_fifo2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define GPU2SCHED_CHAN_SIZE 1024000 20 | #define GPU2SCHED_CHAN_SIZE_TIME 10240000 21 | 22 | namespace llis { 23 | namespace server { 24 | 25 | class SchedulerFifo2 : public Scheduler { 26 | public: 27 | SchedulerFifo2(unsigned num_streams, unsigned sched_sleep); 28 | 29 | void handle_new_job(std::unique_ptr job) override; 30 | void try_handle_block_start_finish() override; 31 | 32 | private: 33 | class JobCompare { 34 | public: 35 | bool operator() (const job::Job* left, const job::Job* right) const { 36 | return left->get_unique_id() > right->get_unique_id(); 37 | } 38 | }; 39 | 40 | void handle_block_start_finish(); 41 | #ifdef LLIS_MEASURE_BLOCK_TIME 42 | void handle_block_start_end_time(); 43 | #endif 44 | void handle_block_start(const job::InstrumentInfo& info); 45 | void handle_block_finish(const job::InstrumentInfo& info); 46 | void 
handle_mem_finish(); 47 | 48 | void schedule_job(); 49 | 50 | static void mem_notification_callback(void* job); 51 | 52 | ipc::ShmPrimitiveChannelGpu gpu2sched_channel_; 53 | #ifdef LLIS_MEASURE_BLOCK_TIME 54 | ipc::ShmPrimitiveChannelGpu gpu2sched_block_time_channel_; 55 | #endif 56 | ipc::ShmChannelCpuReader mem2sched_channel_; 57 | 58 | std::vector cuda_streams_; 59 | job::FinishedBlockNotifier* finished_block_notifiers_raw_; 60 | std::vector finished_block_notifiers_; 61 | 62 | std::priority_queue, JobCompare> job_queue_; 63 | 64 | std::vector> job_id_to_job_map_; 65 | std::vector unused_job_id_; 66 | 67 | unsigned num_jobs_ = 0; 68 | 69 | #ifdef PRINT_NUM_RUNNING_KERNELS 70 | unsigned num_running_kernels_ = 0; 71 | unsigned num_running_mems_ = 0; 72 | #endif 73 | }; 74 | 75 | } 76 | } 77 | 78 | -------------------------------------------------------------------------------- /include/llis/server/server.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #define SER2SCHED_CHAN_SIZE 1024 15 | #define CLT2SCHED_CHAN_SIZE 1024 16 | 17 | namespace llis { 18 | namespace server { 19 | 20 | class Scheduler; 21 | 22 | constexpr size_t s2c_channel_size = 4096; 23 | 24 | class Server { 25 | public: 26 | Server(std::string server_name, Scheduler* scheduler); 27 | 28 | void serve(); 29 | 30 | void notify_job_starts(job::Job* job); 31 | void notify_job_ends(job::Job* job); 32 | 33 | void release_job_instance(std::unique_ptr job); 34 | 35 | void update_job_stage_length(job::Job* job, unsigned stage_id, double len); 36 | void set_job_stage_resource(job::Job* job, unsigned stage_id, float res); 37 | bool has_job_stage_resource(job::Job* job, unsigned stage_id); 38 | 39 | double get_job_remaining_length(job::Job* job, unsigned from_stage) const; 40 | double get_job_remaining_rl(job::Job* job, unsigned from_stage) const; 41 | 42 | const std::vector& get_job_stage_lengths(job::Job* job) const; 43 | const std::vector& get_job_stage_resources(job::Job* job) const; 44 | 45 | Profiler* get_profiler() { 46 | return &profiler_; 47 | } 48 | 49 | private: 50 | void try_handle_c2s(); 51 | void handle_c2s(); 52 | void handle_register_client(); 53 | void handle_register_job(); 54 | void handle_launch_job(); 55 | void handle_grow_pool(); 56 | void handle_release_job_instance(); 57 | 58 | std::string server_name_; 59 | Scheduler* scheduler_; 60 | ipc::UnixDatagramSocket s2c_socket_; 61 | 62 | ipc::ShmChannelCpuReader c2s_channel_; 63 | 64 | std::vector client_connections_; 65 | std::vector unused_client_connections_; 66 | 67 | std::vector registered_jobs_; 68 | std::vector unused_registered_jobs_; 69 | 70 | Profiler profiler_; 71 | }; 72 | 73 | } 74 | } 75 | 76 | -------------------------------------------------------------------------------- /include/llis/server/sm_resources.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace llis { 7 | namespace server { 8 | 9 | class SmResources { 10 | public: 11 | SmResources(int nregs, int smem, int nthrs, int nblocks); 12 | SmResources(); 13 | 14 | void acquire(job::Job* job, int num); 15 | void release(job::Job* job, int num); 16 | 17 | double dot(job::Job* job) const; 18 | double dot_normalized(job::Job* job) const; 19 | float normalize_resources(job::Job* job) const; 20 | 21 | unsigned 
num_blocks(job::Job* job) const; 22 | 23 | double occupancy() const; 24 | 25 | bool is_full() const { 26 | return nregs_ <= 0 || smem_ <= 0 || nthrs_ <= 0 || nblocks_ <= 0; 27 | } 28 | 29 | bool job_fits(job::Job* job) const { 30 | return nregs_ >= (int)job->get_num_registers_per_thread() * (int)job->get_num_threads_per_block() * (int)job->get_num_blocks() && smem_ >= (int)job->get_smem_size_per_block() * (int)job->get_num_blocks() && nthrs_ >= (int)job->get_num_threads_per_block() * (int)job->get_num_blocks() && nblocks_ >= (int)job->get_num_blocks(); 31 | } 32 | 33 | #ifdef LLIS_ENABLE_PROFILER 34 | void set_profiler(Profiler* profiler) { 35 | profiler_ = profiler; 36 | } 37 | #endif 38 | 39 | private: 40 | int nregs_ = 0; 41 | int smem_ = 0; 42 | int nthrs_ = 0; 43 | int nblocks_ = 0; 44 | 45 | int max_nregs_ = 0; 46 | int max_smem_ = 0; 47 | int max_nthrs_ = 0; 48 | int max_nblocks_ = 0; 49 | 50 | double max_resources_dot_prod_; 51 | 52 | Profiler* profiler_; 53 | }; 54 | 55 | } 56 | } 57 | 58 | -------------------------------------------------------------------------------- /include/llis/utils/align.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | namespace llis { 9 | namespace utils { 10 | 11 | CUDA_HOSTDEV inline size_t next_aligned_pos(size_t next_pos, size_t align) { 12 | return (next_pos + align - 1) & ~(align - 1); 13 | } 14 | 15 | template 16 | CUDA_HOSTDEV inline T* next_aligned_ptr(T* next_ptr, size_t align) { 17 | return reinterpret_cast(reinterpret_cast(next_ptr) + align - 1) & ~(align - 1); 18 | } 19 | 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /include/llis/utils/gpu.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // From https://stackoverflow.com/a/32015007/1213644 4 | #ifdef __CUDACC__ 5 | #define CUDA_HOSTDEV __host__ __device__ 6 | #else 7 | #define CUDA_HOSTDEV 8 | #endif 9 | 10 | -------------------------------------------------------------------------------- /include/llis/utils/ops.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ops.h - useful x86_64 instructions 3 | */ 4 | 5 | #pragma once 6 | 7 | static inline uint64_t rdtsc(void) { 8 | uint32_t a, d; 9 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 10 | return ((uint64_t)a) | (((uint64_t)d) << 32); 11 | } 12 | 13 | static inline uint64_t rdtscp(uint32_t *auxp) { 14 | uint32_t a, d, c; 15 | asm volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); 16 | if (auxp) 17 | *auxp = c; 18 | return ((uint64_t)a) | (((uint64_t)d) << 32); 19 | } 20 | -------------------------------------------------------------------------------- /include/llis/utils/path.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace llis { 7 | namespace utils { 8 | 9 | namespace internal { 10 | 11 | template 12 | std::filesystem::path path_concat_internal(T path_str) { 13 | return std::filesystem::path(path_str); 14 | } 15 | 16 | template 17 | std::filesystem::path path_concat_internal(T path_str, Args... paths_str) { 18 | std::filesystem::path res = path_concat_internal(paths_str...); 19 | res = std::filesystem::path(path_str) / res; 20 | return res; 21 | } 22 | 23 | } 24 | 25 | template 26 | std::string path_concat(Args... 
paths_str) { 27 | return internal::path_concat_internal(paths_str...).string(); 28 | } 29 | 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /include/llis/utils/time.hh: -------------------------------------------------------------------------------- 1 | #ifndef LLIS_TIME_H_IS_INCLUDED 2 | #define LLIS_TIME_H_IS_INCLUDED 3 | 4 | #define CPU_FREQ 2.5 // Adjust to your CPU's clock speed (GHz) 5 | 6 | #include <chrono> 7 | #include <cstdint> 8 | using hr_clock = std::chrono::steady_clock; 9 | typedef hr_clock::time_point tp; 10 | 11 | uint64_t since_epoch(const tp &time); 12 | uint64_t ns_diff(const tp &start, const tp &end); 13 | 14 | static const auto system_start_time = hr_clock::now(); 15 | 16 | static inline uint64_t cycles_to_ns(uint64_t time) { 17 | return time / CPU_FREQ; 18 | } 19 | 20 | #endif /* LLIS_TIME_H_IS_INCLUDED */ 21 | -------------------------------------------------------------------------------- /jobs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CMAKE_INSTALL_RPATH $ORIGIN $ORIGIN/..) 2 | 3 | add_subdirectory(helloworld) 4 | add_subdirectory(helloworld_coroutine) 5 | add_subdirectory(vec_add_coroutine) 6 | add_subdirectory(run_forever) 7 | add_subdirectory(dummy_long) 8 | add_subdirectory(dummy_short) 9 | add_subdirectory(dummy_10) 10 | add_subdirectory(dummy_11) 11 | add_subdirectory(dummy_20) 12 | add_subdirectory(dummy_21) 13 | # Has to be fixed before it can be run 14 | #add_subdirectory(cnn) 15 | 16 | if(tvm_FOUND) 17 | add_subdirectory(tvm_mnist) 18 | add_subdirectory(tvm_mobilenet) 19 | add_subdirectory(tvm_resnet18) 20 | add_subdirectory(tvm_inception_v3) 21 | add_subdirectory(tvm_googlenet) 22 | add_subdirectory(tvm_ultraface320) 23 | add_subdirectory(tvm_densenet121) 24 | add_subdirectory(tvm_arcfaceresnet100) 25 | add_subdirectory(tvm_resnet50) 26 | add_subdirectory(tvm_resnet34) 27 | add_subdirectory(tvm_squeezenet1_1) 28 | endif(tvm_FOUND) 29 | 30 | -------------------------------------------------------------------------------- /jobs/cnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_cnn SHARED main.cu layer.cu $) 2 | set_target_properties(job_cnn PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_cnn llis_job_gpu llis_context) 4 | -------------------------------------------------------------------------------- /jobs/dummy_10/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_10 SHARED dummy.cu $) 2 | set_target_properties(job_dummy_10 PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_10 PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_10 llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_10/dummy.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_kernel(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i
<= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < num_kernels; ++i) { 55 | if (i == num_kernels - 1) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_kernel<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | static constexpr unsigned num_kernels = 10; 70 | }; 71 | 72 | extern "C" { 73 | 74 | __attribute__((visibility("default"))) 75 | llis::job::Job* init_job() { 76 | return new DummyShortCoroutineJob(); 77 | } 78 | 79 | } 80 | 81 | -------------------------------------------------------------------------------- /jobs/dummy_11/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_11 SHARED dummy.cu $) 2 | set_target_properties(job_dummy_11 PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_11 PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_11 llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_11/dummy.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_kernel(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < num_kernels; ++i) { 55 | if (i == num_kernels - 1) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_kernel<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_,
get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | static constexpr unsigned num_kernels = 11; 70 | }; 71 | 72 | extern "C" { 73 | 74 | __attribute__((visibility("default"))) 75 | llis::job::Job* init_job() { 76 | return new DummyShortCoroutineJob(); 77 | } 78 | 79 | } 80 | 81 | -------------------------------------------------------------------------------- /jobs/dummy_20/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_20 SHARED dummy.cu $) 2 | set_target_properties(job_dummy_20 PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_20 PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_20 llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_20/dummy.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_kernel(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < num_kernels; ++i) { 55 | if (i == num_kernels - 1) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_kernel<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | static constexpr unsigned num_kernels = 20; 70 | }; 71 | 72 | extern "C" { 73 | 74 | __attribute__((visibility("default"))) 75 | llis::job::Job* init_job() { 76 | return new DummyShortCoroutineJob(); 77 | } 78 | 79 | } 80 | 81 | -------------------------------------------------------------------------------- /jobs/dummy_21/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_21 SHARED dummy.cu $) 2 | set_target_properties(job_dummy_21 PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_21 PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_21 llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_21/dummy.cu:
-------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_kernel(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < num_kernels; ++i) { 55 | if (i == num_kernels - 1) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_kernel<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | static constexpr unsigned num_kernels = 21; 70 | }; 71 | 72 | extern "C" { 73 | 74 | __attribute__((visibility("default"))) 75 | llis::job::Job* init_job() { 76 | return new DummyShortCoroutineJob(); 77 | } 78 | 79 | } 80 | 81 | -------------------------------------------------------------------------------- /jobs/dummy_long/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_long SHARED dummy_long.cu $) 2 | set_target_properties(job_dummy_long PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_long PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_long llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_long/dummy_long.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_long(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyLongCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 |
return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < 50; ++i) { 55 | if (i == 49) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_long<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | }; 70 | 71 | extern "C" { 72 | 73 | __attribute__((visibility("default"))) 74 | llis::job::Job* init_job() { 75 | return new DummyLongCoroutineJob(); 76 | } 77 | 78 | } 79 | 80 | -------------------------------------------------------------------------------- /jobs/dummy_short/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_dummy_short SHARED dummy_short.cu $) 2 | set_target_properties(job_dummy_short PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_options(job_dummy_short PRIVATE -fvisibility=hidden) 4 | target_link_libraries(job_dummy_short llis_job_gpu llis_context) 5 | -------------------------------------------------------------------------------- /jobs/dummy_short/dummy_short.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/finished_block_notifier.h> 4 | #include <llis/ipc/defs.h> 5 | 6 | #include <cuda_runtime.h> 7 | 8 | __global__ void dummy_short(float* mem, unsigned count, unsigned compute_count, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 9 | notifier->start(job_id); 10 | 11 | //clock_t start_time = clock64(); 12 | //while (clock64() - start_time < 10000000); 13 | 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | unsigned grid_size = blockDim.x * gridDim.x; 16 | 17 | while (id < count) { 18 | float tmp = 1; 19 | for (unsigned i = 1; i <= compute_count; ++i) { 20 | tmp *= i; 21 | } 22 | mem[id] = tmp; 23 | id += grid_size; 24 | } 25 | 26 | notifier->end(job_id); 27 | } 28 | 29 | class DummyShortCoroutineJob : public llis::job::CoroutineJob { 30 | public: 31 | size_t get_input_size() override { 32 | return 5; 33 | } 34 | 35 | size_t get_output_size() override { 36 | return 11; 37 | } 38 | 39 | size_t get_param_size() override { 40 | return 4; 41 | } 42 | 43 | void one_time_init() override { 44 | set_num_threads_per_block(256); 45 | set_smem_size_per_block(0); 46 | set_num_registers_per_thread(32); 47 | set_num_blocks(5); 48 | unset_is_mem(); 49 | 50 | cudaMalloc(&mem_, count_ * sizeof(*mem_)); 51 | } 52 | 53 | void body(void* io_ptr) override { 54 | for (int i = 0; i < 10; ++i) { 55 | if (i == 9) { 56 | set_pre_notify(); 57 | } 58 | yield(); 59 | llis::job::FinishedBlockNotifier* notifier = get_finished_block_notifier(); 60 | dummy_short<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(mem_, count_, compute_count_, get_id(), notifier); 61 | } 62 | } 63 | 64 | private: 65 | float* mem_; 66 | 67 | static constexpr unsigned count_ = 5000000; 68 | static constexpr unsigned compute_count_ = 100; 69 | }; 70 | 71 | extern "C" { 72 | 73 | __attribute__((visibility("default"))) 74 | llis::job::Job* init_job() { 75 | return new DummyShortCoroutineJob(); 76 | } 77 | 78 | } 79 | 80 |
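The dummy_10/dummy_11/dummy_20/dummy_21/dummy_long/dummy_short jobs above are identical except for how many kernels each instance launches (10, 11, 20, 21, 50, and 10 respectively). For readers who want to see what one such launch reduces to without the llis runtime, here is a minimal, self-contained CUDA sketch (illustrative only, not part of the repo; `busy_loop` and `main` are stand-ins) of the grid-stride busy loop with the same 5-block x 256-thread configuration that the jobs set via set_num_blocks()/set_num_threads_per_block():

```cuda
// Hypothetical standalone illustration, not part of the repo: the dummy
// jobs' kernel body and launch configuration, with the llis notifier and
// scheduler plumbing removed.
#include <cuda_runtime.h>

__global__ void busy_loop(float* mem, unsigned count, unsigned compute_count) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned grid_size = blockDim.x * gridDim.x;
    // Grid-stride loop: 5 blocks x 256 threads cover all `count` elements.
    while (id < count) {
        float tmp = 1;
        for (unsigned i = 1; i <= compute_count; ++i) {
            tmp *= i;  // pure busy work, same as dummy_kernel
        }
        mem[id] = tmp;
        id += grid_size;
    }
}

int main() {
    const unsigned count = 5000000;   // count_ in the jobs
    float* mem;
    cudaMalloc(&mem, count * sizeof(float));
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // 5 blocks, 256 threads/block, 0 B shared memory: the values the jobs
    // pass to set_num_blocks()/set_num_threads_per_block()/set_smem_size_per_block().
    busy_loop<<<5, 256, 0, stream>>>(mem, count, /*compute_count=*/100);
    cudaStreamSynchronize(stream);
    cudaFree(mem);
    cudaStreamDestroy(stream);
    return 0;
}
```

The grid-stride loop is what lets the jobs run with a fixed, scheduler-friendly 5-block grid while still touching all 5,000,000 elements; in the real jobs the launch configuration is read back through the get_num_blocks()-style getters so the scheduler can account for the resources each kernel occupies.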
-------------------------------------------------------------------------------- /jobs/helloworld/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_helloworld SHARED helloworld.cu $) 2 | set_target_properties(job_helloworld PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_helloworld llis_job_gpu llis_context) 4 | -------------------------------------------------------------------------------- /jobs/helloworld/helloworld.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/instrument.h> 4 | #include <llis/ipc/shm_primitive_channel.h> 5 | 6 | #include <cstdio> 7 | 8 | __global__ void helloworld(int i, llis::JobId job_id, llis::ipc::Gpu2SchedChannel gpu2sched_channel 9 | #ifdef LLIS_MEASURE_BLOCK_TIME 10 | , llis::ipc::Gpu2SchedChannel gpu2sched_block_time_channel 11 | #endif 12 | ) { 13 | #ifdef LLIS_MEASURE_BLOCK_TIME 14 | llis::job::BlockStartEndTime start_end_time; 15 | llis::job::kernel_start(job_id, &gpu2sched_channel, &start_end_time); 16 | #else 17 | llis::job::kernel_start(job_id, &gpu2sched_channel); 18 | #endif 19 | 20 | unsigned nsmid; 21 | asm("mov.u32 %0, %nsmid;" : "=r"(nsmid)); 22 | printf("Hello world %d %d\n", i, nsmid); 23 | 24 | #ifdef LLIS_MEASURE_BLOCK_TIME 25 | llis::job::kernel_end(job_id, &gpu2sched_channel, &gpu2sched_block_time_channel, &start_end_time); 26 | #else 27 | llis::job::kernel_end(job_id, &gpu2sched_channel); 28 | #endif 29 | } 30 | 31 | class HelloWorldJob : public llis::job::Job { 32 | public: 33 | size_t get_input_size() override { 34 | return 5; 35 | } 36 | 37 | size_t get_output_size() override { 38 | return 11; 39 | } 40 | 41 | size_t get_param_size() override { 42 | return 4; 43 | } 44 | 45 | void full_init(void* io_ptr) override { 46 | io_ptr_ = io_ptr; 47 | 48 | num_ = 0; 49 | 50 | set_num_blocks(1); 51 | set_num_threads_per_block(1); 52 | set_smem_size_per_block(0); 53 | set_num_registers_per_thread(32); 54 | } 55 | 56 | void init(void* io_ptr) override { 57 | io_ptr_ = io_ptr; 58 | 59 | num_ = 0; 60 | 61 | set_num_blocks(1); 62 | } 63 | 64 | void run_next() override { 65 | ++num_; 66 | 67 | helloworld<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(num_, get_id(), llis::job::Context::get_gpu2sched_channel()->fork() 68 | #ifdef LLIS_MEASURE_BLOCK_TIME 69 | , llis::job::Context::get_gpu2sched_block_time_channel()->fork() 70 | #endif 71 | ); 72 | 73 | set_num_blocks(num_ + 1); 74 | } 75 | 76 | bool has_next() const override { 77 | return num_ < 5; 78 | } 79 | 80 | private: 81 | void* io_ptr_; 82 | int num_ = 0; 83 | }; 84 | 85 | extern "C" { 86 | 87 | llis::job::Job* init_job() { 88 | return new HelloWorldJob(); 89 | } 90 | 91 | } 92 | 93 | -------------------------------------------------------------------------------- /jobs/helloworld_coroutine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_helloworld_coroutine SHARED helloworld_coroutine.cu $) 2 | set_target_properties(job_helloworld_coroutine PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_helloworld_coroutine llis_job_gpu llis_context) 4 | -------------------------------------------------------------------------------- /jobs/helloworld_coroutine/helloworld_coroutine.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/instrument.h> 4 | #include <llis/ipc/shm_primitive_channel.h> 5 | 6 | #include <cstdio> 7 | 8 | __global__ void helloworld(int i, llis::JobId job_id, llis::ipc::Gpu2SchedChannel gpu2sched_channel 9 | #ifdef
LLIS_MEASURE_BLOCK_TIME 10 | , llis::ipc::Gpu2SchedChannel gpu2sched_block_time_channel 11 | #endif 12 | ) { 13 | #ifdef LLIS_MEASURE_BLOCK_TIME 14 | llis::job::BlockStartEndTime start_end_time; 15 | llis::job::kernel_start(job_id, &gpu2sched_channel, &start_end_time); 16 | #else 17 | llis::job::kernel_start(job_id, &gpu2sched_channel); 18 | #endif 19 | 20 | unsigned nsmid; 21 | asm("mov.u32 %0, %nsmid;" : "=r"(nsmid)); 22 | printf("Hello world %d %d\n", i, nsmid); 23 | 24 | #ifdef LLIS_MEASURE_BLOCK_TIME 25 | llis::job::kernel_end(job_id, &gpu2sched_channel, &gpu2sched_block_time_channel, &start_end_time); 26 | #else 27 | llis::job::kernel_end(job_id, &gpu2sched_channel); 28 | #endif 29 | } 30 | 31 | class HelloWorldCoroutineJob : public llis::job::CoroutineJob { 32 | public: 33 | size_t get_input_size() override { 34 | return 5; 35 | } 36 | 37 | size_t get_output_size() override { 38 | return 11; 39 | } 40 | 41 | size_t get_param_size() override { 42 | return 4; 43 | } 44 | 45 | void one_time_init() override { 46 | set_num_threads_per_block(1); 47 | set_smem_size_per_block(0); 48 | set_num_registers_per_thread(32); 49 | } 50 | 51 | void body(void* io_ptr) override { 52 | io_ptr_ = io_ptr; 53 | 54 | for (int i = 1; i <= 5; ++i) { 55 | set_num_blocks(i); 56 | 57 | yield(); 58 | helloworld<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(i, get_id(), llis::job::Context::get_gpu2sched_channel()->fork() 59 | #ifdef LLIS_MEASURE_BLOCK_TIME 60 | , llis::job::Context::get_gpu2sched_block_time_channel()->fork() 61 | #endif 62 | ); 63 | } 64 | } 65 | 66 | private: 67 | void* io_ptr_; 68 | }; 69 | 70 | extern "C" { 71 | 72 | llis::job::Job* init_job() { 73 | return new HelloWorldCoroutineJob(); 74 | } 75 | 76 | } 77 | 78 | -------------------------------------------------------------------------------- /jobs/run_forever/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_run_forever SHARED run_forever.cu $) 2 | set_target_properties(job_run_forever PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_run_forever llis_job_gpu llis_context) 4 | 5 | -------------------------------------------------------------------------------- /jobs/run_forever/run_forever.cu: -------------------------------------------------------------------------------- 1 | #include <llis/job/job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/job/instrument.h> 4 | #include <llis/ipc/shm_primitive_channel.h> 5 | 6 | #include <cstdio> 7 | 8 | __global__ void run(int n, llis::JobId job_id, llis::ipc::Gpu2SchedChannel gpu2sched_channel 9 | #ifdef LLIS_MEASURE_BLOCK_TIME 10 | , llis::ipc::Gpu2SchedChannel gpu2sched_block_time_channel 11 | #endif 12 | ) { 13 | #ifdef LLIS_MEASURE_BLOCK_TIME 14 | llis::job::BlockStartEndTime start_end_time; 15 | llis::job::kernel_start(job_id, &gpu2sched_channel, &start_end_time); 16 | #else 17 | llis::job::kernel_start(job_id, &gpu2sched_channel); 18 | #endif 19 | 20 | printf("run_forever %u\n", job_id); 21 | 22 | if (n < 5) { 23 | volatile int i; 24 | for (i = 0; i < n * 1000; ++i); 25 | } else { 26 | while (true); 27 | } 28 | 29 | #ifdef LLIS_MEASURE_BLOCK_TIME 30 | llis::job::kernel_end(job_id, &gpu2sched_channel, &gpu2sched_block_time_channel, &start_end_time); 31 | #else 32 | llis::job::kernel_end(job_id, &gpu2sched_channel); 33 | #endif 34 | } 35 | 36 | class RunForeverJob : public llis::job::Job { 37 | public: 38 | size_t get_input_size() override { 39 | return 5; 40 | } 41 | 42 | size_t get_output_size() override { 43 | return 11; 44 | } 45 | 46 | size_t get_param_size() override { 47 | return 4; 48 | } 49 | 50 | void full_init(void* io_ptr) override
{ 51 | io_ptr_ = io_ptr; 52 | 53 | num_ = 0; 54 | 55 | set_num_blocks(1); 56 | set_num_threads_per_block(1); 57 | set_smem_size_per_block(0); 58 | set_num_registers_per_thread(32); 59 | } 60 | 61 | void init(void* io_ptr) override { 62 | io_ptr_ = io_ptr; 63 | 64 | num_ = 0; 65 | 66 | set_num_blocks(1); 67 | } 68 | 69 | void run_next() override { 70 | ++num_; 71 | 72 | run<<<get_num_blocks(), get_num_threads_per_block(), get_smem_size_per_block(), get_cuda_stream()>>>(num_, get_id(), llis::job::Context::get_gpu2sched_channel()->fork() 73 | #ifdef LLIS_MEASURE_BLOCK_TIME 74 | , llis::job::Context::get_gpu2sched_block_time_channel()->fork() 75 | #endif 76 | ); 77 | 78 | set_num_blocks(num_ + 1); 79 | } 80 | 81 | bool has_next() const override { 82 | return num_ < 5; 83 | } 84 | 85 | private: 86 | void* io_ptr_; 87 | int num_ = 0; 88 | }; 89 | 90 | extern "C" { 91 | 92 | llis::job::Job* init_job() { 93 | return new RunForeverJob(); 94 | } 95 | 96 | } 97 | 98 | 99 | -------------------------------------------------------------------------------- /jobs/tvm_arcfaceresnet100/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_arcfaceresnet100 SHARED tvm_arcfaceresnet100.cpp) 2 | target_link_libraries(job_tvm_arcfaceresnet100 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_arcfaceresnet100 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_arcfaceresnet100/tvm_arcfaceresnet100.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMArcfaceresnet100 : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 112*112*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 512 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "arcfaceresnet100-8-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMArcfaceresnet100(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_densenet121/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_densenet121 SHARED
tvm_densenet121.cpp) 2 | target_link_libraries(job_tvm_densenet121 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_densenet121 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_densenet121/tvm_densenet121.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMDensenet121Job : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 224*224*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 1000 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "densenet-9-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMDensenet121Job(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_googlenet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_googlenet SHARED tvm_googlenet.cpp) 2 | target_link_libraries(job_tvm_googlenet tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_googlenet DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_googlenet/tvm_googlenet.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMGooglenetJob : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 224*224*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 1000 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "googlenet-9-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30
| tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMGooglenetJob(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_inception_v3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_inception_v3 SHARED tvm_inception_v3.cpp) 2 | target_link_libraries(job_tvm_inception_v3 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_inception_v3 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_inception_v3/tvm_inception_v3.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/utils/path.h> 3 | 4 | #include <tvm/runtime/module.h> 5 | #include <tvm/runtime/packed_func.h> 6 | 7 | #include <cuda_runtime.h> 8 | 9 | class TVMInceptionV3Job : public llis::job::CoroutineJob { 10 | public: 11 | size_t get_input_size() override { 12 | return 224*224*3 * sizeof(float); 13 | } 14 | 15 | size_t get_output_size() override { 16 | return 1000 * sizeof(float); 17 | } 18 | 19 | size_t get_param_size() override { 20 | return 4; 21 | } 22 | 23 | void one_time_init() override { 24 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 25 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "inception_v3-cuda_llis-pack.so")); 26 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 27 | run_ = gmod_.GetFunction("run"); 28 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 29 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 30 | input_dev = get_input("input_1"); 31 | output_dev = get_output(0); 32 | } 33 | 34 | void body(void* io_ptr) override { 35 | set_is_mem(); 36 | yield(); 37 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 38 | unset_is_mem(); 39 | 40 | run_(); 41 | 42 | set_is_mem(); 43 | set_pre_notify(); 44 | yield(); 45 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 46 | } 47 | 48 | private: 49 | DLDevice ctx_gpu_; 50 | tvm::runtime::Module mod_factory_; 51 | tvm::runtime::Module gmod_; 52 | tvm::runtime::PackedFunc run_; 53 | tvm::runtime::NDArray input_dev; 54 | tvm::runtime::NDArray output_dev; 55 | }; 56 | 57 | extern "C" { 58 | 59 | llis::job::Job* init_job() { 60 | return new TVMInceptionV3Job(); 61 | } 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /jobs/tvm_mnist/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_mnist SHARED
tvm_mnist.cpp) 2 | target_link_libraries(job_tvm_mnist tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_mnist DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_mnist/tvm_mnist.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMMnistJob : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 28*28 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 10 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "mnist-8-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input("Input3"); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMMnistJob(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_mobilenet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_mobilenet SHARED tvm_mobilenet.cpp) 2 | target_link_libraries(job_tvm_mobilenet tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_mobilenet DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_mobilenet/tvm_mobilenet.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMMobilenetJob : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 224*224*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 1000 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "mobilenetv2-7-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output =
gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), input_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMMobilenetJob(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_resnet18/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_resnet18 SHARED tvm_resnet18.cpp) 2 | target_link_libraries(job_tvm_resnet18 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_resnet18 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_resnet18/tvm_resnet18.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | class TVMResnet18Job : public llis::job::CoroutineJob { 10 | public: 11 | size_t get_input_size() override { 12 | return 224*224*3 * sizeof(float); 13 | } 14 | 15 | size_t get_output_size() override { 16 | return 1000 * sizeof(float); 17 | } 18 | 19 | size_t get_param_size() override { 20 | return 4; 21 | } 22 | 23 | void one_time_init() override { 24 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 25 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "resnet18-v2-7-cuda_llis-pack.so")); 26 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 27 | run_ = gmod_.GetFunction("run"); 28 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 29 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 30 | input_dev = get_input(0); 31 | output_dev = get_output(0); 32 | } 33 | 34 | void body(void* io_ptr) override { 35 | set_is_mem(); 36 | yield(); 37 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 38 | unset_is_mem(); 39 | 40 | run_(); 41 | 42 | set_is_mem(); 43 | set_pre_notify(); 44 | yield(); 45 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), input_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 46 | } 47 | 48 | private: 49 | DLDevice ctx_gpu_; 50 | tvm::runtime::Module mod_factory_; 51 | tvm::runtime::Module gmod_; 52 | tvm::runtime::PackedFunc run_; 53 | tvm::runtime::NDArray input_dev; 54 | tvm::runtime::NDArray output_dev; 55 | }; 56 | 57 | extern "C" { 58 | 59 | llis::job::Job* init_job() { 60 | return new TVMResnet18Job(); 61 | } 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /jobs/tvm_resnet34/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_resnet34 SHARED tvm_resnet34.cpp) 2 | target_link_libraries(job_tvm_resnet34 tvm::tvm_runtime 
llis_job llis_context) 3 | install(TARGETS job_tvm_resnet34 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_resnet34/tvm_resnet34.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/utils/path.h> 3 | 4 | #include <tvm/runtime/module.h> 5 | #include <tvm/runtime/packed_func.h> 6 | 7 | #include <cuda_runtime.h> 8 | 9 | class TVMResnet34Job : public llis::job::CoroutineJob { 10 | public: 11 | size_t get_input_size() override { 12 | return 224*224*3 * sizeof(float); 13 | } 14 | 15 | size_t get_output_size() override { 16 | return 1000 * sizeof(float); 17 | } 18 | 19 | size_t get_param_size() override { 20 | return 4; 21 | } 22 | 23 | void one_time_init() override { 24 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 25 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "resnet34-v2-7-cuda_llis-pack.so")); 26 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 27 | run_ = gmod_.GetFunction("run"); 28 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 29 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 30 | input_dev = get_input(0); 31 | output_dev = get_output(0); 32 | } 33 | 34 | void body(void* io_ptr) override { 35 | set_is_mem(); 36 | yield(); 37 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 38 | unset_is_mem(); 39 | 40 | run_(); 41 | 42 | set_is_mem(); 43 | set_pre_notify(); 44 | yield(); 45 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 46 | } 47 | 48 | private: 49 | DLDevice ctx_gpu_; 50 | tvm::runtime::Module mod_factory_; 51 | tvm::runtime::Module gmod_; 52 | tvm::runtime::PackedFunc run_; 53 | tvm::runtime::NDArray input_dev; 54 | tvm::runtime::NDArray output_dev; 55 | }; 56 | 57 | extern "C" { 58 | 59 | llis::job::Job* init_job() { 60 | return new TVMResnet34Job(); 61 | } 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /jobs/tvm_resnet50/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_resnet50 SHARED tvm_resnet50.cpp) 2 | target_link_libraries(job_tvm_resnet50 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_resnet50 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_resnet50/tvm_resnet50.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMResnet50 : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 224*224*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 1000 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "resnet50-v2-7-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 |
output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMResnet50(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/tvm_squeezenet1_1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_squeezenet1_1 SHARED tvm_squeezenet1_1.cpp) 2 | target_link_libraries(job_tvm_squeezenet1_1 tvm::tvm_runtime llis_job llis_context) 3 | install(TARGETS job_tvm_squeezenet1_1 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_squeezenet1_1/tvm_squeezenet1_1.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/utils/path.h> 3 | 4 | #include <tvm/runtime/module.h> 5 | #include <tvm/runtime/packed_func.h> 6 | 7 | #include <cuda_runtime.h> 8 | 9 | class TVMSqueezeNet11Job : public llis::job::CoroutineJob { 10 | public: 11 | size_t get_input_size() override { 12 | return 224*224*3 * sizeof(float); 13 | } 14 | 15 | size_t get_output_size() override { 16 | return 1000 * sizeof(float); 17 | } 18 | 19 | size_t get_param_size() override { 20 | return 4; 21 | } 22 | 23 | void one_time_init() override { 24 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 25 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "squeezenet1.1-7-cuda_llis-pack.so")); 26 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 27 | run_ = gmod_.GetFunction("run"); 28 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 29 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 30 | input_dev = get_input(0); 31 | output_dev = get_output(0); 32 | } 33 | 34 | void body(void* io_ptr) override { 35 | set_is_mem(); 36 | yield(); 37 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 38 | unset_is_mem(); 39 | 40 | run_(); 41 | 42 | set_is_mem(); 43 | set_pre_notify(); 44 | yield(); 45 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 46 | } 47 | 48 | private: 49 | DLDevice ctx_gpu_; 50 | tvm::runtime::Module mod_factory_; 51 | tvm::runtime::Module gmod_; 52 | tvm::runtime::PackedFunc run_; 53 | tvm::runtime::NDArray input_dev; 54 | tvm::runtime::NDArray output_dev; 55 | }; 56 | 57 | extern "C" { 58 | 59 | llis::job::Job* init_job() { 60 | return new TVMSqueezeNet11Job(); 61 | } 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /jobs/tvm_ultraface320/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_tvm_ultraface320 SHARED tvm_ultraface320.cpp) 2 | target_link_libraries(job_tvm_ultraface320 tvm::tvm_runtime llis_job
llis_context) 3 | install(TARGETS job_tvm_ultraface320 DESTINATION lib/llis_jobs) 4 | 5 | -------------------------------------------------------------------------------- /jobs/tvm_ultraface320/tvm_ultraface320.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/job/coroutine_job.h> 2 | #include <llis/job/context.h> 3 | #include <llis/utils/path.h> 4 | 5 | #include <tvm/runtime/module.h> 6 | #include <tvm/runtime/packed_func.h> 7 | 8 | #include <cuda_runtime.h> 9 | 10 | class TVMUltraface320Job : public llis::job::CoroutineJob { 11 | public: 12 | size_t get_input_size() override { 13 | return 240*320*3 * sizeof(float); 14 | } 15 | 16 | size_t get_output_size() override { 17 | return 4420*2 * sizeof(float); 18 | } 19 | 20 | size_t get_param_size() override { 21 | return 4; 22 | } 23 | 24 | void one_time_init() override { 25 | ctx_gpu_ = DLDevice{kDLCUDA, 0}; 26 | mod_factory_ = tvm::runtime::Module::LoadFromFile(llis::utils::path_concat(std::getenv("LLIS_MODELS_DIR"), "ultraface320-cuda_llis-pack.so")); 27 | gmod_ = mod_factory_.GetFunction("default")(ctx_gpu_); 28 | run_ = gmod_.GetFunction("run"); 29 | tvm::runtime::PackedFunc get_input = gmod_.GetFunction("get_input"); 30 | tvm::runtime::PackedFunc get_output = gmod_.GetFunction("get_output"); 31 | input_dev = get_input(0); 32 | output_dev = get_output(0); 33 | } 34 | 35 | void body(void* io_ptr) override { 36 | set_is_mem(); 37 | yield(); 38 | cudaMemcpyAsync(input_dev->data, io_ptr, get_input_size(), cudaMemcpyHostToDevice, get_cuda_stream()); 39 | unset_is_mem(); 40 | 41 | run_(); 42 | 43 | set_is_mem(); 44 | set_pre_notify(); 45 | yield(); 46 | cudaMemcpyAsync((char*)io_ptr + get_input_size(), output_dev->data, get_output_size(), cudaMemcpyDeviceToHost, get_cuda_stream()); 47 | } 48 | 49 | private: 50 | DLDevice ctx_gpu_; 51 | tvm::runtime::Module mod_factory_; 52 | tvm::runtime::Module gmod_; 53 | tvm::runtime::PackedFunc run_; 54 | tvm::runtime::NDArray input_dev; 55 | tvm::runtime::NDArray output_dev; 56 | }; 57 | 58 | extern "C" { 59 | 60 | llis::job::Job* init_job() { 61 | return new TVMUltraface320Job(); 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /jobs/vec_add_coroutine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(job_vec_add_coroutine SHARED vec_add_coroutine.cu $) 2 | set_target_properties(job_vec_add_coroutine PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(job_vec_add_coroutine llis_job_gpu llis_context) 4 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig11_cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | install_path=/bigdisk/opt/llis 4 | models_dir=/bigdisk/models/cuda 5 | res_dir=/bigdisk/results-cuda 6 | 7 | while getopts 'p:m:o:' opt; do 8 | case "$opt" in 9 | p) 10 | install_path="$OPTARG" 11 | ;; 12 | 13 | m) 14 | models_dir="$OPTARG" 15 | ;; 16 | 17 | o) 18 | res_dir="$OPTARG" 19 | ;; 20 | 21 | ?|h) 22 | echo "Usage: $(basename $0) [-p install_path] [-m model_dir] [-o output_dir]" 23 | exit 1 24 | ;; 25 | esac 26 | done 27 | shift "$(($OPTIND -1))" 28 | 29 | export CUDA_DEVICE_MAX_CONNECTIONS=32 30 | 31 | mkdir -p "${res_dir}" 32 | 33 | for ln_sigma in {1.5,2}; do 34 | for sched in {CUDA-SS,CUDA-MS}; do 35 | case $sched in 36 | CUDA-SS) 37 | num_streams=1 38 | suffix=_cudass 39 | ;; 40 | CUDA-MS) 41 | num_streams=141 42 | suffix=_cudams 43 | ;; 44 | esac 45 | 46 | for i in
{1000,1053,1111,1176,1250,1333,1429,1538,1667,1818,2000,2222,2500,2857,3333,4000,5000,6667,10000,20000,40000,80000,160000}; do 47 | "${install_path}"/bin/tvm_direct_multistream \ 48 | --iat $i \ 49 | --ln_sigma $ln_sigma \ 50 | --start_record_num 0 \ 51 | --seed 1 \ 52 | --prefix "${res_dir}/all_prop${suffix}" \ 53 | --iat_n \ 54 | --iat_g \ 55 | --ln_sigma_n \ 56 | --num_jobs 3000 \ 57 | --concurrency $num_streams \ 58 | "${models_dir}/mobilenetv2-7-cuda-pack.so" 0.257 36 \ 59 | "${models_dir}/densenet-9-cuda-pack.so" 0.0706 10 \ 60 | "${models_dir}/googlenet-9-cuda-pack.so" 0.0546 8 \ 61 | "${models_dir}/inception_v3-cuda-pack.so" 0.0138 2 \ 62 | "${models_dir}/resnet18-v2-7-cuda-pack.so" 0.272 38 \ 63 | "${models_dir}/resnet34-v2-7-cuda-pack.so" 0.168 24 \ 64 | "${models_dir}/resnet50-v2-7-cuda-pack.so" 0.0745 10 \ 65 | "${models_dir}/squeezenet1.1-7-cuda-pack.so" 0.0894999999999999 13 66 | done 67 | done 68 | done 69 | 70 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig11_triton.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | client_path=/bigdisk/src/triton-client-llis 4 | res_dir=/bigdisk/results-triton/ 5 | 6 | while getopts 'p:o:' opt; do 7 | case "$opt" in 8 | p) 9 | client_path="$OPTARG" 10 | ;; 11 | 12 | o) 13 | res_dir="$OPTARG" 14 | ;; 15 | 16 | ?|h) 17 | echo "Usage: $(basename $0) [-p client_path] [-o output_dir]" 18 | exit 1 19 | ;; 20 | esac 21 | done 22 | shift "$(($OPTIND -1))" 23 | 24 | mkdir -p $res_dir/1.5 25 | mkdir -p $res_dir/2 26 | 27 | $client_path/run.py -b $client_path/build/cc-clients/examples/grpc_async_infer_client_mixed -o $res_dir/ 10 50 10 $client_path/schedules/newmix3_sops23.yaml 28 | 29 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig12_cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | install_path=/bigdisk/opt/llis 4 | models_dir=/bigdisk/models/cuda 5 | res_dir=/bigdisk/results-cuda 6 | 7 | while getopts 'p:m:o:' opt; do 8 | case "$opt" in 9 | p) 10 | install_path="$OPTARG" 11 | ;; 12 | 13 | m) 14 | models_dir="$OPTARG" 15 | ;; 16 | 17 | o) 18 | res_dir="$OPTARG" 19 | ;; 20 | 21 | ?|h) 22 | echo "Usage: $(basename $0) [-p install_path] [-m model_dir] [-o output_dir]" 23 | exit 1 24 | ;; 25 | esac 26 | done 27 | shift "$(($OPTIND -1))" 28 | 29 | export CUDA_DEVICE_MAX_CONNECTIONS=32 30 | 31 | mkdir -p "${res_dir}" 32 | 33 | for ln_sigma in {1.5,2}; do 34 | for sched in {CUDA-SS,CUDA-MS}; do 35 | case $sched in 36 | CUDA-SS) 37 | num_streams=1 38 | suffix=_cudass 39 | ;; 40 | CUDA-MS) 41 | num_streams=125 42 | suffix=_cudams 43 | ;; 44 | esac 45 | 46 | for i in {2000,2222,2500,2857,3333,4000,5000,6667,10000,20000,40000}; do 47 | "${install_path}"/bin/tvm_direct_multistream \ 48 | --iat $i \ 49 | --ln_sigma $ln_sigma \ 50 | --start_record_num 0 \ 51 | --seed 1 \ 52 | --prefix "${res_dir}/resnet18_inception_v3_prop${suffix}" \ 53 | --iat_n \ 54 | --iat_g \ 55 | --ln_sigma_n \ 56 | --num_jobs 3000 \ 57 | --concurrency $num_streams \ 58 | "${models_dir}/resnet18-v2-7-cuda-pack.so" 0.952 119 \ 59 | "${models_dir}/inception_v3-cuda-pack.so" 0.048 6 60 | done 61 | done 62 | done 63 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig12_paella.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 
install_path=/bigdisk/opt/llis 4 | export LLIS_MODELS_DIR=/bigdisk/models/cuda_llis 5 | res_dir=/bigdisk/results 6 | 7 | while getopts 'p:m:o:' opt; do 8 | case "$opt" in 9 | p) 10 | install_path="$OPTARG" 11 | ;; 12 | 13 | m) 14 | export LLIS_MODELS_DIR="$OPTARG" 15 | ;; 16 | 17 | o) 18 | res_dir="$OPTARG" 19 | ;; 20 | 21 | ?|h) 22 | echo "Usage: $(basename $0) [-p install_path] [-m model_dir] [-o output_dir]" 23 | exit 1 24 | ;; 25 | esac 26 | done 27 | shift "$(($OPTIND -1))" 28 | 29 | export CUDA_DEVICE_MAX_CONNECTIONS=32 30 | 31 | mkdir -p "${res_dir}" 32 | 33 | SERVER_PID=0 34 | 35 | trap 'kill $SERVER_PID; exit' INT 36 | 37 | for ln_sigma in {1.5,2}; do 38 | for sched in {SS,MS-jbj,MS-kbk,Full}; do 39 | case $sched in 40 | SS) 41 | sched=fifo 42 | num_streams=1 43 | suffix=_ss 44 | ;; 45 | MS-jbj) 46 | sched=fifo 47 | num_streams=500 48 | suffix=_msjbj 49 | ;; 50 | MS-kbk) 51 | sched=fifo2 52 | num_streams=500 53 | suffix=_mskbk 54 | ;; 55 | Full) 56 | sched=full3 57 | num_streams=500 58 | suffix=_full 59 | ;; 60 | esac 61 | 62 | for i in {2222,2500,2857,3333,4000,5000,6667,10000,20000,40000}; do 63 | taskset -c 4 "${install_path}"/bin/llis_server \ 64 | --name server \ 65 | --sched $sched \ 66 | --num_streams $num_streams & 67 | SERVER_PID=$! 68 | sleep 5 69 | 70 | "${install_path}"/bin/llis_app_client \ 71 | --server_name server \ 72 | --iat $i \ 73 | --ln_sigma $ln_sigma \ 74 | --start_record_num 0 \ 75 | --seed 1 \ 76 | --prefix "${res_dir}/resnet18_inception_v3_prop${suffix}" \ 77 | --fairness 1000000 \ 78 | --iat_n \ 79 | --iat_g \ 80 | --ln_sigma_n \ 81 | --num_jobs 3000 \ 82 | --concurrency 125 \ 83 | "${install_path}/lib/llis_jobs/libjob_tvm_resnet18.so" 0.952 119 \ 84 | "${install_path}/lib/llis_jobs/libjob_tvm_inception_v3.so" 0.048 6 85 | wait 86 | done 87 | done 88 | done 89 | -------------------------------------------------------------------------------- /sosp23_artifact/gen_data_fig13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | install_path=/bigdisk/opt/llis 4 | export LLIS_MODELS_DIR=/bigdisk/models/cuda_llis 5 | res_dir=/bigdisk/results 6 | 7 | while getopts 'p:m:o:' opt; do 8 | case "$opt" in 9 | p) 10 | install_path="$OPTARG" 11 | ;; 12 | 13 | m) 14 | export LLIS_MODELS_DIR="$OPTARG" 15 | ;; 16 | 17 | o) 18 | res_dir="$OPTARG" 19 | ;; 20 | 21 | ?|h) 22 | echo "Usage: $(basename $0) [-p install_path] [-m model_dir] [-o output_dir]" 23 | exit 1 24 | ;; 25 | esac 26 | done 27 | shift "$(($OPTIND -1))" 28 | 29 | export CUDA_DEVICE_MAX_CONNECTIONS=32 30 | 31 | mkdir -p "${res_dir}" 32 | 33 | SERVER_PID=0 34 | 35 | trap 'kill $SERVER_PID; exit' INT 36 | 37 | for f in {0.01,0.1,1,10,20,30,40,50,60,70,80,90,100,200,300,400,500,600,700,800,900,1000,10000,100000}; do 38 | taskset -c 4 "${install_path}"/bin/llis_server \ 39 | --name server \ 40 | --sched full3 \ 41 | --unfair $f \ 42 | --num_streams 500 & 43 | SERVER_PID=$!
44 | sleep 5 45 | 46 | "${install_path}"/bin/llis_app_client \ 47 | --server_name server \ 48 | --iat 0 \ 49 | --ln_sigma 2 \ 50 | --start_record_num 0 \ 51 | --seed 1 \ 52 | --prefix "${res_dir}/resnet18_inception_v3_prop_full_fairness" \ 53 | --fairness $f \ 54 | --fairness_n \ 55 | --fairness_g \ 56 | --ln_sigma_n \ 57 | --num_jobs 3000 \ 58 | --concurrency 125 \ 59 | "${install_path}/lib/llis_jobs/libjob_tvm_resnet18.so" 0.952 119 \ 60 | "${install_path}/lib/llis_jobs/libjob_tvm_inception_v3.so" 0.048 6 61 | wait 62 | done 63 | 64 | -------------------------------------------------------------------------------- /sosp23_artifact/plot_fig13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "$(dirname "$0")" 4 | script_path=`pwd` 5 | cd - 6 | 7 | paella_res_dir=/bigdisk/results 8 | output_dir=/bigdisk/graphs 9 | 10 | while getopts 'p:o:' opt; do 11 | case "$opt" in 12 | p) 13 | paella_res_dir="$OPTARG" 14 | ;; 15 | 16 | o) 17 | output_dir="$OPTARG" 18 | ;; 19 | 20 | ?|h) 21 | echo "Usage: $(basename $0) [-p paella_res_dir] [-o output_dir]" 22 | exit 1 23 | ;; 24 | esac 25 | done 26 | shift "$(($OPTIND -1))" 27 | 28 | mkdir -p ${output_dir} 29 | 30 | python3 $script_path/tools/plot_latency_fairness_threshold.py \ 31 | -o $output_dir/fig13.pdf \ 32 | -i $paella_res_dir/resnet18_inception_v3_prop_full_fairness_lns2.txt \ 33 | -a 'Paella' \ 34 | -l 1 -n ResNet-18 \ 35 | -l 2 -n InceptionV3 \ 36 | --yaxis Mean 37 | 38 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | *Note that you should skip this part if you are using the machine we provided, as it already has the environment set up.* 4 | 5 | The following scripts install everything under the `/bigdisk` directory by default. If you want to change the installation prefix, use the `-p` argument (e.g., `./install_dependencies.sh -p /my_dir`). 6 | 7 | First, please download the ONNX models from [here](https://drive.google.com/file/d/1AAI1lwGBT6CnLx_q24z8vhqKh1g8mT3g/view?usp=sharing) and extract them to `/models` under the prefix (`/bigdisk/models` by default). The setup scripts below depend on these ONNX models. 8 | 9 | Then, `cd` into the `setup/` directory if you have not done so yet. 10 | 11 | 1. Run `./install_dependencies.sh` to set up the environment. 12 | 13 | 1. Run `./install_llis_tvm.sh`, which installs Paella and the custom TVM modified for Paella, and also compiles the models with TVM. 14 | 15 | 1. Run `./build_triton_docker.sh` to build the Docker image with the Triton server. 16 | 17 | 1. Run `./install_triton_client.sh` to install the Triton client. 18 | 19 | *After running all scripts, either `source ~/.bash_profile` or log out and then log back in to ensure that the environment variables are set.* 20 | 21 | ## Reset 22 | 23 | We have already done the setup process on the machine we provided. However, if you want to start from scratch, run `./reset_all.sh`. 24 | 25 | If you only want to reinstall Paella and the custom TVM from scratch, while keeping other dependencies untouched, run `./reset_llis_tvm.sh` and then run `./install_llis_tvm.sh` again.
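For example, assuming `install_llis_tvm.sh` accepts the same `-p` prefix flag as the other setup scripts: `./reset_llis_tvm.sh -p /my_dir && ./install_llis_tvm.sh -p /my_dir`.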
26 | 27 | *Either `source ~/.bash_profile` or logout and then log back in to ensure that the environment variables are set.* 28 | 29 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/build_triton_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/bigdisk 4 | 5 | while getopts 'p:' opt; do 6 | case "$opt" in 7 | p) 8 | PREFIX="$OPTARG" 9 | ;; 10 | 11 | ?|h) 12 | echo "Usage: $(basename $0) [-p PREFIX]" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | source /etc/profile 19 | source ~/.bash_profile 20 | 21 | cd "$(dirname "$0")" 22 | abs_path="`pwd`" 23 | 24 | # Get TVM source 25 | 26 | cd "${PREFIX}/src" 27 | 28 | git clone --recursive https://github.com/eniac/tvm-llis.git tvm-tf 29 | cd tvm-tf 30 | git checkout v0.10.0-llis 31 | git submodule update --recursive 32 | 33 | # Compile TVM for TF and Convert TVM models to TF models 34 | 35 | sudo docker run --gpus=1 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ 36 | -v${abs_path}/..:/workspace/sosp23_artifact \ 37 | -v"${PREFIX}"/models:/workspace/models \ 38 | -v"${PREFIX}"/src/tvm-tf:/workspace/src/tvm-tf \ 39 | nvcr.io/nvidia/tensorflow:23.03-tf2-py3 \ 40 | /workspace/sosp23_artifact/setup/triton_docker/run_on_tf_docker.sh # Use convert_tvm_to_tf.sh instead if building TVM is not necessary 41 | 42 | sudo rsync -a ${abs_path}/../tf_models_config/ ${PREFIX}/models/tensorflow/ 43 | 44 | # Build docker image 45 | 46 | cd ${abs_path}/triton_docker 47 | 48 | mkdir -p models 49 | sudo mount --bind ${PREFIX}/models models 50 | 51 | mkdir -p sosp23_artifact 52 | sudo mount --bind ../../../sosp23_artifact sosp23_artifact 53 | 54 | mkdir -p tvm-tf 55 | sudo mount --bind ${PREFIX}/src/tvm-tf tvm-tf 56 | 57 | sudo docker build -t triton_server_tvm . 58 | 59 | sudo umount models 60 | sudo umount sosp23_artifact 61 | sudo umount tvm-tf 62 | 63 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/install_triton_client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/bigdisk 4 | 5 | while getopts 'p:' opt; do 6 | case "$opt" in 7 | p) 8 | PREFIX="$OPTARG" 9 | ;; 10 | 11 | ?|h) 12 | echo "Usage: $(basename $0) [-p PREFIX]" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | source /etc/profile 19 | source ~/.bash_profile 20 | 21 | sudo chown $USER "${PREFIX}" 22 | 23 | mkdir -p "${PREFIX}/src" 24 | 25 | cd "${PREFIX}/src" 26 | 27 | sudo apt-get install -y curl libcurl4-openssl-dev libb64-dev libssl-dev zlib1g-dev rapidjson-dev libopencv-dev libyaml-cpp-dev 28 | pip install PyYAML 29 | 30 | git clone https://github.com/maxdml/triton-client.git triton-client-llis 31 | cd triton-client-llis 32 | git checkout sosp23_artifact 33 | mkdir build 34 | cd build 35 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=`pwd`/install -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PERF_ANALYZER=OFF -DTRITON_ENABLE_GPU=ON -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON .. 
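# The TRITON_ENABLE_CC_GRPC/TRITON_ENABLE_EXAMPLES flags above are what produce the
# build/cc-clients/examples/grpc_async_infer_client_mixed binary that
# gen_data_fig11_triton.sh drives.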
36 | make -j40 37 | 38 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/onnx2tvm.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import numpy as np 3 | import tvm 4 | from tvm import te 5 | import tvm.relay as relay 6 | from tvm.contrib.download import download_testdata 7 | import sys 8 | 9 | model_path = sys.argv[1] # path to the onnx model 10 | output_path = sys.argv[2] # path to output the tvm-compiled model 11 | target = sys.argv[3] # cuda or cuda -llis_flag=[1|3|5] 12 | input_dims = tuple(int(x) for x in sys.argv[4:]) # e.g., `1 3 224 224` for mobilenet 13 | 14 | onnx_model = onnx.load(model_path) 15 | 16 | input_names_all = [node.name for node in onnx_model.graph.input] 17 | input_initializer = [node.name for node in onnx_model.graph.initializer] 18 | input_names = list(set(input_names_all) - set(input_initializer)) 19 | 20 | input_name = input_names[0] 21 | 22 | shape_dict = {input_name: input_dims} 23 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) 24 | 25 | opt_level = 3 26 | with tvm.transform.PassContext(opt_level=opt_level): 27 | lib = relay.build(mod, target, params=params) 28 | 29 | lib.export_library(output_path) 30 | 31 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/onnx2tvm_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ONNX_DIR=$1 4 | DEST_DIR=$2 5 | TYPE=$3 6 | 7 | if [[ $TYPE == "cuda_llis" ]]; then 8 | TYPE2='cuda -llis_flag=3' 9 | else 10 | TYPE2='cuda' 11 | fi 12 | 13 | mkdir -p $DEST_DIR 14 | 15 | python3 onnx2tvm.py ${ONNX_DIR}/mnist-8.onnx ${DEST_DIR}/mnist-8-${TYPE}-pack.so "${TYPE2}" 1 1 28 28 16 | python3 onnx2tvm.py ${ONNX_DIR}/densenet-9.onnx ${DEST_DIR}/densenet-9-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 17 | python3 onnx2tvm.py ${ONNX_DIR}/googlenet-9.onnx ${DEST_DIR}/googlenet-9-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 18 | python3 onnx2tvm.py ${ONNX_DIR}/mobilenetv2-7.onnx ${DEST_DIR}/mobilenetv2-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 19 | python3 onnx2tvm.py ${ONNX_DIR}/resnet18-v2-7.onnx ${DEST_DIR}/resnet18-v2-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 20 | python3 onnx2tvm.py ${ONNX_DIR}/resnet34-v2-7.onnx ${DEST_DIR}/resnet34-v2-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 21 | python3 onnx2tvm.py ${ONNX_DIR}/resnet50-v2-7.onnx ${DEST_DIR}/resnet50-v2-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 22 | python3 onnx2tvm.py ${ONNX_DIR}/squeezenet1.1-7.onnx ${DEST_DIR}/squeezenet1.1-7-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 23 | python3 onnx2tvm.py ${ONNX_DIR}/inception_v3.onnx ${DEST_DIR}/inception_v3-${TYPE}-pack.so "${TYPE2}" 1 3 224 224 24 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/reset_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/bigdisk 4 | 5 | while getopts 'p:' opt; do 6 | case "$opt" in 7 | p) 8 | PREFIX="$OPTARG" 9 | ;; 10 | 11 | ?|h) 12 | echo "Usage: $(basename $0) [-p PREFIX]" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | rm -rf ${PREFIX}/src 19 | rm -rf ${PREFIX}/opt 20 | rm -rf ${PREFIX}/results 21 | rm -rf ${PREFIX}/results-triton 22 | rm -rf ${PREFIX}/results-cuda 23 | rm -rf ${PREFIX}/results-mps 24 | rm -rf ${PREFIX}/graphs 25 | rm -rf ${PREFIX}/models/cuda 26 | rm -rf ${PREFIX}/models/cuda_llis 27 | rm -rf ${PREFIX}/models/tensorflow 28 | rm 
~/.bash_profile 29 | 30 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/reset_llis_tvm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/bigdisk 4 | 5 | while getopts 'p:' opt; do 6 | case "$opt" in 7 | p) 8 | PREFIX="$OPTARG" 9 | ;; 10 | 11 | ?|h) 12 | echo "Usage: $(basename $0) [-p PREFIX]" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | rm -rf ${PREFIX}/src/llis 19 | rm -rf ${PREFIX}/src/tvm-llis 20 | 21 | rm -rf ${PREFIX}/opt/llis 22 | rm -rf ${PREFIX}/opt/tvm-llis 23 | 24 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/tritonserver:23.03-py3 2 | 3 | COPY sosp23_artifact /workspace/sosp23_artifact 4 | COPY models/cuda /workspace/models/cuda 5 | COPY models/tensorflow /workspace/models/tensorflow 6 | COPY tvm-tf /workspace/src/tvm-tf 7 | 8 | RUN /workspace/sosp23_artifact/setup/triton_docker/setup.sh 9 | 10 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/build_tvm_tf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | apt update 4 | apt install -y cmake clang 5 | 6 | cd /workspace/src/tvm-tf 7 | 8 | mkdir build 9 | cd build 10 | cp ../cmake/config.cmake . 11 | sed -i 's/set(USE_CUDA OFF)/set(USE_CUDA ON)/' config.cmake 12 | sed -i 's/set(USE_LLVM OFF)/set(USE_LLVM ON)/' config.cmake 13 | sed -i 's/set(USE_TF_TVMDSOOP OFF)/set(USE_TF_TVMDSOOP ON)/' config.cmake 14 | cmake .. 15 | make -j40 16 | 17 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/convert_tvm_to_tf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export TVM_HOME=/workspace/src/tvm-tf 4 | export PYTHONPATH=${TVM_HOME}/python:${PYTHONPATH} 5 | export LD_LIBRARY_PATH=${TVM_HOME}/build:$LD_LIBRARY_PATH 6 | 7 | apt update 8 | apt install -y cmake clang 9 | 10 | cd "$(dirname "$0")" 11 | ../dso_to_tf.py /workspace/models/cuda /workspace/models/tensorflow 12 | 13 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/run_on_tf_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /workspace/sosp23_artifact/setup/triton_docker/build_tvm_tf.sh 4 | /workspace/sosp23_artifact/setup/triton_docker/convert_tvm_to_tf.sh 5 | 6 | -------------------------------------------------------------------------------- /sosp23_artifact/setup/triton_docker/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX=/workspace 4 | 5 | apt update 6 | apt install -y clang 7 | 8 | echo "export LD_LIBRARY_PATH=${PREFIX}/src/tvm-tf/build:\$LD_LIBRARY_PATH" | tee -a ~/.bashrc 9 | 10 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/densenet-9/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "densenet-9" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 
13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000, 26 | 1, 27 | 1 28 | ] 29 | label_filename: "" 30 | is_shape_tensor: false 31 | } 32 | ] 33 | instance_group: [ 34 | { 35 | name: "densenet-9" 36 | kind: KIND_GPU 37 | count: 10 38 | gpus: [ 39 | 0 40 | ] 41 | profile: [] 42 | } 43 | ] 44 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/googlenet-9/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "googlenet-9" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "googlenet-9" 34 | kind: KIND_GPU 35 | count: 8 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/inception_v3/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "inception_v3" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "inception_v3" 34 | kind: KIND_GPU 35 | count: 2, 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/mobilenetv2-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "mobilenetv2-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "mobilenetv2-7" 34 | kind: KIND_GPU 35 | count: 36, 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/resnet18-v2-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "resnet18-v2-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: 
"output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "resnet18-v2-7" 34 | kind: KIND_GPU 35 | count: 38 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/resnet34-v2-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "resnet34-v2-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "resnet34-v2-7" 34 | kind: KIND_GPU 35 | count: 24 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/resnet50-v2-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "resnet50-v2-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "resnet50-v2-7" 34 | kind: KIND_GPU 35 | count: 10 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tf_models_config/squeezenet1.1-7/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "squeezenet1.1-7" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input: [ 5 | { 6 | name: "input_1" 7 | data_type: TYPE_FP32 8 | format: FORMAT_NONE 9 | dims: [ 10 | 1, 11 | 3, 12 | 224, 13 | 224 14 | ] 15 | is_shape_tensor: false, 16 | allow_ragged_batch: false 17 | } 18 | ] 19 | output: [ 20 | { 21 | name: "output_0" 22 | data_type: TYPE_FP32 23 | dims: [ 24 | 1, 25 | 1000 26 | ] 27 | label_filename: "" 28 | is_shape_tensor: false 29 | } 30 | ] 31 | instance_group: [ 32 | { 33 | name: "squeezenet1.1-7" 34 | kind: KIND_GPU 35 | count: 13 36 | gpus: [ 37 | 0 38 | ] 39 | profile: [] 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /sosp23_artifact/tools/merge_mps_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | def print_latency_stats(f, latencies): 5 | mean = np.mean(latencies); 6 | 7 | sd = np.std(latencies, ddof=1) 8 | 9 | p50 = latencies[int(len(latencies) / 2)]; 10 | p90 = latencies[int(len(latencies) * 0.90)]; 11 | p95 = latencies[int(len(latencies) * 0.95)]; 12 | p99 = latencies[int(len(latencies) * 0.99)]; 13 | max_ele = np.max(latencies) 14 | 15 | f.write(',{},{},{},{},{},{},{}'.format(mean, p50, p90, p95, p99, max_ele, 
sd)); 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | 20 | parser.add_argument('-p', '--prefix', dest='prefix'); 21 | parser.add_argument('-i', '--iat', dest='iat', type=int); 22 | parser.add_argument('-n', '--num_jobs', dest='num_jobs', type=int); 23 | 24 | args = parser.parse_args() 25 | 26 | latencies_per_job = [] 27 | aggs = [] 28 | for job_id in range(args.num_jobs): 29 | latencies_per_job.append(np.sort(np.loadtxt('{}_job{}_iat{}_raw.txt'.format(args.prefix, job_id, args.iat)))) 30 | aggs.append(np.loadtxt('{}_job{}.txt'.format(args.prefix, job_id), delimiter=',', ndmin=2)) 31 | latencies_all = np.sort(np.concatenate(latencies_per_job)) 32 | 33 | time_elasped = 0 34 | num_job_instances = 0 35 | for agg in aggs: 36 | for row in range(agg.shape[0]): 37 | if (agg[row, 0] == args.iat): 38 | time_elasped = max(time_elasped, agg[row, 2]) 39 | num_job_instances = int(agg[row, 1]) 40 | break 41 | 42 | with open('{}.txt'.format(args.prefix), 'a') as f: 43 | f.write('{},{},{}'.format(args.iat, num_job_instances, time_elasped)) 44 | print_latency_stats(f, latencies_all) 45 | for latencies in latencies_per_job: 46 | print_latency_stats(f, latencies) 47 | f.write('\n') 48 | 49 | -------------------------------------------------------------------------------- /sosp23_artifact/tools/parse_input_kelvin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def parse_input_kelvin(path, xaxis_name, yaxis_names, model_ids): 4 | yaxis_name2offset = {'Mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 5 | 6 | data = np.genfromtxt(path, delimiter=',') 7 | 8 | if xaxis_name == 'throughput': 9 | x_data = data[:, 1] / data[:, 2] * 1000000 10 | else: # sending rate 11 | x_data = 1000000. / data[:, 0] 12 | 13 | y_data = [[data[:, model_id * 7 + yaxis_name2offset[yaxis_name] + 3] / 1000. 
for yaxis_name in yaxis_names] for model_id in model_ids] 14 | 15 | return x_data, y_data 16 | -------------------------------------------------------------------------------- /sosp23_artifact/tools/plot_latency_fairness_threshold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import sys 6 | import argparse 7 | 8 | fmts = ['X-', 'o-', '^-', 's-', 'D-', 'v-', 'p-', '*-', 'H-'] 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('-o', '--output_path', dest='output_path'); 14 | parser.add_argument('-i', '--input_path', dest='input_paths', action='append'); 15 | parser.add_argument('-a', '--algo_name', dest='algo_names', action='append'); 16 | parser.add_argument('-l', '--line', dest='lines', type=int, action='append'); 17 | parser.add_argument('-n', '--name', dest='names', action='append'); 18 | parser.add_argument('--yaxis', dest='yaxises', choices=['Mean', 'p50', 'p90', 'p95', 'p99'], action='append', type=str); 19 | parser.add_argument('-x', '--xlim', dest='xlim', type=float); 20 | parser.add_argument('-y', '--ylim', dest='ylim', type=float); 21 | 22 | args = parser.parse_args() 23 | 24 | yaxis_name2offset = {'Mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 25 | 26 | num_inputs = len(args.input_paths) 27 | 28 | data = [np.genfromtxt(path, delimiter=',') for path in args.input_paths] 29 | 30 | #plt.rcParams.update({'font.size': 6}) 31 | plt.figure(figsize=(6.4, 3.9552)) 32 | 33 | for i in range(num_inputs): 34 | for line, name, fmt in zip(args.lines, args.names, fmts): 35 | for yaxis in args.yaxises: 36 | plt.plot(data[i][:, 0], data[i][:, line * 7 + yaxis_name2offset[yaxis] + 3] / 1000., fmt, label=name, linewidth=1, markersize=2) 37 | 38 | plt.gca().invert_xaxis() 39 | plt.gca().set_xlim(500, 0) 40 | #plt.xlim(500, 0) 41 | plt.xlabel('Less Fair <- Fairness Threshold -> More Fair') 42 | if len(args.yaxises) == 1: 43 | plt.ylabel(args.yaxises[0] + ' Latency (ms)') 44 | else: 45 | plt.ylabel('Latency (ms)') 46 | 47 | plt.legend() 48 | #if args.xlim is not None: 49 | # plt.xlim(0, args.xlim) 50 | #if args.ylim is not None: 51 | # plt.ylim(0, args.ylim) 52 | 53 | plt.savefig(args.output_path, bbox_inches='tight') 54 | 55 | -------------------------------------------------------------------------------- /sosp23_artifact/triton_server_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo docker run -it --gpus=1 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p8000:8000 -p8001:8001 -p8002:8002 triton_server_tvm bash -c 'LD_LIBRARY_PATH=/workspace/src/tvm-tf/build:$LD_LIBRARY_PATH LD_PRELOAD="/workspace/src/tvm-tf/build/libtvm_dso_op.so /opt/tritonserver/backends/tensorflow2/libtensorflow_cc.so /opt/tritonserver/backends/tensorflow2/libtensorflow_framework.so" CUDA_DEVICE_MAX_CONNECTIONS=32 tritonserver --model-repository=/workspace/models/tensorflow --backend-config=tensorflow,version=2 --min-supported-compute-capability=6.0 --allow-grpc=true --backend-config=default-max-batch-size=0' 4 | 5 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/densenet-9-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 1 1 4 | -------------------------------------------------------------------------------- 
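As an aside on the data format consumed by the tools above: `merge_mps_results.py` writes each results row as `iat,num_jobs,time_elapsed` followed by groups of seven latency statistics (mean, p50, p90, p95, p99, max, stddev), with group 0 aggregating all models and the per-model groups following, and `parse_input_kelvin.py` / `plot_latency_fairness_threshold.py` index into rows with `group * 7 + offset + 3`. A minimal decoding sketch — the `decode_row` helper is mine, and the microsecond units are inferred from the `* 1000000` and `/ 1000.` conversions in those scripts:

```python
STATS = ('mean', 'p50', 'p90', 'p95', 'p99', 'max', 'sd')

def decode_row(row, group=0):
    """Pull the x-axis values and one stats group out of a results row."""
    iat, num_jobs, elapsed = row[0], row[1], row[2]
    throughput = num_jobs / elapsed * 1e6               # jobs/s (cf. parse_input_kelvin.py)
    sending_rate = 1e6 / iat if iat else float('inf')   # iat is in microseconds
    stats = {name: row[group * 7 + i + 3] / 1000.       # microseconds -> milliseconds
             for i, name in enumerate(STATS)}
    return throughput, sending_rate, stats
```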
/sosp23_artifact/tvm_models_dim/googlenet-9-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/inception_v3-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/mnist-8-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 1 28 28 2 | -1 3 | 1 10 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/mobilenetv2-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/resnet18-v2-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/resnet34-v2-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/resnet50-v2-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /sosp23_artifact/tvm_models_dim/squeezenet1.1-7-cuda-pack.so.dim: -------------------------------------------------------------------------------- 1 | 1 3 224 224 2 | -1 3 | 1 1000 4 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(client) 2 | add_subdirectory(server) 3 | add_subdirectory(ipc) 4 | add_subdirectory(job) 5 | -------------------------------------------------------------------------------- /src/client/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(client OBJECT client.cpp job_ref.cpp job_instance_ref.cpp profiler_client.cpp) 2 | target_link_libraries(client spdlog::spdlog) 3 | -------------------------------------------------------------------------------- /src/client/job_instance_ref.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | namespace llis { 12 | namespace client { 13 | 14 | JobInstanceRef::JobInstanceRef(JobRef* job_ref, IoShmEntry io_shm_entry) : job_ref_(job_ref), io_shm_entry_(io_shm_entry) { 15 | c2s_channel_ = job_ref_->get_c2s_channel(); 16 | } 17 | 18 | JobInstanceRef::~JobInstanceRef() { 19 | // TODO 20 | //release(); 21 | } 22 | 23 | void* JobInstanceRef::get_input_ptr() { 24 | return io_shm_entry_.ptr; 25 | } 26 | 27 | void* JobInstanceRef::get_output_ptr() { 28 | return reinterpret_cast(io_shm_entry_.ptr) + job_ref_->get_job()->get_input_size(); 29 | } 30 | 31 | void JobInstanceRef::launch() { 32 
| c2s_channel_->acquire_writer_lock(); 33 | 34 | c2s_channel_->write(MsgType::LAUNCH_JOB); 35 | #ifdef PRINT_LAUNCH_JOB_IPC_LATENCY 36 | unsigned long long cur_time = std::chrono::steady_clock::now().time_since_epoch().count(); 37 | c2s_channel_->write(cur_time); 38 | #endif 39 | c2s_channel_->write(job_ref_->get_job_ref_id()); 40 | c2s_channel_->write(io_shm_entry_.id); 41 | c2s_channel_->write(io_shm_entry_.offset); 42 | c2s_channel_->write(this); 43 | 44 | c2s_channel_->release_writer_lock(); 45 | } 46 | 47 | void JobInstanceRef::release() { 48 | job_ref_->release_io_shm_entry(io_shm_entry_); 49 | } 50 | 51 | void JobInstanceRef::set_id(JobInstanceRefId id) { 52 | id_ = id; 53 | } 54 | 55 | JobInstanceRefId JobInstanceRef::get_id() const { 56 | return id_; 57 | } 58 | 59 | JobRefId JobInstanceRef::get_job_ref_id() const { 60 | return job_ref_->get_job_ref_id(); 61 | } 62 | 63 | void JobInstanceRef::set_start_time(double time_point) { 64 | start_time_ = time_point; 65 | } 66 | 67 | double JobInstanceRef::get_start_time() const { 68 | return start_time_; 69 | } 70 | 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /src/ipc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(ipc OBJECT shm_channel.cu shm_primitive_channel.cu name_format.cpp unix_datagram_socket.cpp) 2 | set_target_properties(ipc PROPERTIES POSITION_INDEPENDENT_CODE ON) 3 | 4 | add_library(ipc-gpu OBJECT shm_channel.cu shm_primitive_channel.cu name_format.cpp unix_datagram_socket.cpp) 5 | set_target_properties(ipc-gpu PROPERTIES POSITION_INDEPENDENT_CODE ON CUDA_SEPARABLE_COMPILATION ON) 6 | -------------------------------------------------------------------------------- /src/ipc/name_format.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace llis { 5 | namespace ipc { 6 | 7 | std::string s2c_socket_name(const std::string& server_name, ClientId client_id) { 8 | return "llis-socket-s2c-" + server_name + "-" + std::to_string(client_id); 9 | } 10 | 11 | std::string s2c_channel_name(const std::string& server_name, ClientId client_id) { 12 | return "s2c:" + server_name + ":" + std::to_string(client_id); 13 | } 14 | 15 | std::string c2s_channel_name(const std::string& server_name) { 16 | return "c2s:" + server_name; 17 | } 18 | 19 | } 20 | } 21 | 22 | -------------------------------------------------------------------------------- /src/ipc/unix_datagram_socket.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace llis { 10 | namespace ipc { 11 | 12 | UnixDatagramSocket::UnixDatagramSocket() { 13 | socket_ = socket(AF_UNIX, SOCK_DGRAM, 0); 14 | utils::error_throw_posix(socket_); 15 | is_owner_ = true; 16 | } 17 | 18 | UnixDatagramSocket::UnixDatagramSocket(const std::string& name) { 19 | socket_ = socket(AF_UNIX, SOCK_DGRAM, 0); 20 | utils::error_throw_posix(socket_); 21 | is_owner_ = true; 22 | 23 | bind(name); 24 | } 25 | 26 | UnixDatagramSocket::UnixDatagramSocket(int socket) : socket_(socket), is_owner_(false) {} 27 | 28 | UnixDatagramSocket::UnixDatagramSocket(UnixDatagramSocket&& rhs) { 29 | *this = std::move(rhs); 30 | } 31 | 32 | UnixDatagramSocket& UnixDatagramSocket::operator=(UnixDatagramSocket&& rhs) { 33 | socket_ = rhs.socket_; 34 | is_owner_ = rhs.is_owner_; 35 | remote_addr_ = rhs.remote_addr_; 36 | 37 
| rhs.socket_ = -1; 38 | rhs.is_owner_ = false; 39 | 40 | return *this; 41 | } 42 | 43 | UnixDatagramSocket::~UnixDatagramSocket() { 44 | if (is_owner_) { 45 | utils::warn_log_posix(close(socket_)); 46 | 47 | is_owner_ = false; 48 | } 49 | 50 | socket_ = -1; 51 | } 52 | 53 | void UnixDatagramSocket::bind(const std::string& name) { 54 | sockaddr_un addr; 55 | bzero(&addr, sizeof(addr)); 56 | addr.sun_family = AF_UNIX; 57 | // TODO: check length. It should be < 108 bytes 58 | strncpy(addr.sun_path + 1, name.c_str(), 107); 59 | 60 | utils::error_throw_posix(::bind(socket_, reinterpret_cast(&addr), sizeof(addr))); 61 | } 62 | 63 | UnixDatagramSocket UnixDatagramSocket::connect(const std::string& name) { 64 | UnixDatagramSocket res(socket_); 65 | 66 | bzero(&res.remote_addr_, sizeof(res.remote_addr_)); 67 | res.remote_addr_.sun_family = AF_UNIX; 68 | // TODO: check length. It should be < 108 bytes 69 | strncpy(res.remote_addr_.sun_path + 1, name.c_str(), 107); 70 | 71 | return res; 72 | } 73 | 74 | ssize_t UnixDatagramSocket::write(const void* buf, size_t count) { 75 | ssize_t bytes_sent = sendto(socket_, buf, count, 0, reinterpret_cast(&remote_addr_), sizeof(remote_addr_)); 76 | utils::error_throw_posix(bytes_sent); 77 | return bytes_sent; 78 | } 79 | 80 | ssize_t UnixDatagramSocket::read(void* buf, size_t count) { 81 | ssize_t bytes_read = ::read(socket_, buf, count); 82 | utils::error_throw_posix(bytes_read); 83 | return bytes_read; 84 | } 85 | 86 | } 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/job/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(llis_job SHARED finished_block_notifier.cu $) 2 | set_target_properties(llis_job PROPERTIES POSITION_INDEPENDENT_CODE ON CUDA_SEPARABLE_COMPILATION ON) 3 | target_link_libraries(llis_job "-Wl,--no-as-needed" Boost::context "-Wl,--as-needed" spdlog::spdlog) 4 | install(TARGETS llis_job DESTINATION lib) 5 | 6 | add_library(llis_job_gpu SHARED finished_block_notifier.cu utils.cu $) 7 | set_target_properties(llis_job_gpu PROPERTIES POSITION_INDEPENDENT_CODE ON CUDA_SEPARABLE_COMPILATION ON) 8 | target_link_libraries(llis_job_gpu Boost::context spdlog::spdlog) 9 | install(TARGETS llis_job_gpu DESTINATION lib) 10 | 11 | add_library(llis_context SHARED context.cpp) 12 | target_link_libraries(llis_context llis_job spdlog::spdlog) 13 | install(TARGETS llis_context DESTINATION lib) 14 | -------------------------------------------------------------------------------- /src/job/context.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace llis { 6 | namespace job { 7 | 8 | Job* Context::current_job_; 9 | ipc::Gpu2SchedChannel Context::gpu2sched_channel_; 10 | #ifdef LLIS_MEASURE_BLOCK_TIME 11 | ipc::Gpu2SchedChannel Context::gpu2sched_block_time_channel_; 12 | #endif 13 | ipc::ShmChannelCpuWriter Context::mem2sched_channel_; 14 | 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /src/job/finished_block_notifier.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | namespace llis { 8 | namespace job { 9 | 10 | FinishedBlockNotifier::FinishedBlockNotifier(ipc::Gpu2SchedChannel* gpu2sched_channel 11 | #ifdef LLIS_MEASURE_BLOCK_TIME 12 | , ipc::Gpu2SchedChannel* gpu2sched_block_time_channel 13 | #endif 14 | ) { 
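// Take forked handles to the gpu2sched channel(s) rather than aliasing the caller's:
// create_array() below copies whole notifiers to the GPU with cudaMemcpy, so each one
// must own its channel state.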
15 | gpu2sched_channel_ = gpu2sched_channel->fork(); 16 | #ifdef LLIS_MEASURE_BLOCK_TIME 17 | gpu2sched_block_time_channel_ = gpu2sched_block_time_channel->fork(); 18 | #endif 19 | } 20 | 21 | FinishedBlockNotifier* FinishedBlockNotifier::create_array(unsigned num, ipc::Gpu2SchedChannel* gpu2sched_channel 22 | #ifdef LLIS_MEASURE_BLOCK_TIME 23 | , ipc::Gpu2SchedChannel* gpu2sched_block_time_channel 24 | #endif 25 | ) { 26 | FinishedBlockNotifier* res; 27 | utils::error_throw_cuda(cudaMalloc((void**)&res, num * sizeof(FinishedBlockNotifier))); 28 | 29 | std::vector tmp; 30 | tmp.reserve(num); 31 | for (unsigned i = 0; i < num; ++i) { 32 | tmp.emplace_back(gpu2sched_channel 33 | #ifdef LLIS_MEASURE_BLOCK_TIME 34 | , gpu2sched_block_time_channel 35 | #endif 36 | ); 37 | } 38 | 39 | utils::error_throw_cuda(cudaMemcpy(res, tmp.data(), num * sizeof(FinishedBlockNotifier), cudaMemcpyHostToDevice)); 40 | 41 | return res; 42 | } 43 | 44 | void FinishedBlockNotifier::free_array(FinishedBlockNotifier* ptr) { 45 | cudaFree(ptr); 46 | } 47 | 48 | } 49 | } 50 | 51 | -------------------------------------------------------------------------------- /src/job/utils.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace llis { 9 | namespace job { 10 | 11 | namespace { 12 | 13 | __global__ void memset_impl(void* ptr, int val, size_t count, JobId job_id, ipc::Gpu2SchedChannel gpu2sched_channel 14 | #ifdef LLIS_MEASURE_BLOCK_TIME 15 | , ipc::Gpu2SchedChannel gpu2sched_block_time_channel 16 | #endif 17 | ) { 18 | #ifdef LLIS_MEASURE_BLOCK_TIME 19 | BlockStartEndTime start_end_time; 20 | kernel_start(job_id, &gpu2sched_channel, &start_end_time); 21 | #else 22 | kernel_start(job_id, &gpu2sched_channel); 23 | #endif 24 | 25 | int id = blockIdx.x * blockDim.x + threadIdx.x; 26 | 27 | if (id < count) { 28 | (reinterpret_cast(ptr))[id] = val; 29 | } 30 | 31 | #ifdef LLIS_MEASURE_BLOCK_TIME 32 | kernel_end(job_id, &gpu2sched_channel, &gpu2sched_block_time_channel, &start_end_time); 33 | #else 34 | kernel_end(job_id, &gpu2sched_channel); 35 | #endif 36 | } 37 | 38 | } 39 | 40 | void memset_res(size_t count, Job* job) { 41 | constexpr int num_threads_per_block = 256; 42 | 43 | job->set_num_blocks(std::ceil((float)count / (float)num_threads_per_block)); 44 | job->set_num_threads_per_block(num_threads_per_block); 45 | job->set_num_registers_per_thread(32); 46 | job->set_smem_size_per_block(0); 47 | } 48 | 49 | void memset(void* ptr, int val, size_t count, Job* job, ipc::Gpu2SchedChannel* gpu2sched_channel 50 | #ifdef LLIS_MEASURE_BLOCK_TIME 51 | , ipc::Gpu2SchedChannel* gpu2sched_block_time_channel 52 | #endif 53 | ) { 54 | constexpr int num_threads_per_block = 256; 55 | 56 | #ifdef LLIS_MEASURE_BLOCK_TIME 57 | memset_impl<<get_num_blocks(), num_threads_per_block, job->get_smem_size_per_block(), job->get_cuda_stream()>>>(ptr, val, count, job->get_id(), gpu2sched_channel->fork(), gpu2sched_block_time_channel->fork()); 58 | #else 59 | memset_impl<<get_num_blocks(), num_threads_per_block, job->get_smem_size_per_block(), job->get_cuda_stream()>>>(ptr, val, count, job->get_id(), gpu2sched_channel->fork()); 60 | #endif 61 | } 62 | 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(llis_server server.cpp scheduler.cpp scheduler_fifo.cpp 
scheduler_fifo2.cpp scheduler_full3.cpp client_connection.cpp registered_job.cpp gpu_resources.cpp sm_resources.cpp profiler.cpp $ $) 2 | target_link_libraries(llis_server llis_job llis_context spdlog::spdlog Boost::program_options rt dl) 3 | install(TARGETS llis_server DESTINATION bin) 4 | -------------------------------------------------------------------------------- /src/server/client_connection.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eniac/paella/e2ed7b53272eb393e1361b0a87ceb8974efc237d/src/server/client_connection.cpp -------------------------------------------------------------------------------- /src/server/scheduler.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace llis { 4 | namespace server { 5 | 6 | std::unordered_map SchedulerFactory::registered_schedulers_; 7 | 8 | bool SchedulerFactory::register_scheduler(std::string name, RegisterFunc func) { 9 | if (registered_schedulers_.find(name) == registered_schedulers_.end()) { 10 | registered_schedulers_.emplace(name, func); 11 | return true; 12 | } else { 13 | return false; 14 | } 15 | } 16 | 17 | std::unique_ptr SchedulerFactory::create(std::string name, const po::variables_map& args) { 18 | auto it = registered_schedulers_.find(name); 19 | if (it == registered_schedulers_.end()) { 20 | return nullptr; 21 | } else { 22 | return it->second(args); 23 | } 24 | } 25 | 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(ipc) 2 | add_subdirectory(client) 3 | add_subdirectory(simple) 4 | add_subdirectory(utils) 5 | -------------------------------------------------------------------------------- /tests/client/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(test_client client.cpp $ $) 2 | target_link_libraries(test_client spdlog::spdlog dl rt) 3 | 4 | add_executable(test_client_concurrent_runs client_concurrent_runs.cpp $ $) 5 | target_link_libraries(test_client_concurrent_runs spdlog::spdlog dl rt) 6 | 7 | add_executable(test_client_single_latency client_single_latency.cpp $ $) 8 | target_link_libraries(test_client_single_latency spdlog::spdlog dl rt) 9 | 10 | add_executable(test_client_concurrent_run_latencies client_concurrent_run_latencies.cpp $ $) 11 | target_link_libraries(test_client_concurrent_run_latencies spdlog::spdlog dl rt) 12 | 13 | add_executable(test_client_concurrent_run_latencies_set_load client_concurrent_run_latencies_set_load.cpp $ $) 14 | target_link_libraries(test_client_concurrent_run_latencies_set_load spdlog::spdlog dl rt) 15 | 16 | add_executable(test_client_concurrent_run_latencies_set_load_multi client_concurrent_run_latencies_set_load_multi.cpp $ $) 17 | target_link_libraries(test_client_concurrent_run_latencies_set_load_multi spdlog::spdlog dl rt) 18 | 19 | add_executable(test_raw_kernel_launch raw_kernel_launch.cu $) 20 | target_link_libraries(test_raw_kernel_launch llis_job dl rt) 21 | set_target_properties(test_raw_kernel_launch PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 22 | -------------------------------------------------------------------------------- /tests/client/client.cpp: -------------------------------------------------------------------------------- 1 | #include "llis/client/job_instance_ref.h" 2 | #include 3 | 
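// Smoke test: connect to a running llis_server, register a job shared object,
// create a single job instance, and launch it (without waiting for completion).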
4 | int main(int argc, char** argv) { 5 | const char* server_name = argv[1]; 6 | const char* job_path = argv[2]; 7 | 8 | llis::client::Client client(server_name); 9 | llis::client::JobRef job_ref = client.register_job(job_path); 10 | llis::client::JobInstanceRef* job_instance_ref = job_ref.create_instance(); 11 | job_instance_ref->launch(); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /tests/client/client_concurrent_runs.cpp: -------------------------------------------------------------------------------- 1 | #include "llis/client/job_instance_ref.h" 2 | #include 3 | 4 | int main(int argc, char** argv) { 5 | const char* server_name = argv[1]; 6 | const char* job_path = argv[2]; 7 | int num = atoi(argv[3]); 8 | 9 | llis::client::Client client(server_name); 10 | llis::client::JobRef job_ref = client.register_job(job_path); 11 | 12 | std::vector job_instance_refs; 13 | job_instance_refs.reserve(num); 14 | 15 | for (int i = 0; i < num; ++i) { 16 | job_instance_refs.push_back(job_ref.create_instance()); 17 | job_instance_refs.back()->launch(); 18 | } 19 | } 20 | 21 | -------------------------------------------------------------------------------- /tests/client/client_single_latency.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | int main(int argc, char** argv) { 8 | const char* server_name = argv[1]; 9 | const char* job_path = argv[2]; 10 | int num = atoi(argv[3]); 11 | const char* profile_path = nullptr; 12 | if (argc >= 5) { 13 | profile_path = argv[4]; 14 | } 15 | 16 | llis::client::Client client(server_name); 17 | llis::client::JobRef job_ref = client.register_job(job_path); 18 | llis::client::JobInstanceRef* job_instance_ref = job_ref.create_instance(); 19 | 20 | client.get_profiler_client()->set_record_kernel_info(); 21 | 22 | for (int i = 0; i < num; ++i) { 23 | auto start_time = std::chrono::steady_clock::now(); 24 | job_instance_ref->launch(); 25 | client.wait(); 26 | auto end_time = std::chrono::steady_clock::now(); 27 | 28 | auto time_taken = end_time - start_time; 29 | 30 | std::cout << std::chrono::duration(time_taken).count() << std::endl; 31 | } 32 | 33 | client.get_profiler_client()->unset_record_kernel_info(); 34 | if (profile_path != nullptr) { 35 | client.get_profiler_client()->save(profile_path); 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /tests/client/raw_kernel_launch.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | __global__ void helloworld(int i, llis::JobId job_id, llis::job::FinishedBlockNotifier* notifier) { 10 | //notifier->start(job_id); 11 | //notifier->end(job_id); 12 | } 13 | 14 | int main(int argc, char** argv) { 15 | int num_blocks = atoi(argv[1]); 16 | int num_iters = atoi(argv[2]); 17 | 18 | cudaStream_t stream; 19 | cudaStreamCreate(&stream); 20 | 21 | llis::ipc::Gpu2SchedChannel gpu2sched_channel(1024); 22 | llis::ipc::Gpu2SchedChannel gpu2sched_block_time_channel(1024); 23 | 24 | llis::job::FinishedBlockNotifier* finished_block_notifier = llis::job::FinishedBlockNotifier::create_array(1, &gpu2sched_channel 25 | #ifdef LLIS_MEASURE_BLOCK_TIME 26 | , &gpu2sched_block_time_channel 27 | #endif 28 | ); 29 | 30 | for (int i = 0; i < num_iters; ++i) { 31 | auto start_time = std::chrono::steady_clock::now(); 32 | 33 | 
helloworld<<>>(i, 0, finished_block_notifier); 34 | cudaStreamSynchronize(stream); 35 | 36 | auto end_time = std::chrono::steady_clock::now(); 37 | 38 | auto time_taken = end_time - start_time; 39 | std::cout << std::chrono::duration(time_taken).count() << std::endl; 40 | } 41 | } 42 | 43 | -------------------------------------------------------------------------------- /tests/ipc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(shm_channel) 2 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(latency) 2 | 3 | add_executable(shmc_read shmc_read.cpp $) 4 | target_link_libraries(shmc_read rt) 5 | add_executable(shmc_write shmc_write.cpp $) 6 | target_link_libraries(shmc_write rt) 7 | add_executable(shmc_read_write shmc_read_write.cpp $) 8 | target_link_libraries(shmc_read_write rt) 9 | add_executable(shmc_read_write_same_proc shmc_read_write_same_proc.cpp $) 10 | target_link_libraries(shmc_read_write_same_proc rt pthread) 11 | add_executable(shmc_read_write_cpu_gpu shmc_read_write_cpu_gpu.cu $) 12 | set_target_properties(shmc_read_write_cpu_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 13 | target_link_libraries(shmc_read_write_cpu_gpu rt pthread) 14 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(shmc_latency_read shmc_latency_read.cpp $) 2 | target_link_libraries(shmc_latency_read rt) 3 | add_executable(shmc_latency_write shmc_latency_write.cpp $) 4 | target_link_libraries(shmc_latency_write rt) 5 | add_executable(shmc_latency_read_bare_atomic shmc_latency_read_bare_atomic.cpp $) 6 | target_link_libraries(shmc_latency_read_bare_atomic rt) 7 | add_executable(shmc_latency_write_bare_atomic shmc_latency_write_bare_atomic.cpp $) 8 | target_link_libraries(shmc_latency_write_bare_atomic rt) 9 | add_executable(shmc_latency_read_bare_atomic_loop shmc_latency_read_bare_atomic_loop.cpp $) 10 | target_link_libraries(shmc_latency_read_bare_atomic_loop rt) 11 | add_executable(shmc_latency_write_bare_atomic_loop shmc_latency_write_bare_atomic_loop.cpp $) 12 | target_link_libraries(shmc_latency_write_bare_atomic_loop rt) 13 | add_executable(shmc_latency_read_loop shmc_latency_read_loop.cpp $) 14 | target_link_libraries(shmc_latency_read_loop rt) 15 | add_executable(shmc_latency_write_loop shmc_latency_write_loop.cpp $) 16 | target_link_libraries(shmc_latency_write_loop rt) 17 | add_executable(shmpc_latency_gpu shmpc_latency_gpu.cu) 18 | target_link_libraries(shmpc_latency_gpu llis_job rt) 19 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_read.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | int main() { 7 | int val; 8 | 9 | // The channel has a size of sizeof(val) + 1 10 | // This makes sure that the writer can only write after the reader has read 11 | // +1 because the writer always wastes one byte 12 | llis::ipc::ShmChannelCpuReader channel("test", sizeof(val) + 1); 13 | 14 | // The first read is a barrier to make sure that both sides are in the same stage 15 | channel.read(&val, sizeof(val)); 16 | 17 | channel.read(&val, 
sizeof(val)); 18 | 19 | std::cout << "Current time since epoch: " << std::chrono::system_clock::now().time_since_epoch().count() << std::endl; 20 | 21 | std::cout << "Value: " << val << std::endl; 22 | } 23 | 24 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_read_bare_atomic.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | std::string name_with_prefix = "ml-on-apu:test"; 13 | int fd_ = shm_open(name_with_prefix.c_str(), O_CREAT | O_RDWR, 0600); 14 | ftruncate(fd_, sizeof(std::atomic_bool)); 15 | std::atomic_char* shm_ = reinterpret_cast(mmap(nullptr, sizeof(std::atomic_char), PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0)); 16 | 17 | int a = shm_->load(std::memory_order_acquire); 18 | int b = shm_->load(std::memory_order_acquire); 19 | 20 | while (shm_->load(std::memory_order_acquire) != 3) {} 21 | 22 | //while (shm_->load(std::memory_order_acquire)) {} 23 | auto cur_time = std::chrono::system_clock::now().time_since_epoch().count(); 24 | std::cout << "Current time since epoch: " << cur_time << std::endl; 25 | std::cout << "a: " << a << std::endl; 26 | std::cout << "b: " << b << std::endl; 27 | } 28 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_read_bare_atomic_loop.cpp: -------------------------------------------------------------------------------- 1 | #define NUM_ITERS 1000000 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | int main() { 14 | std::string name_with_prefix = "ml-on-apu:test"; 15 | int fd_ = shm_open(name_with_prefix.c_str(), O_CREAT | O_RDWR, 0600); 16 | ftruncate(fd_, sizeof(std::atomic_bool)); 17 | std::atomic_int* shm_ = reinterpret_cast(mmap(nullptr, sizeof(std::atomic_int), PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0)); 18 | 19 | int a = shm_->load(std::memory_order_acquire); 20 | int b = shm_->load(std::memory_order_acquire); 21 | 22 | auto time1 = std::chrono::system_clock::now().time_since_epoch().count(); 23 | 24 | for (int i = 1; i < NUM_ITERS; ++i) { 25 | while (shm_->load(std::memory_order_acquire) != i) {} 26 | shm_->store(0, std::memory_order_release); 27 | } 28 | while (shm_->load(std::memory_order_acquire) != NUM_ITERS) {} 29 | 30 | auto time2 = std::chrono::system_clock::now().time_since_epoch().count(); 31 | 32 | std::cout << "time1: " << time1 << std::endl; 33 | std::cout << "time2: " << time2 << std::endl; 34 | std::cout << "a: " << a << std::endl; 35 | std::cout << "b: " << b << std::endl; 36 | } 37 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_read_loop.cpp: -------------------------------------------------------------------------------- 1 | #define CHANNEL_SIZE 64 2 | #define NUM_ITERS 1000000 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | int main() { 10 | int val = 0; 11 | 12 | llis::ipc::ShmChannelCpuReader read_channel("test_read", 64); 13 | llis::ipc::ShmChannelCpuWriter write_channel("test_write", 64); 14 | 15 | // Warm up 16 | write_channel.write(&val, sizeof(val)); 17 | write_channel.write(&val, sizeof(val)); 18 | read_channel.read(&val, sizeof(val)); 19 | read_channel.read(&val, sizeof(val)); 20 | 21 | auto time1 = 
std::chrono::system_clock::now().time_since_epoch().count(); 22 | 23 | for (int i = 1; i < NUM_ITERS; ++i) { 24 | read_channel.read(&val, sizeof(val)); 25 | write_channel.write(&i, sizeof(i)); 26 | } 27 | read_channel.read(&val, sizeof(val)); 28 | 29 | auto time2 = std::chrono::system_clock::now().time_since_epoch().count(); 30 | 31 | std::cout << "time1: " << time1 << std::endl; 32 | std::cout << "time2: " << time2 << std::endl; 33 | } 34 | 35 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_write.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | int main() { 7 | int val = 1234; 8 | int val2 = 5678; 9 | 10 | llis::ipc::ShmChannelCpuWriter channel("test", sizeof(val) + 1); 11 | 12 | channel.write(&val2, sizeof(val2)); 13 | 14 | 15 | channel.write(&val, sizeof(val)); 16 | std::cout << "Current time since epoch: " << std::chrono::system_clock::now().time_since_epoch().count() << std::endl; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_write_bare_atomic.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | std::string name_with_prefix = "ml-on-apu:test"; 13 | int fd_ = shm_open(name_with_prefix.c_str(), O_CREAT | O_RDWR, 0600); 14 | ftruncate(fd_, sizeof(std::atomic_bool)); 15 | std::atomic_char* shm_ = reinterpret_cast(mmap(nullptr, sizeof(std::atomic_char), PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0)); 16 | 17 | shm_->store(1, std::memory_order_release); 18 | shm_->store(2, std::memory_order_release); 19 | 20 | auto time1 = std::chrono::system_clock::now().time_since_epoch().count(); 21 | shm_->store(3, std::memory_order_release); 22 | 23 | auto time3 = std::chrono::system_clock::now().time_since_epoch().count(); 24 | 25 | std::cout << "time1: " << time1 << std::endl; 26 | //std::cout << "time2: " << time2 << std::endl; 27 | std::cout << "time3: " << time3 << std::endl; 28 | } 29 | 30 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_write_bare_atomic_loop.cpp: -------------------------------------------------------------------------------- 1 | #define NUM_ITERS 1000000 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | int main() { 14 | std::string name_with_prefix = "ml-on-apu:test"; 15 | int fd_ = shm_open(name_with_prefix.c_str(), O_CREAT | O_RDWR, 0600); 16 | ftruncate(fd_, sizeof(std::atomic_bool)); 17 | std::atomic_int* shm_ = reinterpret_cast(mmap(nullptr, sizeof(std::atomic_int), PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0)); 18 | 19 | shm_->store(0, std::memory_order_release); 20 | shm_->store(2, std::memory_order_release); 21 | 22 | auto time1 = std::chrono::system_clock::now().time_since_epoch().count(); 23 | 24 | for (int i = 1; i < NUM_ITERS; ++i) { 25 | shm_->store(i, std::memory_order_release); 26 | while (shm_->load(std::memory_order_acquire) != 0) {} 27 | } 28 | shm_->store(NUM_ITERS, std::memory_order_release); 29 | 30 | auto time2 = std::chrono::system_clock::now().time_since_epoch().count(); 31 | 32 | std::cout << "time1: " << time1 << std::endl; 33 | std::cout << "time2: " << time2 << std::endl; 34 | } 
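A note on interpreting the output of these `*_loop` benchmarks: each loop iteration is one full write/read ping-pong, so the per-round-trip cost is the difference of the two printed timestamps divided by the iteration count. A trivial post-processing sketch (assuming, as with libstdc++ on Linux, that `system_clock` ticks are nanoseconds):

```python
NUM_ITERS = 1_000_000  # matches the #define in the benchmarks

def mean_round_trip_ns(time1: int, time2: int, num_iters: int = NUM_ITERS) -> float:
    """Mean cost of one ping-pong (write + matching read), in nanoseconds."""
    return (time2 - time1) / num_iters
```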
35 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmc_latency_write_loop.cpp: -------------------------------------------------------------------------------- 1 | #define CHANNEL_SIZE 64 2 | #define NUM_ITERS 1000000 3 | 4 | #include <llis/ipc/shm_channel.h> 5 | 6 | #include <chrono> 7 | #include <iostream> 8 | 9 | int main() { 10 | int val = 0; 11 | 12 | llis::ipc::ShmChannelCpuWriter write_channel("test_write", CHANNEL_SIZE); 13 | llis::ipc::ShmChannelCpuReader read_channel("test_read", CHANNEL_SIZE); 14 | 15 | // Warm up 16 | write_channel.write(&val, sizeof(val)); 17 | write_channel.write(&val, sizeof(val)); 18 | read_channel.read(&val, sizeof(val)); 19 | read_channel.read(&val, sizeof(val)); 20 | 21 | auto time1 = std::chrono::system_clock::now().time_since_epoch().count(); 22 | 23 | for (int i = 1; i < NUM_ITERS; ++i) { 24 | write_channel.write(&i, sizeof(i)); 25 | read_channel.read(&val, sizeof(val)); 26 | } 27 | val = NUM_ITERS; 28 | write_channel.write(&val, sizeof(val)); 29 | 30 | auto time2 = std::chrono::system_clock::now().time_since_epoch().count(); 31 | 32 | std::cout << "time1: " << time1 << std::endl; 33 | std::cout << "time2: " << time2 << std::endl; 34 | } 35 | 36 | 37 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/latency/shmpc_latency_gpu.cu: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_primitive_channel.h> 2 | #include <llis/job/instrument_info.h> 3 | 4 | #include <chrono> 5 | #include <iostream> 6 | 7 | __global__ void kernel(float* output, float* input, unsigned count, llis::ipc::ShmPrimitiveChannelGpu<llis::job::InstrumentInfo> channel) { 8 | //__global__ void kernel(llis::ipc::ShmPrimitiveChannelGpu<llis::job::InstrumentInfo> channel) { 9 | if (threadIdx.x == 0) { 10 | llis::job::InstrumentInfo info; 11 | info.is_start = true; 12 | info.job_id = blockIdx.x; 13 | channel.write(info); 14 | } 15 | 16 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 17 | unsigned grid_size = blockDim.x * gridDim.x; 18 | 19 | while (id < count) { 20 | output[id] += input[id]; 21 | id += grid_size; 22 | } 23 | 24 | __syncthreads(); 25 | 26 | if (threadIdx.x == 0) { 27 | llis::job::InstrumentInfo info; 28 | info.is_start = false; 29 | info.job_id = blockIdx.x; 30 | channel.write(info); 31 | } 32 | } 33 | 34 | int main(int argc, char** argv) { 35 | unsigned num_blocks = atoi(argv[1]); 36 | unsigned vec_len = atoi(argv[2]); 37 | unsigned num_iters = atoi(argv[3]); 38 | 39 | float* x; 40 | float* y; 41 | cudaMalloc(&x, sizeof(*x) * vec_len); 42 | cudaMalloc(&y, sizeof(*y) * vec_len); 43 | 44 | llis::ipc::ShmPrimitiveChannelGpu<llis::job::InstrumentInfo> channel("", 1024000); 45 | 46 | for (int i = 0; i < num_iters; ++i) { 47 | kernel<<<num_blocks, 256>>>(y, x, vec_len, channel.fork()); // NOTE: block size was garbled in this dump; 256 threads/block is a stand-in 48 | //kernel<<<num_blocks, 256>>>(channel.fork()); 49 | for (int j = 0; j < num_blocks * 2; ++j) { 50 | (void)channel.read(); 51 | } 52 | } 53 | 54 | auto start_time = std::chrono::steady_clock::now(); 55 | 56 | for (int i = 0; i < num_iters; ++i) { 57 | kernel<<<num_blocks, 256>>>(y, x, vec_len, channel.fork()); 58 | //kernel<<<num_blocks, 256>>>(channel.fork()); 59 | for (int j = 0; j < num_blocks * 2; ++j) { 60 | (void)channel.read(); 61 | } 62 | } 63 | 64 | auto end_time = std::chrono::steady_clock::now(); 65 | 66 | std::cout << "Time elapsed: " << std::chrono::duration<double, std::micro>(end_time - start_time).count() / (double)num_iters << " us" << std::endl; 67 | } 68 | 69 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_read.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | int main() { 4 |
llis::ipc::ShmChannelCpuReader channel("test", 64); 5 | for (int i = 0; i < 10000; ++i) { 6 | int val; 7 | channel.read(&val, sizeof(val)); 8 | if (val != i) { 9 | printf("Error! Expected: %d, Actual: %d\n", i, val); 10 | break; 11 | } 12 | } 13 | } 14 | 15 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_read_write.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | int main() { 4 | llis::ipc::ShmChannelCpuReader channelRead("test"); 5 | llis::ipc::ShmChannelCpuWriter channelWrite = channelRead.fork(); 6 | for (int i = 0; i < 10000; ++i) { 7 | channelWrite.write(&i, sizeof(i)); 8 | int val = -1; 9 | channelRead.read(&val, sizeof(val)); 10 | if (val != i) { 11 | printf("Error! Expected: %d, Actual: %d\n", i, val); 12 | break; 13 | } 14 | } 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_read_write_cpu_gpu.cu: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | #include <thread> 4 | 5 | void reader(llis::ipc::ShmChannelGpuReader* channel) { 6 | for (int i = 0; i < 10000; ++i) { 7 | int val; 8 | channel->read(&val, sizeof(val)); 9 | if (val != i) { 10 | printf("Error! Expected: %d, Actual: %d\n", i, val); 11 | break; 12 | } 13 | } 14 | } 15 | 16 | __global__ void writer(llis::ipc::ShmChannelGpuWriter channel) { 17 | for (int i = 0; i < 10000; ++i) { 18 | channel.write(i); 19 | } 20 | } 21 | 22 | int main() { 23 | llis::ipc::ShmChannelGpuReader channel(64); 24 | llis::ipc::ShmChannelGpuWriter channel_gpu(&channel); 25 | 26 | std::thread reader_thr(reader, &channel); 27 | 28 | writer<<<1, 1>>>(std::move(channel_gpu)); 29 | 30 | reader_thr.join(); 31 | cudaDeviceSynchronize(); 32 | } 33 | 34 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_read_write_same_proc.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | #include <thread> 4 | 5 | void reader(llis::ipc::ShmChannelCpuReader* channel) { 6 | for (int i = 0; i < 10000; ++i) { 7 | int val; 8 | channel->read(&val, sizeof(val)); 9 | if (val != i) { 10 | printf("Error!
Expected: %d, Actual: %d\n", i, val); 11 | break; 12 | } 13 | } 14 | } 15 | 16 | void writer(llis::ipc::ShmChannelCpuWriter* channel) { 17 | for (int i = 0; i < 10000; ++i) { 18 | channel->write(i); 19 | } 20 | } 21 | 22 | int main() { 23 | llis::ipc::ShmChannelCpuReader channelRead(64); 24 | llis::ipc::ShmChannelCpuWriter channelWrite = channelRead.fork(); 25 | 26 | std::thread reader_thr(reader, &channelRead); 27 | std::thread writer_thr(writer, &channelWrite); 28 | 29 | reader_thr.join(); 30 | writer_thr.join(); 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tests/ipc/shm_channel/shmc_write.cpp: -------------------------------------------------------------------------------- 1 | #include <llis/ipc/shm_channel.h> 2 | 3 | int main() { 4 | llis::ipc::ShmChannelCpuWriter channel("test"); 5 | for (int i = 0; i < 10000; ++i) { 6 | channel.write(&i, sizeof(i)); 7 | } 8 | } 9 | 10 | -------------------------------------------------------------------------------- /tests/simple/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #add_executable(test_direct_run_job direct_run_job.cpp $) 2 | #target_link_libraries(test_direct_run_job llis_job_gpu llis_context CUDA::cudart dl rt) 3 | 4 | if(tvm_FOUND) 5 | add_executable(tvm_direct_concurrent tvm_direct_concurrent.cpp) 6 | target_link_libraries(tvm_direct_concurrent tvm::tvm_runtime llis_context CUDA::cudart pthread) 7 | 8 | add_executable(tvm_direct_multistream tvm_direct_multistream.cpp) 9 | target_link_libraries(tvm_direct_multistream tvm::tvm_runtime llis_context CUDA::cudart pthread) 10 | target_compile_definitions(tvm_direct_multistream PRIVATE SUBMIT_DIS) 11 | install(TARGETS tvm_direct_multistream DESTINATION bin) 12 | 13 | add_executable(tvm_direct_multistream_pregen tvm_direct_multistream.cpp) 14 | target_link_libraries(tvm_direct_multistream_pregen tvm::tvm_runtime llis_context CUDA::cudart pthread) 15 | target_compile_definitions(tvm_direct_multistream_pregen PRIVATE SUBMIT_PREGEN) 16 | target_compile_options(tvm_direct_multistream_pregen PUBLIC "-fPIC" PUBLIC "-fPIE") 17 | install(TARGETS tvm_direct_multistream_pregen DESTINATION bin) 18 | endif(tvm_FOUND) 19 | 20 | add_executable(cuda_sync_benchmark cuda_sync_benchmark.cu) 21 | target_link_libraries(cuda_sync_benchmark CUDA::cudart dl rt) 22 | 23 | add_executable(cuda_callback_benchmark cuda_callback_benchmark.cu) 24 | target_link_libraries(cuda_callback_benchmark CUDA::cudart dl rt) 25 | 26 | add_executable(test_mmap_mlock_limit mmap_mlock_limit.cpp) 27 | target_link_libraries(test_mmap_mlock_limit rt) -------------------------------------------------------------------------------- /tests/simple/cuda_callback_benchmark.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | 3 | #include <atomic> 4 | #include <chrono> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | #include <mutex> 8 | #include <queue> 9 | #include <thread> 10 | #include <vector> 11 | 12 | std::vector<cudaStream_t> streams; 13 | std::queue<unsigned> noti_queue; 14 | std::atomic_uint noti_queue_num; 15 | std::mutex mtx; 16 | 17 | __global__ void dummy() { 18 | } 19 | 20 | void callback(void* stream_id_) { 21 | unsigned stream_id = (unsigned long)stream_id_; 22 | std::unique_lock<std::mutex> lock(mtx); 23 | noti_queue.push(stream_id); 24 | lock.unlock(); 25 | noti_queue_num.fetch_add(1, std::memory_order_release); 26 | } 27 | 28 | int main(int argc, char** argv) { 29 | unsigned num_iter = atoi(argv[1]); 30 | unsigned num_streams = atoi(argv[2]); 31 | 32 | streams.resize(num_streams); 33 | for
(unsigned i = 0; i < num_streams; ++i) { 34 | cudaStreamCreate(&streams[i]); 35 | } 36 | 37 | std::vector<unsigned> streams_finished; 38 | streams_finished.resize(num_streams); 39 | 40 | unsigned total_finished = 0; 41 | const unsigned total_num = num_iter * num_streams; 42 | 43 | noti_queue_num.store(0); 44 | 45 | auto start_time = std::chrono::steady_clock::now(); 46 | 47 | for (unsigned i = 0; i < num_streams; ++i) { 48 | dummy<<<1, 1, 0, streams[i]>>>(); 49 | cudaLaunchHostFunc(streams[i], callback, (void*)i); 50 | } 51 | 52 | while (total_finished < total_num) { 53 | while (noti_queue_num.load(std::memory_order_acquire) == 0); 54 | noti_queue_num.fetch_sub(1, std::memory_order_release); 55 | 56 | std::unique_lock<std::mutex> lock(mtx); 57 | unsigned stream_id = noti_queue.front(); 58 | noti_queue.pop(); 59 | lock.unlock(); 60 | 61 | ++streams_finished[stream_id]; 62 | ++total_finished; 63 | 64 | if (streams_finished[stream_id] < num_iter) { 65 | dummy<<<1, 1, 0, streams[stream_id]>>>(); 66 | cudaLaunchHostFunc(streams[stream_id], callback, (void*)stream_id); 67 | } 68 | } 69 | 70 | auto end_time = std::chrono::steady_clock::now(); 71 | 72 | double time_elapsed = std::chrono::duration<double, std::micro>(end_time - start_time).count(); // duration ratio was garbled in this dump; microseconds assumed 73 | 74 | printf("%f\n", time_elapsed); 75 | } 76 | 77 | -------------------------------------------------------------------------------- /tests/simple/cuda_sync_benchmark.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | 3 | #include <chrono> 4 | #include <cstdio> 5 | #include <cstdlib> 6 | #include <thread> 7 | #include <vector> 8 | 9 | __global__ void dummy_cuda_sync() { 10 | } 11 | 12 | __global__ void dummy_flag(volatile int* flag) { 13 | __threadfence_system(); 14 | *flag = 1; 15 | } 16 | 17 | void run_cuda_sync(int num_iter, cudaStream_t stream) { 18 | for (int i = 0; i < num_iter; ++i) { 19 | dummy_cuda_sync<<<1, 1, 0, stream>>>(); 20 | cudaStreamSynchronize(stream); 21 | } 22 | } 23 | 24 | void run_flag(int num_iter, cudaStream_t stream, volatile int* flag) { 25 | for (int i = 0; i < num_iter; ++i) { 26 | *flag = 0; 27 | 28 | dummy_flag<<<1, 1, 0, stream>>>(flag); 29 | 30 | while (*flag == 0); 31 | } 32 | } 33 | 34 | int main(int argc, char** argv) { 35 | int num_iter = atoi(argv[1]); 36 | int num_thrs = atoi(argv[2]); 37 | 38 | std::vector<cudaStream_t> streams; 39 | streams.resize(num_thrs); 40 | for (int i = 0; i < num_thrs; ++i) { 41 | cudaStreamCreate(&streams[i]); 42 | } 43 | 44 | volatile int* flags; 45 | cudaMallocHost(&flags, sizeof(int) * num_thrs); 46 | 47 | cudaSetDeviceFlags(cudaDeviceScheduleSpin); 48 | 49 | double time_cuda_sync; 50 | double time_flag; 51 | 52 | { 53 | std::vector<std::thread> thrs; 54 | auto start_time = std::chrono::steady_clock::now(); 55 | for (int i = 0; i < num_thrs; ++i) { 56 | thrs.emplace_back(run_cuda_sync, num_iter, streams[i]); 57 | } 58 | for (auto& thr : thrs) { 59 | thr.join(); 60 | } 61 | auto end_time = std::chrono::steady_clock::now(); 62 | time_cuda_sync = std::chrono::duration<double, std::micro>(end_time - start_time).count(); // duration ratio was garbled in this dump; microseconds assumed 63 | } 64 | 65 | { 66 | std::vector<std::thread> thrs; 67 | auto start_time = std::chrono::steady_clock::now(); 68 | for (int i = 0; i < num_thrs; ++i) { 69 | thrs.emplace_back(run_flag, num_iter, streams[i], flags + i); 70 | } 71 | for (auto& thr : thrs) { 72 | thr.join(); 73 | } 74 | auto end_time = std::chrono::steady_clock::now(); 75 | time_flag = std::chrono::duration<double, std::micro>(end_time - start_time).count(); 76 | } 77 | 78 | printf("%f,%f\n", time_cuda_sync, time_flag); 79 | } 80 | 81 | --------------------------------------------------------------------------------
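The two benchmarks above probe the same question — how quickly the host can learn that a kernel finished — via three mechanisms: cuda_callback_benchmark.cu uses cudaLaunchHostFunc callbacks, while cuda_sync_benchmark.cu pits cudaStreamSynchronize against busy-polling a host-pinned flag that the kernel raises after a __threadfence_system(). For readers who want to try the flag trick outside the benchmark harness, here is a self-contained sketch (our own file, not part of the repo):

#include <cuda_runtime.h>

#include <cstdio>

// The kernel publishes completion by writing to host-pinned memory.
__global__ void set_flag(volatile int* flag) {
    __threadfence_system();  // make prior device writes visible to the host
    *flag = 1;
}

int main() {
    volatile int* flag;
    cudaMallocHost(&flag, sizeof(int));  // pinned allocation, directly readable by the host
    *flag = 0;

    set_flag<<<1, 1>>>(flag);
    while (*flag == 0) {}    // spin on the flag instead of calling into the driver

    std::printf("kernel completion observed via flag\n");
    cudaFreeHost((void*)flag);  // cast drops volatile for the deallocation API
}

cuda_sync_benchmark.cu prints the two totals as "time_cuda_sync,time_flag" (microseconds, assuming the duration ratio noted in the listing above); dividing each by num_iter gives a rough per-launch round-trip cost for the two mechanisms under the chosen thread count.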
/tests/simple/mmap_mlock_limit.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <cerrno> 4 | 5 | #include <fcntl.h> 6 | #include <sys/mman.h> 7 | #include <unistd.h> 8 | 9 | int main(int argc, char** argv) { 10 | unsigned num = atoi(argv[1]); 11 | unsigned base_size = atoi(argv[2]); 12 | 13 | int shm_fd = shm_open("test_mmap_mlock_limit", O_RDWR | O_CREAT, 0600); 14 | 15 | if (shm_fd == -1) { 16 | printf("shm_open error: %d\n", errno); 17 | } 18 | 19 | size_t total_size = 0; 20 | for (int i = 0; i < num; ++i) { 21 | size_t this_size = base_size * (i + 1); 22 | total_size += this_size; 23 | 24 | int trunc_ret = ftruncate(shm_fd, total_size); 25 | if (trunc_ret == -1) { 26 | printf("ftruncate error: %d, i = %d, total_size: %lu, this_size: %lu\n", errno, i, total_size, this_size); 27 | break; 28 | } 29 | 30 | void* ptr = mmap(nullptr, this_size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, total_size - this_size); 31 | if (ptr == MAP_FAILED) { 32 | printf("mmap error: %d, i = %d, total_size: %lu, this_size: %lu\n", errno, i, total_size, this_size); 33 | break; 34 | } 35 | 36 | int mlock_ret = mlock(ptr, this_size); 37 | if (mlock_ret == -1) { 38 | printf("mlock error: %d, i = %d, total_size: %lu, this_size: %lu\n", errno, i, total_size, this_size); 39 | break; 40 | } 41 | } 42 | 43 | shm_unlink("test_mmap_mlock_limit"); 44 | } 45 | 46 | -------------------------------------------------------------------------------- /tests/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(workload_pregen workload_pregen.cpp) 2 | install(TARGETS workload_pregen DESTINATION bin) -------------------------------------------------------------------------------- /tools/calculate_jains_fairness_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import numpy as np 4 | import argparse 5 | 6 | if __name__ == "__main__": 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-o', '--output_path', dest='output_path'); 9 | parser.add_argument('-b', '--input_baseline_path', dest='input_baseline_paths', action='append'); 10 | parser.add_argument('-i', '--input_path', dest='input_path'); 11 | 12 | args = parser.parse_args() 13 | 14 | data = np.genfromtxt(args.input_path, delimiter=',') 15 | baseline_data = [np.genfromtxt(path, delimiter=',') for path in args.input_baseline_paths] 16 | 17 | baseline_latencies = np.array([x[2] for x in baseline_data], dtype=float) 18 | 19 | slowdown_factors = data[:, 9::7] / baseline_latencies 20 | 21 | fairness_indices = np.sum(slowdown_factors, axis=1, keepdims=True) ** 2. / (np.sum(slowdown_factors ** 2, axis=1, keepdims=True) * len(baseline_latencies)) 22 | 23 | results = np.concatenate([data[:, 0:1], fairness_indices, slowdown_factors], axis=1) 24 | 25 | np.savetxt(args.output_path, results, delimiter=',') -------------------------------------------------------------------------------- /tools/parse_input_kelvin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def parse_input_kelvin(path, xaxis_name, yaxis_names, model_ids): 4 | yaxis_name2offset = {'Mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 5 | 6 | data = np.genfromtxt(path, delimiter=',') 7 | 8 | if xaxis_name == 'throughput': 9 | x_data = data[:, 1] / data[:, 2] * 1000000 10 | else: # sending rate 11 | x_data = 1000000.
/ data[:, 0] 12 | 13 | y_data = [[data[:, model_id * 7 + yaxis_name2offset[yaxis_name] + 3] / 1000. for yaxis_name in yaxis_names] for model_id in model_ids] 14 | 15 | return x_data, y_data 16 | -------------------------------------------------------------------------------- /tools/plot_block_exec_times_cdf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import sys 6 | 7 | if __name__ == "__main__": 8 | output_path = sys.argv[1] 9 | input_paths = sys.argv[2::2] 10 | labels = sys.argv[3::2] 11 | num_inputs = len(input_paths) 12 | 13 | data = [np.genfromtxt(path, delimiter=' ') for path in input_paths] 14 | 15 | for i in range(num_inputs): 16 | exec_times = data[i][:, 1] - data[i][:, 0] 17 | print('Mean:', np.mean(exec_times)) 18 | exec_times = np.sort(exec_times) 19 | plt.plot(exec_times, np.arange(len(exec_times)) / len(exec_times), '-', label=labels[i]) 20 | 21 | plt.legend() 22 | #plt.ylim(0, 1000000) 23 | 24 | plt.savefig(output_path) 25 | 26 | 27 | -------------------------------------------------------------------------------- /tools/plot_latency_fairness_threshold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import sys 6 | import argparse 7 | 8 | fmts = ['X-', 'o-', '^-', 's-', 'D-', 'v-', 'p-', '*-', 'H-'] 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('-o', '--output_path', dest='output_path'); 14 | parser.add_argument('-i', '--input_path', dest='input_paths', action='append'); 15 | parser.add_argument('-a', '--algo_name', dest='algo_names', action='append'); 16 | parser.add_argument('-l', '--line', dest='lines', type=int, action='append'); 17 | parser.add_argument('-n', '--name', dest='names', action='append'); 18 | parser.add_argument('--yaxis', dest='yaxises', choices=['Mean', 'p50', 'p90', 'p95', 'p99'], action='append', type=str); 19 | parser.add_argument('-x', '--xlim', dest='xlim', type=float); 20 | parser.add_argument('-y', '--ylim', dest='ylim', type=float); 21 | 22 | args = parser.parse_args() 23 | 24 | yaxis_name2offset = {'Mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 25 | 26 | num_inputs = len(args.input_paths) 27 | 28 | data = [np.genfromtxt(path, delimiter=',') for path in args.input_paths] 29 | 30 | #plt.rcParams.update({'font.size': 6}) 31 | plt.figure(figsize=(6.4, 3.9552)) 32 | 33 | for i in range(num_inputs): 34 | for line, name, fmt in zip(args.lines, args.names, fmts): 35 | for yaxis in args.yaxises: 36 | plt.plot(data[i][:, 0], data[i][:, line * 7 + yaxis_name2offset[yaxis] + 3] / 1000., fmt, label=name, linewidth=1, markersize=2) 37 | 38 | plt.gca().invert_xaxis() 39 | plt.gca().set_xlim(500, 0) 40 | #plt.xlim(500, 0) 41 | plt.xlabel('Less Fair <- Fairness Threshold -> More Fair') 42 | if len(args.yaxises) == 1: 43 | plt.ylabel(args.yaxises[0] + ' Latency (ms)') 44 | else: 45 | plt.ylabel('Latency (ms)') 46 | 47 | plt.legend() 48 | #if args.xlim is not None: 49 | # plt.xlim(0, args.xlim) 50 | #if args.ylim is not None: 51 | # plt.ylim(0, args.ylim) 52 | 53 | plt.savefig(args.output_path, bbox_inches='tight') 54 | 55 | -------------------------------------------------------------------------------- /tools/plot_latency_throughput.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python3 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import sys 6 | import argparse 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument('-o', '--output_path', dest='output_path'); 12 | parser.add_argument('-i', '--input_path', dest='input_paths', action='append'); 13 | parser.add_argument('-a', '--algo_name', dest='algo_names', action='append'); 14 | parser.add_argument('-l', '--line', dest='lines', type=int, action='append'); 15 | parser.add_argument('-n', '--name', dest='names', action='append'); 16 | parser.add_argument('--xlim', dest='xlim', type=float); 17 | parser.add_argument('--ylim', dest='ylim', type=float); 18 | parser.add_argument('--xaxis', choices=['throughput', 'rate'], type=str); 19 | parser.add_argument('--yaxis', dest='yaxises', choices=['mean', 'p50', 'p90', 'p95', 'p99'], action='append', type=str); 20 | 21 | args = parser.parse_args() 22 | 23 | yaxis_name2offset = {'mean': 0, 'p50': 1, 'p90': 2, 'p95': 3, 'p99': 4} 24 | 25 | num_inputs = len(args.input_paths) 26 | 27 | data = [np.genfromtxt(path, delimiter=',') for path in args.input_paths] 28 | 29 | if args.xaxis == 'throughput': 30 | x_axis = [15000. / x[:, 1] * 1000000. for x in data] 31 | plt.xlabel('Throughput (req/s)') 32 | else: 33 | x_axis = [1000000. / x[:, 0] for x in data] 34 | plt.xlabel('Sending rate (req/s)') 35 | 36 | plt.ylabel('Latency (us)') 37 | 38 | for i in range(num_inputs): 39 | for line, name in zip(args.lines, args.names): 40 | for yaxis in args.yaxises: 41 | plt.errorbar(x_axis[i], data[i][:, line * 7 + yaxis_name2offset[yaxis] + 2], data[i][:, line * 7 + 6 + 2], label=args.algo_names[i] + ' ' + name + ' ' + yaxis, fmt='x-', linewidth=1, markersize=2, elinewidth=0) 42 | 43 | plt.legend() 44 | if args.xlim is not None: 45 | plt.xlim(0, args.xlim) 46 | if args.ylim is not None: 47 | plt.ylim(0, args.ylim) 48 | 49 | plt.savefig(args.output_path) 50 | 51 | -------------------------------------------------------------------------------- /tools/plot_resnet18_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "$(dirname "$0")" 4 | script_path=`pwd` 5 | cd - 6 | 7 | python3 $script_path/plot_latency_throughput_subplots.py \ 8 | -o resnet18_inception_v3_prop_newmix3_lns2.pdf \ 9 | -i resnet18_inception_v3_prop_direct_singlestream_newmix3_3_lns2.txt \ 10 | -a 'CUDA-SS' \ 11 | -i resnet18_inception_v3_prop_direct_multistream_newmix3_1_lns2.txt \ 12 | -a 'CUDA-MS' \ 13 | -i resnet18_inception_v3_prop_mps_newmix3_4_lns2.txt \ 14 | -a 'MPS' \ 15 | -i resnet18_inception_v3_prop_singlestream_newmix3_no_mnist_fifo_2_lns2.txt \ 16 | -a 'Paella-SS' \ 17 | -i resnet18_inception_v3_prop_newmix3_fifo_1_lns2.txt \ 18 | -a 'Paella-MS-jbj' \ 19 | -i resnet18_inception_v3_prop_newmix3_fifo2_1_lns2.txt \ 20 | -a 'Paella-MS-kbk' \ 21 | -i resnet18_inception_v3_prop_newmix3_full3_1_lns2.txt \ 22 | -a 'Paella' \ 23 | -m 0 -n All \ 24 | -m 1 -n ResNet-18 \ 25 | -m 2 -n InceptionV3 \ 26 | --xaxis throughput \ 27 | --yaxis p99 \ 28 | --subplotx 1 \ 29 | --subploty 3 \ 30 | --no-xlabel \ 31 | --legend_subplot 2 \ 32 | --ylim 1000 \ 33 | --height 1.4 34 | #--height 1.6 35 | 36 | python3 $script_path/plot_latency_throughput_subplots.py \ 37 | -o resnet18_inception_v3_prop_newmix3_lns1.5.pdf \ 38 | -i resnet18_inception_v3_prop_direct_singlestream_newmix3_3_lns1.5.txt \ 39 | -a 'CUDA-SS' \ 40 | -i resnet18_inception_v3_prop_direct_multistream_newmix3_1_lns1.5.txt \ 41 | -a 'CUDA-MS' \ 
42 | -i resnet18_inception_v3_prop_mps_newmix3_4_lns1.5.txt \ 43 | -a 'MPS' \ 44 | -i resnet18_inception_v3_prop_singlestream_newmix3_no_mnist_fifo_2_lns1.5.txt \ 45 | -a 'Paella-SS' \ 46 | -i resnet18_inception_v3_prop_newmix3_fifo_1_lns1.5.txt \ 47 | -a 'Paella-MS-jbj' \ 48 | -i resnet18_inception_v3_prop_newmix3_fifo2_1_lns1.5.txt \ 49 | -a 'Paella-MS-kbk' \ 50 | -i resnet18_inception_v3_prop_newmix3_full3_1_lns1.5.txt \ 51 | -a 'Paella' \ 52 | -m 0 -n All \ 53 | -m 1 -n ResNet-18 \ 54 | -m 2 -n InceptionV3 \ 55 | --xaxis throughput \ 56 | --yaxis p99 \ 57 | --subplotx 1 \ 58 | --subploty 3 \ 59 | --no-title \ 60 | --no-legend \ 61 | --ylim 1000 \ 62 | --height 1.4 63 | #--height 1.6 64 | 65 | -------------------------------------------------------------------------------- /tools/run_all_direct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | model_path=$2 5 | ln_sigma=$3 6 | suffix=$4 7 | 8 | cd "$(dirname "$0")"/.. 9 | abs_path="`pwd`" 10 | 11 | cd release/tests/simple 12 | 13 | echo "**** Running all-direct with ln_sigma=$ln_sigma, suffix=$suffix" 14 | 15 | for seed in {1,}; do 16 | #for i in {3000,6000,8000,10000,12000,14000,16000,18000,20000,22000,50000,100000,200000,500000}; do 17 | #for i in {3000,}; do 18 | #for i in {25000,50000,100000,200000,500000}; do 19 | #for i in {0,}; do 20 | for i in {143,154,167,182,200,222,250,286,333,400,500,667,1000,1053,1111,1176,1250,1333,1429,1538,1667,1818,2000}; do 21 | #ncu -f --set full --profile-from-start on -o "${res_dir}/all_equal_direct${suffix}.ncu" \ 22 | #nsys profile -o "${res_dir}/all_equal_direct${suffix}.nsys" \ 23 | ./test_tvm_direct_concurrent \ 24 | --iat $i \ 25 | --ln_sigma $ln_sigma \ 26 | --seed $seed \ 27 | --output_path "${res_dir}/all_equal_direct${suffix}.txt" \ 28 | --num_jobs 15000 \ 29 | --concurrency 60 \ 30 | "${model_path}/mnist-8-cuda-pack.so" 0.759 \ 31 | "${model_path}/mobilenetv2-7-cuda-pack.so" 0.0636 \ 32 | "${model_path}/densenet-9-cuda-pack.so" 0.024 \ 33 | "${model_path}/googlenet-9-cuda-pack.so" 0.00289 \ 34 | "${model_path}/inception_v3-cuda-pack.so" 0.00383 \ 35 | "${model_path}/resnet18-v2-7-cuda-pack.so" 0.0657 \ 36 | "${model_path}/resnet34-v2-7-cuda-pack.so" 0.0382 \ 37 | "${model_path}/resnet50-v2-7-cuda-pack.so" 0.0187 \ 38 | "${model_path}/squeezenet1.1-7-cuda-pack.so" 0.02408 39 | #--num_jobs 500 \ 40 | #--concurrency 15 \ 41 | #"${model_path}/mnist-8-cuda-pack.so" 0.112 \ 42 | #"${model_path}/mobilenetv2-7-cuda-pack.so" 0.111 \ 43 | #"${model_path}/densenet-9-cuda-pack.so" 0.111 \ 44 | #"${model_path}/googlenet-9-cuda-pack.so" 0.111 \ 45 | #"${model_path}/inception_v3-cuda-pack.so" 0.111 \ 46 | #"${model_path}/resnet18-v2-7-cuda-pack.so" 0.111 \ 47 | #"${model_path}/resnet34-v2-7-cuda-pack.so" 0.111 \ 48 | #"${model_path}/resnet50-v2-7-cuda-pack.so" 0.111 \ 49 | #"${model_path}/squeezenet1.1-7-cuda-pack.so" 0.111 50 | done 51 | done 52 | 53 | -------------------------------------------------------------------------------- /tools/run_fairness_dummy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 
8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | for seed in {1,}; do 17 | for i in {0,}; do 18 | #for f in {0,1,2,3,4,5,10,15,20,1000000}; do 19 | #for f in {100,1000,10000}; do 20 | for f in {2000,4000,6000,8000}; do 21 | taskset -c 4 ./server server $f 1 & 22 | SERVER_PID=$! 23 | sleep 5 24 | 25 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 26 | --server_name server \ 27 | --iat $i \ 28 | --ln_sigma $ln_sigma \ 29 | --concurrency 100 \ 30 | --num_jobs 3000 \ 31 | --start_record_num 0 \ 32 | --seed $seed \ 33 | --prefix "${res_dir}/dummy_fairness${suffix}" \ 34 | --fairness $f \ 35 | --iat_n \ 36 | --ln_sigma_n \ 37 | --fairness_n \ 38 | --fairness_g \ 39 | "${abs_path}/release/jobs/dummy_short/libjob_dummy_short.so" 0.7 50 \ 40 | "${abs_path}/release/jobs/dummy_long/libjob_dummy_long.so" 0.3 50 41 | wait 42 | done 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /tools/run_fairness_mnist_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | for seed in {1,}; do 17 | #for i in {8000,}; do 18 | for i in {0,}; do 19 | #for f in {0.003,0.03,0.3,3,10,15,20,25,30,60,300}; do 20 | #for f in {0.00003,0.0003}; do 21 | #for f in {0.03,0.3,3,30,300,3000,30000}; do 22 | #for f in {0.03,0.3,3,5,10,15,20,25,30,300,3000,10000,15000,20000,30000}; do 23 | for f in {0.03,30000,}; do 24 | #for f in {300,3000,30000}; do 25 | #for f in {10000,15000,20000}; do 26 | #for f in {0.03,}; do 27 | taskset -c 4 ./server server $f 1 & 28 | SERVER_PID=$! 29 | sleep 5 30 | 31 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 100 3000 0 mnist_inception_v3_0.7_0.3_i${i}_fairness${suffix}.txt tmp2.txt mnist_inception_v3_0.7_0.3_fair${f}${suffix}_profile_$i mnist_inception_v3_0.7_0.3_fair${f}${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.7 50 "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.3 50 32 | wait 33 | done 34 | done 35 | done 36 | -------------------------------------------------------------------------------- /tools/run_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | for seed in {1,}; do 17 | #for i in {2000,2200,2400,2600,2800,3000,3200,3400,3600,3800,4000}; do 18 | #for i in {30000,}; do 19 | #for i in {0,}; do 20 | #for i in {30000,40000,50000,60000,80000,100000}; do 21 | for i in {120000,140000}; do 22 | taskset -c 4 ./server server 1000000 1 & 23 | SERVER_PID=$! 
24 | sleep 5 25 | 26 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 50 3000 0 inception_v3${suffix}.txt tmp2.txt inception_v3${suffix}_profile_$i inception_v3${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 1 50 27 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 28 | # --server_name server \ 29 | # --iat $i \ 30 | # --ln_sigma $ln_sigma \ 31 | # --concurrency 50 \ 32 | # --num_jobs 3000 \ 33 | # --start_record_num 0 \ 34 | # --seed $seed \ 35 | # --prefix inception_v3${suffix} \ 36 | # --fairness 1000000 \ 37 | # --iat_n \ 38 | # --iat_g \ 39 | # --ln_sigma_n \ 40 | # "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 1 50 41 | 42 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 1 3000 0 inception_v3${suffix}.txt tmp2.txt inception_v3${suffix}_profile_$i inception_v3${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 1 1 43 | 44 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 45 | --server_name server \ 46 | --iat $i \ 47 | --ln_sigma $ln_sigma \ 48 | --concurrency 1 \ 49 | --num_jobs 3000 \ 50 | --start_record_num 0 \ 51 | --seed $seed \ 52 | --prefix "${res_dir}/inception_v3${suffix}" \ 53 | --fairness 1000000 \ 54 | --iat_n \ 55 | --iat_g \ 56 | --ln_sigma_n \ 57 | --concurrency_n \ 58 | "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 1 1 59 | wait 60 | done 61 | done 62 | -------------------------------------------------------------------------------- /tools/run_mnist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 
8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running mnist with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | #for i in {2000,2200,2400,2600,2800,3000,3200,3400,3600,3800,4000}; do 20 | #for i in {200,}; do 21 | for i in {0,}; do 22 | taskset -c 4 ./server server 1000000 1 & 23 | sleep 5 24 | 25 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 50 3000 0 mnist${suffix}.txt tmp2.txt mnist${suffix}_profile_$i mnist${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 50 26 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 3 3000 0 mnist${suffix}.txt tmp2.txt mnist${suffix}_profile_$i mnist${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 3 27 | 28 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 29 | --server_name server \ 30 | --iat $i \ 31 | --ln_sigma $ln_sigma \ 32 | --start_record_num 0 \ 33 | --seed $seed \ 34 | --prefix "${res_dir}/mnist${suffix}" \ 35 | --fairness 1000000 \ 36 | --iat_n \ 37 | --iat_g \ 38 | --ln_sigma_n \ 39 | --num_jobs 15000 \ 40 | --concurrency 641 \ 41 | "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 641 42 | #--concurrency 1 \ 43 | #"${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 1 44 | wait 45 | done 46 | done 47 | -------------------------------------------------------------------------------- /tools/run_mnist_googlenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running all with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | #for i in {3000,6000,8000,10000,12000,14000,16000,18000,20000,22000,50000,100000,200000,500000}; do 20 | for i in {40000,50000,100000,250000,500000}; do 21 | taskset -c 4 ./server server 1000000 1 & 22 | SERVER_PID=$! 
23 | sleep 5 24 | 25 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 15 3000 0 all_equal${suffix}.txt tmp2.txt all_equal${suffix}_profile_$i all_equal${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.125 15 "${abs_path}/release/jobs/tvm_ultraface320/libjob_tvm_ultraface320.so" 0.125 15 "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 0.125 15 "${abs_path}/release/jobs/tvm_densenet121/libjob_tvm_densenet121.so" 0.125 15 "${abs_path}/release/jobs/tvm_resnet50/libjob_tvm_resnet50.so" 0.125 15 "${abs_path}/release/jobs/tvm_googlenet/libjob_tvm_googlenet.so" 0.125 15 "${abs_path}/release/jobs/tvm_arcfaceresnet100/libjob_tvm_arcfaceresnet100.so" 0.125 15 "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.125 15 26 | 27 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 28 | --server_name server \ 29 | --iat $i \ 30 | --ln_sigma $ln_sigma \ 31 | --concurrency 1 \ 32 | --num_jobs 1000 \ 33 | --start_record_num 0 \ 34 | --seed $seed \ 35 | --prefix "${res_dir}/mnist_googlenet_0.7_0.3${suffix}" \ 36 | --fairness 1000000 \ 37 | --iat_n \ 38 | --iat_g \ 39 | --ln_sigma_n \ 40 | "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.7 1 \ 41 | "${abs_path}/release/jobs/tvm_googlenet/libjob_tvm_googlenet.so" 0.3 1 42 | wait 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /tools/run_mnist_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd .. 8 | abs_path="`pwd`" 9 | cd - 10 | 11 | cd ../release/src/server 12 | 13 | SERVER_PID=0 14 | 15 | trap "kill $SERVER_PID; exit" INT 16 | 17 | echo "**** Running mnist_inception_v3 with ln_sigma=$ln_sigma, suffix=$suffix" 18 | 19 | for seed in {1,}; do 20 | #for i in {8000,10000,12000,14000,16000,18000,20000}; do 21 | for i in {25000,33000,50000,100000}; do 22 | taskset -c 4 ./server server 1000000 1 & 23 | SERVER_PID=$! 24 | sleep 5 25 | 26 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 50 3000 0 mnist_inception_v3_0.7_0.3${suffix}.txt tmp2.txt mnist_inception_v3_0.7_0.3${suffix}_profile_$i mnist_inception_v3_0.7_0.3${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.7 50 "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.3 50 27 | wait 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /tools/run_mnist_resnet50.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running all with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | #for i in {250,500,1000,1400,1600,1800,2000,2500,3000}; do 20 | for i in {2000,2500,3000}; do 21 | taskset -c 4 ./server server 1000000 1 & 22 | SERVER_PID=$! 
23 | sleep 5 24 | 25 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 15 3000 0 all_equal${suffix}.txt tmp2.txt all_equal${suffix}_profile_$i all_equal${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.125 15 "${abs_path}/release/jobs/tvm_ultraface320/libjob_tvm_ultraface320.so" 0.125 15 "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 0.125 15 "${abs_path}/release/jobs/tvm_densenet121/libjob_tvm_densenet121.so" 0.125 15 "${abs_path}/release/jobs/tvm_resnet50/libjob_tvm_resnet50.so" 0.125 15 "${abs_path}/release/jobs/tvm_googlenet/libjob_tvm_googlenet.so" 0.125 15 "${abs_path}/release/jobs/tvm_arcfaceresnet100/libjob_tvm_arcfaceresnet100.so" 0.125 15 "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.125 15 26 | 27 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 28 | --server_name server \ 29 | --iat $i \ 30 | --ln_sigma $ln_sigma \ 31 | --concurrency 50 \ 32 | --num_jobs 3000 \ 33 | --start_record_num 0 \ 34 | --seed $seed \ 35 | --prefix "${res_dir}/mnist_resnet50_0.7_0.3${suffix}" \ 36 | --fairness 1000000 \ 37 | --iat_n \ 38 | --iat_g \ 39 | --ln_sigma_n \ 40 | "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 0.7 50 \ 41 | "${abs_path}/release/jobs/tvm_resnet50/libjob_tvm_resnet50.so" 0.3 50 42 | wait 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /tools/run_mnist_sched_sleep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running mnist with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | for i in {0,}; do 20 | #for s in {0,10,100,1000,10000,}; do # 0,0.05,0.4,3,30 us 21 | #for s in {100000,1000000}; do # ? us 22 | for s in {0,}; do 23 | taskset -c 4 ./server server 1000000 1 $s & 24 | SERVER_PID=$! 25 | sleep 5 26 | 27 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 1 3000 0 mnist_sched_sleep${suffix}.txt tmp2.txt mnist_sched_sleep${s}${suffix}_profile_$i mnist_sched_sleep${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 1 28 | 29 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 30 | --server_name server \ 31 | --iat $i \ 32 | --ln_sigma $ln_sigma \ 33 | --concurrency 1 \ 34 | --num_jobs 3000 \ 35 | --start_record_num 0 \ 36 | --seed $seed \ 37 | --prefix "${res_dir}/mnist_sched_sleep${suffix}" \ 38 | --fairness 1000000 \ 39 | --sched_sleep $s \ 40 | --sched_sleep_n \ 41 | --sched_sleep_g \ 42 | "${abs_path}/release/jobs/tvm_mnist/libjob_tvm_mnist.so" 1 1 43 | wait 44 | done 45 | done 46 | done 47 | -------------------------------------------------------------------------------- /tools/run_mobilenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 
8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | for seed in {1,}; do 17 | #for i in {2000,2200,2400,2600,2800,3000,3200,3400,3600,3800,4000}; do 18 | for i in {0,}; do 19 | #taskset -c 4 \ 20 | # ncu -f --set full --profile-from-start off -o "${res_dir}/mobilenet${suffix}_lns${ln_sigma}_con1.ncu" \ 21 | ./server server 1000000 1 & 22 | SERVER_PID=$! 23 | sleep 5 24 | 25 | #../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i 50 3000 1000 mobilenet${suffix}.txt tmp2.txt mobilenet${suffix}_profile_$i mobilenet${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 1 50 26 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 27 | --server_name server \ 28 | --iat $i \ 29 | --ln_sigma $ln_sigma \ 30 | --concurrency 1 \ 31 | --num_jobs 3000 \ 32 | --start_record_num 0 \ 33 | --seed $seed \ 34 | --prefix "${res_dir}/mobilenet${suffix}" \ 35 | --fairness 1000000 \ 36 | --iat_n \ 37 | --iat_g \ 38 | --ln_sigma_n \ 39 | --concurrency_n \ 40 | "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 1 1 41 | wait 42 | done 43 | done 44 | -------------------------------------------------------------------------------- /tools/run_mobilenet_inception_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd "$(dirname "$0")"/.. 8 | abs_path="`pwd`" 9 | 10 | cd release/src/server 11 | 12 | SERVER_PID=0 13 | 14 | trap "kill $SERVER_PID; exit" INT 15 | 16 | echo "**** Running mobilenet and inception_v3 with ln_sigma=$ln_sigma, suffix=$suffix" 17 | 18 | for seed in {1,}; do 19 | #for i in {3000,6000,8000,10000,12000,14000,16000,18000,20000,22000,50000,100000,200000,500000}; do 20 | for i in {3000,10000,18000,22000,50000,100000,500000}; do 21 | taskset -c 4 ./server server 1000000 1 & 22 | SERVER_PID=$! 23 | sleep 5 24 | 25 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi \ 26 | --server_name server \ 27 | --iat $i \ 28 | --ln_sigma $ln_sigma \ 29 | --concurrency 1 \ 30 | --num_jobs 1000 \ 31 | --start_record_num 0 \ 32 | --seed $seed \ 33 | --prefix "${res_dir}/mobilenet_inception_v3_0.7_0.3${suffix}" \ 34 | --fairness 1000000 \ 35 | --iat_n \ 36 | --iat_g \ 37 | --ln_sigma_n \ 38 | "${abs_path}/release/jobs/tvm_mobilenet/libjob_tvm_mobilenet.so" 0.7 1 \ 39 | "${abs_path}/release/jobs/tvm_inception_v3/libjob_tvm_inception_v3.so" 0.3 1 40 | wait 41 | done 42 | done 43 | -------------------------------------------------------------------------------- /tools/run_ultraface_arcface.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | res_dir=$1 4 | ln_sigma=$2 5 | suffix=$3 6 | 7 | cd .. 8 | abs_path="`pwd`" 9 | cd - 10 | 11 | cd ../release/src/server 12 | 13 | SERVER_PID=0 14 | 15 | trap "kill $SERVER_PID; exit" INT 16 | 17 | echo "**** Running ultraface_arcface with ln_sigma=$ln_sigma, suffix=$suffix" 18 | 19 | for seed in {1,}; do 20 | #for i in {2000,2200,2400,2600,2800,3000,3200,3400,3600,3800,4000,4200,4400,4600,4800,5000,5200,5400,5600}; do 21 | #for i in {17000,20000,25000,33000,50000,100000}; do 22 | for i in {25000,50000,100000}; do 23 | taskset -c 4 ./server server 1000000 1 & 24 | SERVER_PID=$! 
25 | sleep 5 26 | 27 | ../../tests/client/test_client_concurrent_run_latencies_fixed_num_multi server $i $ln_sigma 30 3000 0 ultraface_arcface_0.7_0.3${suffix}.txt tmp2.txt ultraface_arcface_0.7_0.3${suffix}_profile_$i ultraface_arcface_0.7_0.3${suffix}_timeline${i}.txt $seed "${abs_path}/release/jobs/tvm_ultraface320/libjob_tvm_ultraface320.so" 0.7 30 "${abs_path}/release/jobs/tvm_arcfaceresnet100/libjob_tvm_arcfaceresnet100.so" 0.3 30 28 | wait 29 | done 30 | done 31 | --------------------------------------------------------------------------------
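A closing note on the metric behind tools/calculate_jains_fairness_index.py listed earlier: given per-model slowdown factors s_1, ..., s_n (each model's measured latency divided by its isolated-baseline latency), the script computes Jain's fairness index

J(s_1, \ldots, s_n) = \frac{\left( \sum_{i=1}^{n} s_i \right)^{2}}{n \sum_{i=1}^{n} s_i^{2}}, \qquad \frac{1}{n} \le J \le 1,

which equals 1 when every model is slowed down equally and tends toward 1/n as the slowdown concentrates on a single model.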