├── .gitignore
├── .gitmodules
├── 3rdparty
    └── patches
    │   ├── cutlass
    │       └── 0001-Expose-internal-GEMM-fields-for-pllm.patch
    │   ├── mscclpp
    │       ├── mscclpp_assert_fail_workaround.patch
    │       └── no_rlimit_in_docker.patch
    │   └── spdlog
    │       └── cuda12.1_fmt_error_fix.patch
├── LICENSE
├── README.md
├── datasets
    ├── .gitignore
    ├── gen.sh
    ├── traces
    │   ├── lmsys.csv
    │   ├── sharegpt.csv
    │   └── splitwise.csv
    └── utils
    │   ├── gen_const_len_req.py
    │   ├── generate_fix_trace.sh
    │   ├── generate_real_trace.sh
    │   ├── prepare_dataset.py
    │   ├── prepare_real_data.py
    │   ├── prepare_splitwise_data.py
    │   ├── prepare_synthetic_data.py
    │   ├── preprocess_lmsys.py
    │   ├── preprocess_sharegpt.py
    │   └── utils.py
├── figures
    ├── NanoflowLogo.png
    ├── OfflineThroughput.png
    ├── SampleOutput.png
    ├── SystemDesign.png
    ├── async-schedule.png
    ├── feasibility.png
    ├── online-latency.png
    ├── pipeline.gif
    └── serve.png
├── gemv
    ├── .gitignore
    ├── CMakeLists.txt
    ├── include
    │   ├── attention
    │   │   ├── decode.cuh
    │   │   ├── handler.cuh
    │   │   └── prefill.cuh
    │   ├── attention_impl.cuh
    │   ├── decode_attention_decl.cuh
    │   ├── prefill_attention_decl.cuh
    │   └── small_blk_utils.cuh
    ├── python
    │   ├── generate_batch_paged_decode_inst.py
    │   ├── generate_batch_paged_prefill_inst.py
    │   └── literal_map.py
    └── src
    │   ├── bench_batch_decode.cu
    │   ├── bench_batch_prefill.cu
    │   ├── cpu_reference.h
    │   ├── test_batch_decode.cu
    │   ├── test_batch_prefill.cu
    │   └── utils.h
├── groundtruth
    ├── detokenize.py
    ├── test.py
    └── tokenize1.py
├── installAnaconda.sh
├── modelDownload.sh
├── perf.sh
├── pipeline
    ├── CMakeLists.txt
    ├── config_all
    │   ├── .gitignore
    │   ├── llama2-70B
    │   │   ├── 1024.json
    │   │   ├── 2048.json
    │   │   ├── 768.json
    │   │   ├── correct_40G
    │   │   │   ├── 2048.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── fewer_layers
    │   │   │   ├── 1024.json
    │   │   │   ├── 2048.json
    │   │   │   ├── 768.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── nanobatch-only.json
    │   │   ├── non-overlap.json
    │   │   └── pllm-offload.json
    │   ├── llama3-70B
    │   │   ├── 1024.json
    │   │   ├── 2048.json
    │   │   ├── 768.json
    │   │   ├── correct_40G
    │   │   │   ├── 2048.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── fewer_layers
    │   │   │   ├── 1024.json
    │   │   │   ├── 2048.json
    │   │   │   ├── 768.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── nanobatch-only.json
    │   │   └── non-overlap.json
    │   ├── llama3-8B
    │   │   ├── 1024.json
    │   │   ├── correct_40G
    │   │   │   ├── 1024.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── fewer_layers
    │   │   │   ├── 1024.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── nanobatch-only.json
    │   │   └── non-overlap.json
    │   ├── llama3.1-70B
    │   │   ├── 1024.json
    │   │   ├── 2048.json
    │   │   ├── 768.json
    │   │   ├── correct_40G
    │   │   │   ├── 2048.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── fewer_layers
    │   │   │   ├── 1024.json
    │   │   │   ├── 2048.json
    │   │   │   ├── 768.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── nanobatch-only.json
    │   │   └── non-overlap.json
    │   ├── llama3.1-8B
    │   │   ├── 1024.json
    │   │   ├── correct_40G
    │   │   │   ├── 1024.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── fewer_layers
    │   │   │   ├── 1024.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── nanobatch-only.json
    │   │   └── non-overlap.json
    │   ├── mixtral-8-7B
    │   │   ├── 1024.json
    │   │   ├── correct_40G
    │   │   │   ├── 1024.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── fewer_layers
    │   │   │   ├── 1024.json
    │   │   │   ├── nanobatch-only.json
    │   │   │   └── non-overlap.json
    │   │   ├── nanobatch-only.json
    │   │   └── non-overlap.json
    │   └── qwen2-72B
    │   │   ├── 2048.json
    │   │   ├── correct_40G
    │   │       ├── 2048.json
    │   │       ├── nanobatch-only.json
    │   │       └── non-overlap.json
    │   │   ├── fewer_layers
    │   │       ├── 2048.json
    │   │       ├── nanobatch-only.json
    │   │       └── non-overlap.json
    │   │   ├── nanobatch-only.json
    │   │   └── non-overlap.json
    ├── eval
    │   ├── .gitignore
    │   ├── baseline_data.py
    │   ├── calculate_speedup.py
    │   ├── clean_all.sh
    │   ├── eval-ablation
    │   │   ├── clean.sh
    │   │   └── run.py
    │   ├── eval-fix-offline
    │   │   ├── clean.sh
    │   │   └── run.py
    │   ├── eval-real-offline
    │   │   ├── clean.sh
    │   │   └── run.py
    │   ├── eval-real-online-1024
    │   │   ├── clean.sh
    │   │   └── run.py
    │   ├── eval-real-online-2048
    │   │   ├── auto_datapoint.py
    │   │   ├── clean.sh
    │   │   └── run.py
    │   ├── eval-real-online-768
    │   │   ├── clean.sh
    │   │   └── run.py
    │   ├── eval.sh
    │   ├── eval_output_example.py
    │   ├── merge_results.py
    │   ├── plot_all.py
    │   ├── run_all.sh
    │   ├── run_offline.sh
    │   └── summarize.py
    ├── include
    │   ├── allocManager.cuh
    │   ├── comm.h
    │   ├── computeBound.cuh
    │   ├── config.h
    │   ├── cutlassGemmWrapper.cuh
    │   ├── cutlassGemmWrapperImpl.cuh
    │   ├── dualWrapper.cuh
    │   ├── eventManager.cuh
    │   ├── gemmFactory.cuh
    │   ├── gemmShape.cuh
    │   ├── gemvConfig.cuh
    │   ├── gemvDependency.cuh
    │   ├── gemvWrapper.cuh
    │   ├── helper.h
    │   ├── netWrapper.cuh
    │   ├── networkManager.cuh
    │   ├── offloadKernel.cuh
    │   ├── operatorWrapper.cuh
    │   ├── otherWrapper.cuh
    │   ├── pipeline.h
    │   ├── rms_norm.cuh
    │   ├── sleep.cuh
    │   ├── small_cuda_operator.cuh
    │   ├── tensor.cuh
    │   ├── tensorLogger.cuh
    │   ├── tensorManager.cuh
    │   └── vortexData.cuh
    ├── perf
    │   └── test.py
    ├── src
    │   ├── comm.cu
    │   ├── comm_test.cu
    │   ├── computeBound.cu
    │   ├── computeMain.cu
    │   ├── gemvDependency.cu
    │   ├── generate-gemm
    │   │   ├── .gitignore
    │   │   ├── Makefile
    │   │   ├── gemmFactory.in
    │   │   └── genGEMM.py
    │   ├── load_config.cu
    │   ├── networkManager.cu
    │   ├── offloadKernel.cu
    │   ├── pipeline.cu
    │   ├── pipeline_local.cu
    │   ├── pipeline_nonoverlap.cu
    │   ├── pipeline_nonoverlap_local.cu
    │   ├── pipeline_nonoverlap_nanobatch.cu
    │   ├── pybind.cu
    │   ├── pythonProfiling.py
    │   ├── run.sh
    │   ├── sleep.cu
    │   ├── small_cuda_operator.cu
    │   ├── tensorLogger.cu
    │   ├── test.py
    │   ├── test_dual.cu
    │   └── vortexData.cu
    └── utils
    │   ├── .gitignore
    │   ├── frontend.py
    │   ├── gen_req.py
    │   ├── kv_cache.py
    │   ├── listToCSV.py
    │   ├── plotSchedule.py
    │   ├── plot_trend.py
    │   ├── pybindUtil.py
    │   ├── request_info.py
    │   ├── scheduler.py
    │   ├── serve.py
    │   ├── serve_8B.py
    │   ├── serve_8B_3_1.py
    │   ├── weightLoader.py
    │   ├── weightSaver.py
    │   └── weightSaver_3_1.py
├── serve.sh
└── setup.sh


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.i
 2 | *.ii
 3 | *.gpu
 4 | *.ptx
 5 | *.cubin
 6 | *.fatbin
 7 | *.out
 8 | *.txt
 9 | !CMakeLists.txt
10 | *.csv
11 | /temp
12 | *.a
13 | *.o
14 | *.nsys-rep
15 | *.sqlite
16 | /cutlassProfile/tools
17 | *.sass
18 | cutlass_profiler
19 | build
20 | .vscode
21 | *.pdf
22 | *.png
23 | __pycache__
24 | *.prof
25 | new-small-gemv/src/generated/
26 | pipeline/src/generated/
27 | /detailedGemmKernelPerf/CMakeFiles/
28 | *.json
29 | *.json.gz
30 | *.cmake
31 | *.stat
32 | trace_processor
33 | *.tar.gz
34 | !pipeline/config/*
35 | Anaconda3-2024.02-1-Linux-x86_64.sh
36 | *.log
37 | *.csv
38 | *.req_*
39 | *.schedule
40 | !figures/*


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "3rdparty/flashinfer"]
 2 | 	path = 3rdparty/flashinfer
 3 | 	url = https://github.com/happierpig/flashinfer-ai.git
 4 | 
 5 | [submodule "3rdparty/nvbench"]
 6 | 	path = 3rdparty/nvbench
 7 | 	url = https://github.com/NVIDIA/nvbench.git
 8 | [submodule "3rdparty/gtest"]
 9 | 	path = 3rdparty/gtest
10 | 	url = https://github.com/google/googletest.git
11 | [submodule "3rdparty/mscclpp"]
12 | 	path = 3rdparty/mscclpp
13 | 	url = https://github.com/microsoft/mscclpp.git
14 | [submodule "3rdparty/spdlog"]
15 | 	path = 3rdparty/spdlog
16 | 	url = https://github.com/gabime/spdlog.git
17 | [submodule "3rdparty/cutlass"]
18 | 	path = 3rdparty/cutlass
19 | 	url = https://github.com/NVIDIA/cutlass.git
20 | 


--------------------------------------------------------------------------------
/3rdparty/patches/cutlass/0001-Expose-internal-GEMM-fields-for-pllm.patch:
--------------------------------------------------------------------------------
 1 | From 550e5c626f9f86ac8100155d767f52fc8c4dc815 Mon Sep 17 00:00:00 2001
 2 | From: Alkaid <zgf574564920@gmail.com>
 3 | Date: Tue, 26 Mar 2024 14:50:39 -0700
 4 | Subject: [PATCH] Expose internal GEMM fields for pllm
 5 | 
 6 | ---
 7 |  include/cutlass/gemm/device/gemm.h | 2 --
 8 |  1 file changed, 2 deletions(-)
 9 | 
10 | diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h
11 | index f0226354..805e3d2c 100644
12 | --- a/include/cutlass/gemm/device/gemm.h
13 | +++ b/include/cutlass/gemm/device/gemm.h
14 | @@ -346,8 +346,6 @@ class Gemm {
15 |      }
16 |    };
17 |  
18 | -private:
19 | -
20 |    /// Kernel parameters object
21 |    typename GemmKernel::Params params_;
22 |  
23 | -- 
24 | 2.44.0
25 | 
26 | 


--------------------------------------------------------------------------------
/3rdparty/patches/mscclpp/mscclpp_assert_fail_workaround.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/include/mscclpp/poll_device.hpp b/include/mscclpp/poll_device.hpp
 2 | index 9ad116f..33325c8 100644
 3 | --- a/include/mscclpp/poll_device.hpp
 4 | +++ b/include/mscclpp/poll_device.hpp
 5 | @@ -14,10 +14,10 @@
 6 |  #define __assert_fail(__assertion, __file, __line, __function) ;
 7 |  #else  // !defined(NDEBUG)
 8 |  #if defined(MSCCLPP_DEVICE_HIP)
 9 | -extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
10 | +extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
11 |                                           const char *__function);
12 |  #else   // !defined(MSCCLPP_DEVICE_HIP)
13 | -extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
14 | +extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
15 |                                           const char *__function) __THROW;
16 |  #endif  // !defined(MSCCLPP_DEVICE_HIP)
17 |  #endif  // NDEBUG
18 | 


--------------------------------------------------------------------------------
/3rdparty/patches/mscclpp/no_rlimit_in_docker.patch:
--------------------------------------------------------------------------------
 1 | Runpod (i.e., Docker) does not allow changing rlimits, so the setFilesLimit() call is removed.
 2 | diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc
 3 | index c9cea10..ed3e956 100644
 4 | --- a/src/bootstrap/bootstrap.cc
 5 | +++ b/src/bootstrap/bootstrap.cc
 6 | @@ -278,7 +278,6 @@ void TcpBootstrap::Impl::bootstrapRoot() {
 7 |  
 8 |    std::memset(rankAddresses.data(), 0, sizeof(SocketAddress) * nRanks_);
 9 |    std::memset(rankAddressesRoot.data(), 0, sizeof(SocketAddress) * nRanks_);
10 | -  setFilesLimit();
11 |  
12 |    TRACE(MSCCLPP_INIT, "BEGIN");
13 |    /* Receive addresses from all ranks */
14 | 


--------------------------------------------------------------------------------
/3rdparty/patches/spdlog/cuda12.1_fmt_error_fix.patch:
--------------------------------------------------------------------------------
 1 | fmt upstream repo fixed this issue (https://github.com/fmtlib/fmt/pull/1818).
 2 | But spdlog hasn't updated fmt for years (https://github.com/gabime/spdlog/issues/1662)
 3 | Observed to impact cuda 12.1 but not 12.4
 4 | diff --git a/include/spdlog/fmt/bundled/core.h b/include/spdlog/fmt/bundled/core.h
 5 | index b51c1406..27b8c3f6 100644
 6 | --- a/include/spdlog/fmt/bundled/core.h
 7 | +++ b/include/spdlog/fmt/bundled/core.h
 8 | @@ -241,7 +241,7 @@
 9 |  #  if defined(__cpp_nontype_template_args) &&                  \
10 |        ((FMT_GCC_VERSION >= 903 && FMT_CPLUSPLUS >= 201709L) || \
11 |         __cpp_nontype_template_args >= 201911L) &&              \
12 | -      !defined(__NVCOMPILER) && !defined(__LCC__)
13 | +      !defined(__NVCOMPILER) && !defined(__LCC__) && !defined(__NVCC__)
14 |  #    define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
15 |  #  else
16 |  #    define FMT_USE_NONTYPE_TEMPLATE_ARGS 0
17 | 
18 | 


--------------------------------------------------------------------------------
/datasets/.gitignore:
--------------------------------------------------------------------------------
1 | !/traces/lmsys.csv
2 | !/traces/splitwise.csv
3 | !/traces/sharegpt.csv
4 | 


--------------------------------------------------------------------------------
/datasets/gen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Generate every trace: the fixed-length traces first, then one set of
 3 | # real-workload traces per request rate (0..50), with the rates run in parallel.
 4 | 
 5 | # The helper scripts use paths relative to utils/, so switch there regardless
 6 | # of the directory this script was started from.
 7 | cd "$(dirname "${BASH_SOURCE[0]}")/utils" || exit 1
 8 | 
 9 | # Invoke through bash so the helper does not need its executable bit set.
10 | bash ./generate_fix_trace.sh || exit 1
11 | 
12 | pids=()
13 | for rate in $(seq 0 50); do
14 |   echo "Generating traces for rate: $rate"
15 |   bash ./generate_real_trace.sh "$rate" &
16 |   pids+=("$!")
17 | done
18 | 
19 | # Wait for each job individually: a bare `wait` returns 0 even when a
20 | # background job failed, which would hide broken traces.
21 | failed=0
22 | for pid in "${pids[@]}"; do
23 |   wait "$pid" || failed=1
24 | done
25 | 
26 | if (( failed )); then
27 |   echo "Some traces failed to generate" >&2
28 |   exit 1
29 | fi
30 | 
31 | echo "All traces generated"
32 | 


--------------------------------------------------------------------------------
/datasets/utils/gen_const_len_req.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | prefill_len = int(sys.argv[1])
 4 | decode_len = int(sys.argv[2])
 5 | request_rate = int(sys.argv[3])
 6 | output_prefix = sys.argv[4]
 7 | 
 8 | if request_rate == 0:
 9 |     request_interval = 0
10 | else:
11 |     request_interval = 1 / request_rate
12 | 
13 | with open(f"{output_prefix}/{prefill_len}-{decode_len}-{request_rate}.csv", 'w') as f:
14 |     for i in range(100000):
15 |         f.write(f"{i},{prefill_len},{decode_len},{request_interval*i}\n")


--------------------------------------------------------------------------------
/datasets/utils/generate_fix_trace.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Generate the fixed-length request traces (constant input/output lengths,
 3 | # request rate 0) into ../traces/fixed.
 4 | # All paths are relative, so this script must be run from datasets/utils.
 5 | tracedir=../traces
 6 | fixTraceDir=${tracedir}/fixed
 7 | # -p creates ${tracedir} as well when it does not exist yet.
 8 | mkdir -p "${fixTraceDir}" || exit 1
 9 | 
10 | # Each entry is "<input_len> <output_len>".
11 | input_output_pairs=(
12 |   "512 512"
13 |   "1024 512"
14 |   "512 1024"
15 |   "128 1024"
16 |   "512 2"
17 | )
18 | 
19 | for pair in "${input_output_pairs[@]}"; do
20 |   read -r input_len output_len <<< "$pair"
21 |   echo "Generating trace for input_len: $input_len, output_len: $output_len"
22 |   # Stop at the first failure so callers do not continue with missing traces.
23 |   python3 gen_const_len_req.py "$input_len" "$output_len" 0 "${fixTraceDir}" || exit 1
24 | done


--------------------------------------------------------------------------------
/datasets/utils/generate_real_trace.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | tracedir=../traces
 4 | splitwise_output_dir=${tracedir}/splitwise
 5 | lmsys_output_dir=${tracedir}/lmsys
 6 | sharegpt_output_dir=${tracedir}/sharegpt
 7 | mkdir -p ${splitwise_output_dir}
 8 | mkdir -p ${lmsys_output_dir}
 9 | mkdir -p ${sharegpt_output_dir}
10 | 
11 | minute=5
12 | num_requests=15000
13 | 
14 | # get the argument as rate
15 | rate=$1
16 | 
17 | # splitwise
18 | filename="${splitwise_output_dir}/splitwise-rate-${rate}-${minute}min-reqs-${num_requests}-exp-delay.csv"
19 | safe_filename=$(echo "$filename" | sed 's/\([0-9]\)\.\([0-9]\)/\1_\2/')
20 | python3 ./prepare_dataset.py \
21 |   --output ${safe_filename} \
22 |   --request-rate ${rate} \
23 |   --time-delay-dist exponential_dist \
24 |   --tokenizer  lmsys/longchat-13b-16k\
25 |    splitwise \
26 |    --num-requests 17563 \
27 |    --trace-path ${tracedir}/splitwise.csv \
28 |    --mode length
29 | # lmsys
30 | filename="${lmsys_output_dir}/lmsys-rate-${rate}-${minute}min-reqs-${num_requests}-exp-delay.csv"
31 | safe_filename=$(echo "$filename" | sed 's/\([0-9]\)\.\([0-9]\)/\1_\2/')
32 | python3 ./prepare_dataset.py \
33 |   --output ${safe_filename} \
34 |   --request-rate ${rate} \
35 |   --time-delay-dist exponential_dist \
36 |   --tokenizer  lmsys/longchat-13b-16k\
37 |    splitwise \
38 |    --num-requests 50000 \
39 |    --trace-path ${tracedir}/lmsys.csv \
40 |    --mode length
41 | # sharegpt
42 | filename="${sharegpt_output_dir}/sharegpt-rate-${rate}-${minute}min-reqs-${num_requests}-exp-delay.csv"
43 | safe_filename=$(echo "$filename" | sed 's/\([0-9]\)\.\([0-9]\)/\1_\2/')
44 | python3 ./prepare_dataset.py \
45 |   --output ${safe_filename} \
46 |   --request-rate ${rate} \
47 |   --time-delay-dist exponential_dist \
48 |   --tokenizer  lmsys/longchat-13b-16k\
49 |    splitwise \
50 |    --num-requests 50000 \
51 |    --trace-path ${tracedir}/sharegpt.csv \
52 |    --mode length


--------------------------------------------------------------------------------
/datasets/utils/prepare_real_data.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | import click
 4 | from utils import dataset_dump, get_list_of_delays
 5 | 
 6 | 
 7 | @click.command()
 8 | @click.option("--dataset",
 9 |               required=True,
10 |               type=str,
11 |               help='Dataset path used for the test.')
12 | @click.option(
13 |     "--num-requests",
14 |     type=int,
15 |     default=None,
16 |     help=
17 |     'Number of requests to be generated. Default is dataset length. Will be capped to min(dataset, num_requests).'
18 | )
19 | @click.option(
20 |     "--op-tokens-per-word",
21 |     type=float,
22 |     default=1.3,
23 |     help=
24 |     'Specify op tokens/word ratio. Useful to have model generate exactly as many tokens as needed by the dataset.'
25 | )
26 | @click.option("--max-input-len",
27 |               type=int,
28 |               default=500000,
29 |               help='Specify max input length.')
30 | @click.pass_obj
31 | def dataset(root_args, **kwargs):
32 |     """Prepare dataset from real dataset."""
33 |     prompt_cnt = 0
34 |     input_ids = []
35 |     output_lens = []
36 |     ratio = []
37 | 
38 |     with open(kwargs['dataset'], 'r') as f:
39 |         data_dict = json.load(f)
40 | 
41 |     if kwargs['num_requests'] is None:
42 |         kwargs['num_requests'] = len(data_dict)
43 |     else:
44 |         kwargs['num_requests'] = min(kwargs['num_requests'], len(data_dict))
45 | 
46 |     for req in data_dict:
47 |         prompt = req['input'] + ' ' + req['instruction']
48 |         output = req['output']
49 |         line = root_args.tokenizer.encode(prompt)
50 |         if len(line) > kwargs['max_input_len']:
51 |             continue
52 | 
53 |         prompt_cnt += 1
54 |         if prompt_cnt > kwargs['num_requests']:
55 |             break
56 | 
57 |         input_ids.append(line)
58 |         output_lens.append(
59 |             int(len(output.split(' ')) * kwargs['op_tokens_per_word']))
60 | 
61 |         prompt_tokens = len(line)
62 |         prompt_words = len(prompt.split())
63 |         ratio.append(prompt_tokens / prompt_words)
64 | 
65 |     delays = get_list_of_delays(root_args.time_delay_dist,
66 |                                 root_args.mean_time_bet_reqs, len(input_ids),
67 |                                 root_args.random_seed)
68 | 
69 |     dataset_dump(
70 |         input_ids, output_lens, delays, {
71 |             "workload_type": "dataset",
72 |             "tokenizer": root_args.tokenizer.__class__.__name__,
73 |             "num_requests": kwargs['num_requests'],
74 |             "delay_distr": root_args.time_delay_dist,
75 |             "request_rate": root_args.request_rate
76 |         }, root_args.output)


--------------------------------------------------------------------------------
/datasets/utils/prepare_synthetic_data.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | from utils import (dataset_dump, gen_random_tokens, get_list_of_delays,
 3 |                          get_norm_dist_tokens)
 4 | 
 5 | 
 6 | @click.command()
 7 | @click.option("--num-requests",
 8 |               required=True,
 9 |               type=int,
10 |               help='Number of requests to be generated')
11 | @click.option('--input-mean',
12 |               required=True,
13 |               type=int,
14 |               help='normal dist mean for input tokens')
15 | @click.option('--input-stdev',
16 |               required=True,
17 |               type=int,
18 |               help='normal dist stdev for input tokens')
19 | @click.option('--output-mean',
20 |               required=True,
21 |               type=int,
22 |               help='normal dist mean for output tokens')
23 | @click.option('--output-stdev',
24 |               required=True,
25 |               type=int,
26 |               help='normal dist stdev for output tokens')
27 | @click.pass_obj
28 | def token_norm_dist(root_args, **kwargs):
29 |     """Prepare dataset by generating random tokens."""
30 |     input_ids = []
31 |     input_lens = []
32 |     output_lens = []
33 | 
34 |     input_lens = get_norm_dist_tokens(kwargs['input_mean'],
35 |                                       kwargs['input_stdev'],
36 |                                       kwargs['num_requests'],
37 |                                       root_args.random_seed)
38 | 
39 |     num_reqs = len(input_lens)
40 |     output_lens = get_norm_dist_tokens(kwargs['output_mean'],
41 |                                        kwargs['output_stdev'], num_reqs,
42 |                                        root_args.random_seed)
43 |     delays = get_list_of_delays(root_args.time_delay_dist,
44 |                                 root_args.mean_time_bet_reqs, num_reqs,
45 |                                 root_args.random_seed)
46 | 
47 |     input_ids = gen_random_tokens(input_lens, root_args.tokenizer,
48 |                                   root_args.random_seed)
49 | 
50 |     dataset_dump(
51 |         input_ids, output_lens, delays, {
52 |             "workload_type": "token-norm-dist",
53 |             "input_mean": kwargs['input_mean'],
54 |             "input_stdev": kwargs['input_stdev'],
55 |             "output_mean": kwargs['output_mean'],
56 |             "output_stdev": kwargs['output_stdev'],
57 |             "num_requests": kwargs['num_requests'],
58 |             "delay_distr": root_args.time_delay_dist,
59 |             "request_rate": root_args.request_rate,
60 |             "tokenize_vocabsize": root_args.tokenizer.vocab_size
61 |         }, root_args.output)


--------------------------------------------------------------------------------
/datasets/utils/preprocess_lmsys.py:
--------------------------------------------------------------------------------
 1 | from datasets import load_dataset
 2 | from transformers import AutoTokenizer
 3 | from tqdm import tqdm
 4 | 
 5 | 
 6 | f = open("lmsys.csv", "w")
 7 | f.write("TIMESTAMP,ContextTokens,GeneratedTokens\n")
 8 | 
 9 | num_samples = 100000
10 | 
11 | # If the dataset is gated/private, make sure you have run huggingface-cli login
12 | dataset = load_dataset("lmsys/lmsys-chat-1m")
13 | 
14 | # sample num_samples samples from the dataset randomly
15 | dataset = dataset.shuffle(seed=42)
16 | dataset = dataset["train"]
17 | 
18 | tokenizer = AutoTokenizer.from_pretrained("lmsys/longchat-13b-16k")
19 | 
20 | collected_samples = 0
21 | for i in tqdm(range(len(dataset))):
22 |     if len(dataset[i]["conversation"]) < 2:
23 |         continue
24 |     if dataset[i]["conversation"][0]["role"] != "user" or dataset[i]["conversation"][1]["role"] != "assistant":
25 |         continue
26 |     input_tokens = tokenizer.encode(dataset[i]["conversation"][0]['content'])
27 |     output_tokens = tokenizer.encode(dataset[i]["conversation"][1]['content'])
28 |     f.write(f"XXX,{len(input_tokens)},{len(output_tokens)}\n")
29 |     collected_samples += 1
30 |     if collected_samples >= num_samples:
31 |         break
32 | 
33 | #   ContextTokens       92.20294
34 | #   GeneratedTokens    207.39722
35 | #   dtype: float64


--------------------------------------------------------------------------------
/figures/NanoflowLogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/NanoflowLogo.png


--------------------------------------------------------------------------------
/figures/OfflineThroughput.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/OfflineThroughput.png


--------------------------------------------------------------------------------
/figures/SampleOutput.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/SampleOutput.png


--------------------------------------------------------------------------------
/figures/SystemDesign.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/SystemDesign.png


--------------------------------------------------------------------------------
/figures/async-schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/async-schedule.png


--------------------------------------------------------------------------------
/figures/feasibility.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/feasibility.png


--------------------------------------------------------------------------------
/figures/online-latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/online-latency.png


--------------------------------------------------------------------------------
/figures/pipeline.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/pipeline.gif


--------------------------------------------------------------------------------
/figures/serve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/serve.png


--------------------------------------------------------------------------------
/gemv/.gitignore:
--------------------------------------------------------------------------------
1 | /src/generated/*


--------------------------------------------------------------------------------
/gemv/include/attention_impl.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2024 by FlashInfer team.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *   http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #ifndef FLASHINFER_ATTENTION_IMPL_CUH_
17 | #define FLASHINFER_ATTENTION_IMPL_CUH_
18 | 
19 | #include "flashinfer/attention/cascade.cuh"
20 | 
21 | #include "attention/decode.cuh"
22 | #include "attention/prefill.cuh"
23 | 
24 | #endif  // FLASHINFER_ATTENTION_IMPL_CUH_
25 | 


--------------------------------------------------------------------------------
/gemv/include/small_blk_utils.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef FLASHINFER_SMALL_BLK_UTILS
 2 | #define FLASHINFER_SMALL_BLK_UTILS
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | #include <iostream>
 7 | #include <sstream>
 8 | #include <stdexcept>
 9 | #include <string>
10 | #include <vector>
11 | 
12 | namespace flashinfer
13 | {
14 | /*!
15 |  * \brief The type of launch: whether to use small blocks to run the kernel or not.
16 |  */
17 | enum class LaunchType
18 | {
19 |   // Launch with all blocks (the default flashinfer behavior).
20 |   AllBlk = 0,
21 |   // Run on a constrained number of SMs; the caller must specify how many.
22 |   SmallBlk = 1,
23 | };
24 | 
25 | /*!
26 |  * \brief Return a short human-readable name for a launch type.
27 |  * \param lType The launch type to describe.
28 |  * \return "All", "Small", or "Unknown" for any other value.
29 |  */
30 | inline std::string LaunchTypeToString(const LaunchType& lType) {
31 |   switch (lType) {
32 |     case LaunchType::AllBlk:
33 |       return "All";
34 |     case LaunchType::SmallBlk:
35 |       return "Small";
36 |     default:
37 |       return "Unknown";
38 |   }
39 | }
40 | 
41 | } // namespace flashinfer
42 | 
43 | /*!
44 |  * \brief Run the code in __VA_ARGS__ with LTYPE bound to a constexpr LaunchType
45 |  *   equal to the runtime value ltype.
46 |  * \note Throws std::invalid_argument when ltype is not a known launch type.
47 |  *   LaunchType is fully qualified because this macro is defined outside the
48 |  *   flashinfer namespace and may be expanded anywhere.
49 |  */
50 | #define DISPATCH_LAUNCH(ltype, LTYPE, ...)                                          \
51 |   switch (ltype) {                                                                  \
52 |     case ::flashinfer::LaunchType::AllBlk: {                                        \
53 |       constexpr ::flashinfer::LaunchType LTYPE = ::flashinfer::LaunchType::AllBlk;  \
54 |       __VA_ARGS__                                                                   \
55 |       break;                                                                        \
56 |     }                                                                               \
57 |     case ::flashinfer::LaunchType::SmallBlk: {                                      \
58 |       constexpr ::flashinfer::LaunchType LTYPE = ::flashinfer::LaunchType::SmallBlk; \
59 |       __VA_ARGS__                                                                   \
60 |       break;                                                                        \
61 |     }                                                                               \
62 |     default: {                                                                      \
63 |       std::ostringstream err_msg;                                                   \
64 |       err_msg << "Unsupported launch type: " << int(ltype);                         \
65 |       throw std::invalid_argument(err_msg.str());                                   \
66 |     }                                                                               \
67 |   }
68 | 
69 | #endif // FLASHINFER_SMALL_BLK_UTILS


--------------------------------------------------------------------------------
/gemv/python/generate_batch_paged_decode_inst.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) 2024 by FlashInfer team.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |   http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 | 
17 | import sys
18 | import re
19 | from literal_map import (
20 |     kv_layout_literal,
21 |     pos_encoding_mode_literal,
22 |     dtype_literal,
23 |     idtype_literal,
24 |     launchtype_literal,
25 | )
26 | from pathlib import Path
27 | 
28 | 
def get_cu_file_str(
    group_size, head_dim, kv_layout, pos_encoding_mode, dtype_in, dtype_out, idtype, ltype
):
    """Render the .cu source that explicitly instantiates one batch paged decode kernel.

    ``kv_layout`` and ``pos_encoding_mode`` are converted with ``int()`` and,
    like the ``dtype_in``/``dtype_out``/``idtype``/``ltype`` tags, looked up in
    the ``literal_map`` tables; ``group_size`` and ``head_dim`` are inserted
    into the text as given. A tag missing from its table raises ``KeyError``.

    Returns the generated file content as a string.
    """
    substitutions = {
        "kv_layout": kv_layout_literal[int(kv_layout)],
        "group_size": group_size,
        "head_dim": head_dim,
        "pos_encoding_mode": pos_encoding_mode_literal[int(pos_encoding_mode)],
        "dtype_in": dtype_literal[dtype_in],
        "dtype_out": dtype_literal[dtype_out],
        "idtype": idtype_literal[idtype],
        "ltype": launchtype_literal[ltype],
    }
    # Doubled braces are literal braces in the generated C++.
    template = """#include <attention_impl.cuh>

namespace flashinfer {{

constexpr PageStorage page_storage = PageStorage::kIndices;

template cudaError_t BatchDecodeWithPagedKVCacheDispatched<{group_size}, {head_dim}, page_storage, {kv_layout}, {pos_encoding_mode}, {dtype_in}, {dtype_out}, {idtype}, {ltype}>(
    {dtype_in}* q, {idtype}* q_offset,
    paged_kv_t<page_storage, {kv_layout}, {dtype_in}, {idtype}> paged_kv,
    kv_partition_info_t<{idtype}> kv_partition_info,
    {dtype_out}* o, {dtype_out}* tmp, float* lse,
    size_t sm_blk, float sm_scale, float rope_scale,
    float rope_theta, cudaStream_t stream);

}}
    """
    return template.format(**substitutions)
58 | 
59 | 
if __name__ == "__main__":
    # The output file name encodes every template parameter; it is parsed to
    # decide which instantiation to generate.
    pattern = (
        r"batch_paged_decode_group_([0-9]+)_head_([0-9]+)_layout_([0-9]+)_posenc_([0-9]+)_"
        r"dtypein_([a-z0-9]+)_dtypeout_([a-z0-9]+)_idtype_([a-z0-9]+)_launchtype_([a-z]+)\.cu"
    )

    if len(sys.argv) < 2:
        sys.exit("usage: {} <output .cu path>".format(sys.argv[0]))

    compiled_pattern = re.compile(pattern)
    path = Path(sys.argv[1])
    fname = path.name
    match = compiled_pattern.match(fname)
    if match is None:
        sys.exit(
            "error: file name {!r} does not match the expected pattern {!r}".format(
                fname, pattern
            )
        )

    # Render first so that a failure cannot leave behind an empty or truncated
    # output file.
    content = get_cu_file_str(*match.groups())
    with open(path, "w") as f:
        f.write(content)
72 | 


--------------------------------------------------------------------------------
/gemv/python/literal_map.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) 2024 by FlashInfer team.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |   http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 | 
# Lookup tables that translate short codes/tags into the C++ spellings that
# are inserted into generated source text.

# Integer layout code -> QKVLayout enumerator.
kv_layout_literal = dict(enumerate(("QKVLayout::kNHD", "QKVLayout::kHND")))

# Integer positional-encoding code -> PosEncodingMode enumerator.
pos_encoding_mode_literal = dict(
    enumerate(
        (
            "PosEncodingMode::kNone",
            "PosEncodingMode::kRoPELlama",
            "PosEncodingMode::kALiBi",
        )
    )
)

# Data type tag -> C++ type name.
dtype_literal = dict(
    f16="half",
    bf16="nv_bfloat16",
    e4m3="__nv_fp8_e4m3",
    e5m2="__nv_fp8_e5m2",
)

# Index type tag -> C++ integer type name.
idtype_literal = dict(
    i32="int32_t",
    u32="uint32_t",
    i64="int64_t",
    u64="uint64_t",
)

# 0/1 -> C++ boolean literal.
bool_literal = dict(enumerate(("false", "true")))

# Launch type tag -> LaunchType enumerator.
launchtype_literal = dict(
    all="LaunchType::AllBlk",
    small="LaunchType::SmallBlk",
)


--------------------------------------------------------------------------------
/groundtruth/detokenize.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | 
 4 | # Set the HF_HOME environment variable
 5 | os.environ['HF_HOME'] = '/code/hf'
 6 | from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaModel, LlamaForCausalLM
 7 | # import torch
 8 | 
 9 | 
10 | 
11 | model_name_or_path = "meta-llama/Llama-2-70b-chat-hf"
12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
13 | 
14 | 
15 | # Create a partial model with a fraction of layers, e.g., the first 12 layers
16 | 
17 | 
18 | # Prepare input
19 | input_text = "Hi, "
20 | inputs = tokenizer(input_text, return_tensors="pt")
21 | input_ids = inputs['input_ids']
22 | input_ids = input_ids
23 | 
24 | output_text = tokenizer.decode([1, 29915, 29885, 1811], skip_special_tokens=True)
25 | print(output_text)
26 | 


--------------------------------------------------------------------------------
/groundtruth/test.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | import sys
 4 | 
 5 | # Set the HF_HOME environment variable
 6 | from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaModel, LlamaForCausalLM
 7 | import torch
 8 | # import torch
 9 | 
10 | 
11 | 
12 | model_name_or_path = "meta-llama/Llama-2-70b-chat-hf"
13 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
14 | 
15 | # Load the full model
16 | model = LlamaForCausalLM.from_pretrained(model_name_or_path)
17 | 
18 | 
19 | # Create a partial model with a fraction of layers, e.g., the first 12 layers
20 | 
21 | 
22 | # Prepare input
23 | input_text = "Hi, I'm"
24 | inputs = tokenizer(input_text, return_tensors="pt")
25 | input_ids = inputs['input_ids']
26 | print(input_ids.size())
27 | 
28 | # Perform inference and decode the output
29 | output = model.generate(
30 |     input_ids,
31 |     max_new_tokens=5,
32 |     do_sample=True,        # Activate sampling
33 |     top_k=1,               # Use max sampling (greedy sampling with randomness)
34 |     temperature=1.0        # Optional: Control randomness. Higher values give more diversity.
35 | )
36 | print(output)
37 | output_text = tokenizer.decode(output[0], skip_special_tokens=True)
38 | print(output_text)
39 | print("input_ids", input_ids)
40 | 


--------------------------------------------------------------------------------
/groundtruth/tokenize1.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | 
 4 | # Set the HF_HOME environment variable
 5 | os.environ['HF_HOME'] = '/code/hf'
 6 | from transformers import  AutoTokenizer
 7 | # import torch
 8 | 
 9 | 
10 | 
11 | model_name_or_path = "meta-llama/Llama-2-70b-chat-hf"
12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
13 | 
14 | 
15 | # Create a partial model with a fraction of layers, e.g., the first 12 layers
16 | 
17 | 
18 | # Prepare input
19 | input_text = "To plan the visit to Seattle, you need to "
20 | inputs = tokenizer(input_text, return_tensors="pt")
21 | 
22 | print(inputs)


--------------------------------------------------------------------------------
/installAnaconda.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Download and run the Anaconda3 2024.02-1 installer, then make sure that
# ~/.bashrc puts /usr/lib/x86_64-linux-gnu/ at the front of LD_LIBRARY_PATH.
set -euo pipefail

readonly installer="Anaconda3-2024.02-1-Linux-x86_64.sh"

wget "https://repo.anaconda.com/archive/${installer}"
chmod +x "./${installer}"
"./${installer}"

# Single-quoted on purpose: LD_LIBRARY_PATH must be expanded each time
# ~/.bashrc is sourced, not frozen to the value it had while this script ran.
# The ${VAR:+:$VAR} form avoids a trailing ':' when the variable is unset.
# shellcheck disable=SC2016
readonly bashrc_line='export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}'

# Append only once so that repeated runs do not pile up duplicate lines.
# grep's stderr is silenced because a missing ~/.bashrc is fine here: the
# file is then created by the append below.
if ! grep -qxF -- "$bashrc_line" ~/.bashrc 2>/dev/null; then
  printf '%s\n' "$bashrc_line" >> ~/.bashrc
fi


--------------------------------------------------------------------------------
/modelDownload.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | current_dir=$(pwd)
 4 | parentdir="$(dirname "$current_dir")"
 5 | mkdir -p $parentdir/hf
 6 | 
 7 | export HF_HOME=$parentdir/hf
 8 | 
 9 | huggingface-cli login
10 | 
11 | cd groundtruth
12 | python test.py
13 | 
14 | cd ../pipeline/utils
15 | python weightSaver.py $parentdir/hf
16 | 
17 | 


--------------------------------------------------------------------------------
/perf.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run datasets/gen.sh followed by pipeline/eval/run_all.sh.
# Both paths are relative to the current directory, so run this from the
# repository root.
set -euo pipefail

# Each step runs in its own subshell so its cd cannot leak into the next one;
# errexit stops the script as soon as a cd or a step fails.
(cd datasets && ./gen.sh)
(cd pipeline/eval && ./run_all.sh)
6 | 


--------------------------------------------------------------------------------
/pipeline/config_all/.gitignore:
--------------------------------------------------------------------------------
1 | !*.json


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 80, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
40 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
42 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
44 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |             "global_batch_size": 1024,
49 |             "nanobatch_1_size": 384,
50 |             "kqv1_size": 256,
51 |             "kqv3_size": 256
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "PLLM"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/2048.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 80, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |         "global_batch_size": 2048,
49 |         "nanobatch_1_size": 640,
50 |         "kqv1_size": 256,
51 |         "kqv3_size": 768
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "PLLM"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/768.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 80, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor",
40 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
41 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
42 |             "64_64_64_32_32_64_2_5_RowMajor_RowMajor_RowMajor",
43 |             "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor",
44 |             "64_128_64_32_64_64_3_3_RowMajor_RowMajor_RowMajor",
45 |             "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |             "global_batch_size": 768,
49 |             "nanobatch_1_size": 384,
50 |             "kqv1_size": 128,
51 |             "kqv3_size": 128
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "PLLM"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/correct_40G/2048.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 80, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 400,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |         "global_batch_size": 2048,
49 |         "nanobatch_1_size": 640,
50 |         "kqv1_size": 256,
51 |         "kqv3_size": 768
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "PLLM"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/correct_40G/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 80, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 400,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |         "global_batch_size": 2048,
49 |         "nanobatch_1_size": 640,
50 |         "kqv1_size": 256,
51 |         "kqv3_size": 768
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "NANOBATCH"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/correct_40G/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 80, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 400,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 
39 | 		    "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",  
40 | 		    "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",      
41 | 		    "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
42 |             ],
43 |         "global_batch_size": 2048,
44 |         "nanobatch_1_size": 640,
45 |         "kqv1_size": 256,
46 |         "kqv3_size": 768
47 |     },
48 |     "serve_configs": {
49 |         "model": "meta-llama/Llama-2-70b-chat-hf",
50 |         "actual_gpu_num": 8,
51 |         "weight_path": "./nanoflow_weight/",
52 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
53 |         "pipeline_type": "NON_OVERLAP"
54 |     }
55 |   }
56 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/fewer_layers/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 5, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
40 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
42 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
44 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |             "global_batch_size": 1024,
49 |             "nanobatch_1_size": 384,
50 |             "kqv1_size": 256,
51 |             "kqv3_size": 256
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "PLLM"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/fewer_layers/2048.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 5, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |         "global_batch_size": 2048,
49 |         "nanobatch_1_size": 640,
50 |         "kqv1_size": 256,
51 |         "kqv3_size": 768
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "PLLM"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/fewer_layers/768.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 5, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor",
40 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
41 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
42 |             "64_64_64_32_32_64_2_5_RowMajor_RowMajor_RowMajor",
43 |             "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor",
44 |             "64_128_64_32_64_64_3_3_RowMajor_RowMajor_RowMajor",
45 |             "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |             "global_batch_size": 768,
49 |             "nanobatch_1_size": 384,
50 |             "kqv1_size": 128,
51 |             "kqv3_size": 128
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "PLLM"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/fewer_layers/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 5, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |         "global_batch_size": 2048,
49 |         "nanobatch_1_size": 640,
50 |         "kqv1_size": 256,
51 |         "kqv3_size": 768
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "NANOBATCH"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/fewer_layers/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 5, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |         "global_batch_size": 2048,
49 |         "nanobatch_1_size": 640,
50 |         "kqv1_size": 256,
51 |         "kqv3_size": 768
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "NON_OVERLAP"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 80, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |         "global_batch_size": 2048,
49 |         "nanobatch_1_size": 640,
50 |         "kqv1_size": 256,
51 |         "kqv3_size": 768
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "NANOBATCH"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 80, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |         "global_batch_size": 2048,
49 |         "nanobatch_1_size": 640,
50 |         "kqv1_size": 256,
51 |         "kqv3_size": 768
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "NON_OVERLAP"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama2-70B/pllm-offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "bos_token_id": 1,
 7 |         "eos_token_id": 2,
 8 |         "hidden_act": "silu",
 9 |         "hidden_size": 8192,
10 |         "initializer_range": 0.02,
11 |         "intermediate_size": 28672,
12 |         "max_position_embeddings": 4096,
13 |         "model_type": "llama",
14 |         "num_attention_heads": 64,
15 |         "num_hidden_layers": 80, 
16 |         "num_key_value_heads": 8,
17 |         "pretraining_tp": 1,
18 |         "rms_norm_eps": 1e-05,
19 |         "rope_theta": 10000.0,
20 |         "rope_scaling": null,
21 |         "tie_word_embeddings": false,
22 |         "torch_dtype": "float16",
23 |         "transformers_version": "4.31.0.dev0",
24 |         "use_cache": true,
25 |         "vocab_size": 32000
26 |     },
27 |     "model_configs": {
28 |         "gpu_num": 8,
29 |         "run_layer": 80,
30 |         "allocate_kv_data_batch": 1480,
31 |         "frame_page_size": 16,
32 |         "max_batch_size": 2048,
33 |         "gpu_mem": 68719476736,
34 |         "page_mem_size": 32768
35 |     },
36 |     "pipeline_configs": {
37 |         "gemm_op_tag": [
38 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
39 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
47 |             ],
48 |         "global_batch_size": 2048,
49 |         "nanobatch_1_size": 640,
50 |         "kqv1_size": 256,
51 |         "kqv3_size": 768
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Llama-2-70b-chat-hf",
55 |         "actual_gpu_num": 8,
56 |         "weight_path": "./nanoflow_weight/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080",
58 |         "pipeline_type": "PLLM_OFFLOAD"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
44 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
47 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
48 |             "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |             "global_batch_size": 1024,
51 |             "nanobatch_1_size": 384,
52 |             "kqv1_size": 256,
53 |             "kqv3_size": 256
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "PLLM"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/2048.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "PLLM"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/768.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor",
42 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
43 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
44 |             "64_64_64_32_32_64_2_5_RowMajor_RowMajor_RowMajor",
45 |             "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor",
46 |             "64_128_64_32_64_64_3_3_RowMajor_RowMajor_RowMajor",
47 |             "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor",
48 |             "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |             "global_batch_size": 768,
51 |             "nanobatch_1_size": 384,
52 |             "kqv1_size": 128,
53 |             "kqv3_size": 128
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "PLLM"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/correct_40G/2048.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 400,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "PLLM"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/correct_40G/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 400,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "NANOBATCH"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/correct_40G/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 400,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "NON_OVERLAP"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/fewer_layers/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 5,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
44 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
46 |             "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor",
47 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
48 |             "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |             "global_batch_size": 1024,
51 |             "nanobatch_1_size": 384,
52 |             "kqv1_size": 256,
53 |             "kqv3_size": 256
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "PLLM"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/fewer_layers/2048.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 5,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "PLLM"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/fewer_layers/768.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 5,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor",
42 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
43 |             "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor",
44 |             "64_64_64_32_32_64_2_5_RowMajor_RowMajor_RowMajor",
45 |             "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor",
46 |             "64_128_64_32_64_64_3_3_RowMajor_RowMajor_RowMajor",
47 |             "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor",
48 |             "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 768,
51 |         "nanobatch_1_size": 384,
52 |         "kqv1_size": 128,
53 |         "kqv3_size": 128
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "PLLM"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/fewer_layers/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 5,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "NANOBATCH"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/fewer_layers/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 5,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "NON_OVERLAP"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "NANOBATCH"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-70B/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 8192,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 28672,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |             ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "meta-llama/Meta-Llama-3-70B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_70B_3/",
59 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae",
60 |         "pipeline_type": "NON_OVERLAP"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-8B/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-8B/correct_40G/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 100,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-8B/correct_40G/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 100,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NANOBATCH_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-8B/correct_40G/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 100,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NON_OVERLAP_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-8B/fewer_layers/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 4,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 500,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-8B/fewer_layers/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 4,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 |             "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NANOBATCH_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-8B/fewer_layers/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 4,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NON_OVERLAP_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-8B/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NANOBATCH_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3-8B/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 
43 |         "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
44 |         ],
45 |         "global_batch_size": 1024,
46 |         "nanobatch_1_size": 384,
47 |         "kqv1_size": 384,
48 |         "kqv3_size": 640
49 |     },
50 |     "serve_configs": {
51 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
52 |         "actual_gpu_num": 1,
53 |         "weight_path": "./nanoflow_weight_8B/",
54 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
55 |         "pipeline_type": "NON_OVERLAP_LOCAL"
56 |     }
57 |   }
58 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3.1-8B/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": [
10 |         128001,
11 |         128008,
12 |         128009
13 |         ],
14 |         "hidden_act": "silu",
15 |         "hidden_size": 4096,
16 |         "initializer_range": 0.02,
17 |         "intermediate_size": 14336,
18 |         "max_position_embeddings": 131072,
19 |         "mlp_bias": false,
20 |         "model_type": "llama",
21 |         "num_attention_heads": 32,
22 |         "num_hidden_layers": 32,
23 |         "num_key_value_heads": 8,
24 |         "pretraining_tp": 1,
25 |         "rms_norm_eps": 1e-05,
26 |         "rope_scaling": {
27 |         "factor": 8.0,
28 |         "low_freq_factor": 1.0,
29 |         "high_freq_factor": 4.0,
30 |         "original_max_position_embeddings": 8192,
31 |         "rope_type": "llama3"
32 |         },
33 |         "rope_theta": 500000.0,
34 |         "tie_word_embeddings": false,
35 |         "torch_dtype": "bfloat16",
36 |         "transformers_version": "4.42.3",
37 |         "use_cache": true,
38 |         "vocab_size": 128256
39 |     },
40 |     "model_configs": {
41 |         "gpu_num": 1,
42 |         "run_layer": 32,
43 |         "allocate_kv_data_batch": 480,
44 |         "frame_page_size": 16,
45 |         "max_batch_size": 2048,
46 |         "gpu_mem": 68719476736,
47 |         "page_mem_size": 32768
48 |     },
49 |     "pipeline_configs": {
50 |         "gemm_op_tag": [
51 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
52 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
53 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
54 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
55 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
56 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
57 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
58 |         ],
59 |         "global_batch_size": 1024,
60 |         "nanobatch_1_size": 384,
61 |         "kqv1_size": 384,
62 |         "kqv3_size": 640
63 |     },
64 |     "serve_configs": {
65 |         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
66 |         "actual_gpu_num": 1,
67 |         "weight_path": "./nanoflow_weight_8B_3_1/",
68 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f",
69 |         "pipeline_type": "LOCAL"
70 |     }
71 |   }
72 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3.1-8B/correct_40G/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": [
10 |         128001,
11 |         128008,
12 |         128009
13 |         ],
14 |         "hidden_act": "silu",
15 |         "hidden_size": 4096,
16 |         "initializer_range": 0.02,
17 |         "intermediate_size": 14336,
18 |         "max_position_embeddings": 131072,
19 |         "mlp_bias": false,
20 |         "model_type": "llama",
21 |         "num_attention_heads": 32,
22 |         "num_hidden_layers": 32,
23 |         "num_key_value_heads": 8,
24 |         "pretraining_tp": 1,
25 |         "rms_norm_eps": 1e-05,
26 |         "rope_scaling": {
27 |         "factor": 8.0,
28 |         "low_freq_factor": 1.0,
29 |         "high_freq_factor": 4.0,
30 |         "original_max_position_embeddings": 8192,
31 |         "rope_type": "llama3"
32 |         },
33 |         "rope_theta": 500000.0,
34 |         "tie_word_embeddings": false,
35 |         "torch_dtype": "bfloat16",
36 |         "transformers_version": "4.42.3",
37 |         "use_cache": true,
38 |         "vocab_size": 128256
39 |     },
40 |     "model_configs": {
41 |         "gpu_num": 1,
42 |         "run_layer": 32,
43 |         "allocate_kv_data_batch": 100,
44 |         "frame_page_size": 16,
45 |         "max_batch_size": 2048,
46 |         "gpu_mem": 68719476736,
47 |         "page_mem_size": 32768
48 |     },
49 |     "pipeline_configs": {
50 |         "gemm_op_tag": [
51 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
52 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
53 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
54 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
55 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
56 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
57 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
58 |         ],
59 |         "global_batch_size": 1024,
60 |         "nanobatch_1_size": 384,
61 |         "kqv1_size": 384,
62 |         "kqv3_size": 640
63 |     },
64 |     "serve_configs": {
65 |         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
66 |         "actual_gpu_num": 1,
67 |         "weight_path": "./nanoflow_weight_8B_3_1/",
68 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f",
69 |         "pipeline_type": "LOCAL"
70 |     }
71 |   }
72 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3.1-8B/correct_40G/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": [
10 |         128001,
11 |         128008,
12 |         128009
13 |         ],
14 |         "hidden_act": "silu",
15 |         "hidden_size": 4096,
16 |         "initializer_range": 0.02,
17 |         "intermediate_size": 14336,
18 |         "max_position_embeddings": 131072,
19 |         "mlp_bias": false,
20 |         "model_type": "llama",
21 |         "num_attention_heads": 32,
22 |         "num_hidden_layers": 32,
23 |         "num_key_value_heads": 8,
24 |         "pretraining_tp": 1,
25 |         "rms_norm_eps": 1e-05,
26 |         "rope_scaling": {
27 |         "factor": 8.0,
28 |         "low_freq_factor": 1.0,
29 |         "high_freq_factor": 4.0,
30 |         "original_max_position_embeddings": 8192,
31 |         "rope_type": "llama3"
32 |         },
33 |         "rope_theta": 500000.0,
34 |         "tie_word_embeddings": false,
35 |         "torch_dtype": "bfloat16",
36 |         "transformers_version": "4.42.3",
37 |         "use_cache": true,
38 |         "vocab_size": 128256
39 |     },
40 |     "model_configs": {
41 |         "gpu_num": 1,
42 |         "run_layer": 32,
43 |         "allocate_kv_data_batch": 100,
44 |         "frame_page_size": 16,
45 |         "max_batch_size": 2048,
46 |         "gpu_mem": 68719476736,
47 |         "page_mem_size": 32768
48 |     },
49 |     "pipeline_configs": {
50 |         "gemm_op_tag": [
51 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
52 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
53 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
54 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
55 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
56 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
57 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
58 |         ],
59 |         "global_batch_size": 1024,
60 |         "nanobatch_1_size": 384,
61 |         "kqv1_size": 384,
62 |         "kqv3_size": 640
63 |     },
64 |     "serve_configs": {
65 |         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
66 |         "actual_gpu_num": 1,
67 |         "weight_path": "./nanoflow_weight_8B_3_1/",
68 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f",
69 |         "pipeline_type": "NANOBATCH_LOCAL"
70 |     }
71 |   }
72 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3.1-8B/correct_40G/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": [
10 |         128001,
11 |         128008,
12 |         128009
13 |         ],
14 |         "hidden_act": "silu",
15 |         "hidden_size": 4096,
16 |         "initializer_range": 0.02,
17 |         "intermediate_size": 14336,
18 |         "max_position_embeddings": 131072,
19 |         "mlp_bias": false,
20 |         "model_type": "llama",
21 |         "num_attention_heads": 32,
22 |         "num_hidden_layers": 32,
23 |         "num_key_value_heads": 8,
24 |         "pretraining_tp": 1,
25 |         "rms_norm_eps": 1e-05,
26 |         "rope_scaling": {
27 |         "factor": 8.0,
28 |         "low_freq_factor": 1.0,
29 |         "high_freq_factor": 4.0,
30 |         "original_max_position_embeddings": 8192,
31 |         "rope_type": "llama3"
32 |         },
33 |         "rope_theta": 500000.0,
34 |         "tie_word_embeddings": false,
35 |         "torch_dtype": "bfloat16",
36 |         "transformers_version": "4.42.3",
37 |         "use_cache": true,
38 |         "vocab_size": 128256
39 |     },
40 |     "model_configs": {
41 |         "gpu_num": 1,
42 |         "run_layer": 32,
43 |         "allocate_kv_data_batch": 100,
44 |         "frame_page_size": 16,
45 |         "max_batch_size": 2048,
46 |         "gpu_mem": 68719476736,
47 |         "page_mem_size": 32768
48 |     },
49 |     "pipeline_configs": {
50 |         "gemm_op_tag": [
51 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
52 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
53 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
54 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
55 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
56 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
57 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
58 |         ],
59 |         "global_batch_size": 1024,
60 |         "nanobatch_1_size": 384,
61 |         "kqv1_size": 384,
62 |         "kqv3_size": 640
63 |     },
64 |     "serve_configs": {
65 |         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
66 |         "actual_gpu_num": 1,
67 |         "weight_path": "./nanoflow_weight_8B_3_1/",
68 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f",
69 |         "pipeline_type": "NON_OVERLAP_LOCAL"
70 |     }
71 |   }
72 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3.1-8B/fewer_layers/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": [
10 |         128001,
11 |         128008,
12 |         128009
13 |         ],
14 |         "hidden_act": "silu",
15 |         "hidden_size": 4096,
16 |         "initializer_range": 0.02,
17 |         "intermediate_size": 14336,
18 |         "max_position_embeddings": 131072,
19 |         "mlp_bias": false,
20 |         "model_type": "llama",
21 |         "num_attention_heads": 32,
22 |         "num_hidden_layers": 4,
23 |         "num_key_value_heads": 8,
24 |         "pretraining_tp": 1,
25 |         "rms_norm_eps": 1e-05,
26 |         "rope_scaling": {
27 |         "factor": 8.0,
28 |         "low_freq_factor": 1.0,
29 |         "high_freq_factor": 4.0,
30 |         "original_max_position_embeddings": 8192,
31 |         "rope_type": "llama3"
32 |         },
33 |         "rope_theta": 500000.0,
34 |         "tie_word_embeddings": false,
35 |         "torch_dtype": "bfloat16",
36 |         "transformers_version": "4.42.3",
37 |         "use_cache": true,
38 |         "vocab_size": 128256
39 |     },
40 |     "model_configs": {
41 |         "gpu_num": 1,
42 |         "run_layer": 32,
43 |         "allocate_kv_data_batch": 480,
44 |         "frame_page_size": 16,
45 |         "max_batch_size": 2048,
46 |         "gpu_mem": 68719476736,
47 |         "page_mem_size": 32768
48 |     },
49 |     "pipeline_configs": {
50 |         "gemm_op_tag": [
51 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
52 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
53 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
54 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
55 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
56 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
57 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
58 |         ],
59 |         "global_batch_size": 1024,
60 |         "nanobatch_1_size": 384,
61 |         "kqv1_size": 384,
62 |         "kqv3_size": 640
63 |     },
64 |     "serve_configs": {
65 |         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
66 |         "actual_gpu_num": 1,
67 |         "weight_path": "./nanoflow_weight_8B_3_1/",
68 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f",
69 |         "pipeline_type": "LOCAL"
70 |     }
71 |   }
72 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3.1-8B/fewer_layers/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": [
10 |         128001,
11 |         128008,
12 |         128009
13 |         ],
14 |         "hidden_act": "silu",
15 |         "hidden_size": 4096,
16 |         "initializer_range": 0.02,
17 |         "intermediate_size": 14336,
18 |         "max_position_embeddings": 131072,
19 |         "mlp_bias": false,
20 |         "model_type": "llama",
21 |         "num_attention_heads": 32,
22 |         "num_hidden_layers": 4,
23 |         "num_key_value_heads": 8,
24 |         "pretraining_tp": 1,
25 |         "rms_norm_eps": 1e-05,
26 |         "rope_scaling": {
27 |         "factor": 8.0,
28 |         "low_freq_factor": 1.0,
29 |         "high_freq_factor": 4.0,
30 |         "original_max_position_embeddings": 8192,
31 |         "rope_type": "llama3"
32 |         },
33 |         "rope_theta": 500000.0,
34 |         "tie_word_embeddings": false,
35 |         "torch_dtype": "bfloat16",
36 |         "transformers_version": "4.42.3",
37 |         "use_cache": true,
38 |         "vocab_size": 128256
39 |     },
40 |     "model_configs": {
41 |         "gpu_num": 1,
42 |         "run_layer": 32,
43 |         "allocate_kv_data_batch": 480,
44 |         "frame_page_size": 16,
45 |         "max_batch_size": 2048,
46 |         "gpu_mem": 68719476736,
47 |         "page_mem_size": 32768
48 |     },
49 |     "pipeline_configs": {
50 |         "gemm_op_tag": [
51 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
52 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
53 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
54 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
55 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
56 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
57 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
58 |         ],
59 |         "global_batch_size": 1024,
60 |         "nanobatch_1_size": 384,
61 |         "kqv1_size": 384,
62 |         "kqv3_size": 640
63 |     },
64 |     "serve_configs": {
65 |         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
66 |         "actual_gpu_num": 1,
67 |         "weight_path": "./nanoflow_weight_8B_3_1/",
68 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f",
69 |         "pipeline_type": "NANOBATCH_LOCAL"
70 |     }
71 |   }
72 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3.1-8B/fewer_layers/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": [
10 |         128001,
11 |         128008,
12 |         128009
13 |         ],
14 |         "hidden_act": "silu",
15 |         "hidden_size": 4096,
16 |         "initializer_range": 0.02,
17 |         "intermediate_size": 14336,
18 |         "max_position_embeddings": 131072,
19 |         "mlp_bias": false,
20 |         "model_type": "llama",
21 |         "num_attention_heads": 32,
22 |         "num_hidden_layers": 4,
23 |         "num_key_value_heads": 8,
24 |         "pretraining_tp": 1,
25 |         "rms_norm_eps": 1e-05,
26 |         "rope_scaling": {
27 |         "factor": 8.0,
28 |         "low_freq_factor": 1.0,
29 |         "high_freq_factor": 4.0,
30 |         "original_max_position_embeddings": 8192,
31 |         "rope_type": "llama3"
32 |         },
33 |         "rope_theta": 500000.0,
34 |         "tie_word_embeddings": false,
35 |         "torch_dtype": "bfloat16",
36 |         "transformers_version": "4.42.3",
37 |         "use_cache": true,
38 |         "vocab_size": 128256
39 |     },
40 |     "model_configs": {
41 |         "gpu_num": 1,
42 |         "run_layer": 32,
43 |         "allocate_kv_data_batch": 480,
44 |         "frame_page_size": 16,
45 |         "max_batch_size": 2048,
46 |         "gpu_mem": 68719476736,
47 |         "page_mem_size": 32768
48 |     },
49 |     "pipeline_configs": {
50 |         "gemm_op_tag": [
51 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
52 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
53 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
54 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
55 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
56 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
57 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
58 |         ],
59 |         "global_batch_size": 1024,
60 |         "nanobatch_1_size": 384,
61 |         "kqv1_size": 384,
62 |         "kqv3_size": 640
63 |     },
64 |     "serve_configs": {
65 |         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
66 |         "actual_gpu_num": 1,
67 |         "weight_path": "./nanoflow_weight_8B_3_1/",
68 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f",
69 |         "pipeline_type": "NON_OVERLAP_LOCAL"
70 |     }
71 |   }
72 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3.1-8B/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": [
10 |         128001,
11 |         128008,
12 |         128009
13 |         ],
14 |         "hidden_act": "silu",
15 |         "hidden_size": 4096,
16 |         "initializer_range": 0.02,
17 |         "intermediate_size": 14336,
18 |         "max_position_embeddings": 131072,
19 |         "mlp_bias": false,
20 |         "model_type": "llama",
21 |         "num_attention_heads": 32,
22 |         "num_hidden_layers": 32,
23 |         "num_key_value_heads": 8,
24 |         "pretraining_tp": 1,
25 |         "rms_norm_eps": 1e-05,
26 |         "rope_scaling": {
27 |         "factor": 8.0,
28 |         "low_freq_factor": 1.0,
29 |         "high_freq_factor": 4.0,
30 |         "original_max_position_embeddings": 8192,
31 |         "rope_type": "llama3"
32 |         },
33 |         "rope_theta": 500000.0,
34 |         "tie_word_embeddings": false,
35 |         "torch_dtype": "bfloat16",
36 |         "transformers_version": "4.42.3",
37 |         "use_cache": true,
38 |         "vocab_size": 128256
39 |     },
40 |     "model_configs": {
41 |         "gpu_num": 1,
42 |         "run_layer": 32,
43 |         "allocate_kv_data_batch": 480,
44 |         "frame_page_size": 16,
45 |         "max_batch_size": 2048,
46 |         "gpu_mem": 68719476736,
47 |         "page_mem_size": 32768
48 |     },
49 |     "pipeline_configs": {
50 |         "gemm_op_tag": [
51 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
52 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
53 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
54 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
55 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
56 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
57 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
58 |         ],
59 |         "global_batch_size": 1024,
60 |         "nanobatch_1_size": 384,
61 |         "kqv1_size": 384,
62 |         "kqv3_size": 640
63 |     },
64 |     "serve_configs": {
65 |         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
66 |         "actual_gpu_num": 1,
67 |         "weight_path": "./nanoflow_weight_8B_3_1/",
68 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f",
69 |         "pipeline_type": "NANOBATCH_LOCAL"
70 |     }
71 |   }
72 |      


--------------------------------------------------------------------------------
/pipeline/config_all/llama3.1-8B/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |         "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": [
10 |         128001,
11 |         128008,
12 |         128009
13 |         ],
14 |         "hidden_act": "silu",
15 |         "hidden_size": 4096,
16 |         "initializer_range": 0.02,
17 |         "intermediate_size": 14336,
18 |         "max_position_embeddings": 131072,
19 |         "mlp_bias": false,
20 |         "model_type": "llama",
21 |         "num_attention_heads": 32,
22 |         "num_hidden_layers": 32,
23 |         "num_key_value_heads": 8,
24 |         "pretraining_tp": 1,
25 |         "rms_norm_eps": 1e-05,
26 |         "rope_scaling": {
27 |         "factor": 8.0,
28 |         "low_freq_factor": 1.0,
29 |         "high_freq_factor": 4.0,
30 |         "original_max_position_embeddings": 8192,
31 |         "rope_type": "llama3"
32 |         },
33 |         "rope_theta": 500000.0,
34 |         "tie_word_embeddings": false,
35 |         "torch_dtype": "bfloat16",
36 |         "transformers_version": "4.42.3",
37 |         "use_cache": true,
38 |         "vocab_size": 128256
39 |     },
40 |     "model_configs": {
41 |         "gpu_num": 1,
42 |         "run_layer": 32,
43 |         "allocate_kv_data_batch": 480,
44 |         "frame_page_size": 16,
45 |         "max_batch_size": 2048,
46 |         "gpu_mem": 68719476736,
47 |         "page_mem_size": 32768
48 |     },
49 |     "pipeline_configs": {
50 |         "gemm_op_tag": [
51 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
52 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
53 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
54 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
55 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
56 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
57 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
58 |         ],
59 |         "global_batch_size": 1024,
60 |         "nanobatch_1_size": 384,
61 |         "kqv1_size": 384,
62 |         "kqv3_size": 640
63 |     },
64 |     "serve_configs": {
65 |         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
66 |         "actual_gpu_num": 1,
67 |         "weight_path": "./nanoflow_weight_8B_3_1/",
68 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f",
69 |         "pipeline_type": "NON_OVERLAP_LOCAL"
70 |     }
71 |   }
72 |      


--------------------------------------------------------------------------------
/pipeline/config_all/mixtral-8-7B/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |           "MixtralForCausalLM"
 5 |         ],
 6 |         "attention_dropout": 0.0,
 7 |         "bos_token_id": 1,
 8 |         "eos_token_id": 2,
 9 |         "hidden_act": "silu",
10 |         "hidden_size": 4096,
11 |         "initializer_range": 0.02,
12 |         "intermediate_size": 14336,
13 |         "max_position_embeddings": 32768,
14 |         "model_type": "mixtral",
15 |         "num_attention_heads": 32,
16 |         "num_experts_per_tok": 2,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "num_local_experts": 8,
20 |         "output_router_logits": false,
21 |         "rms_norm_eps": 1e-05,
22 |         "rope_theta": 1000000.0,
23 |         "router_aux_loss_coef": 0.02,
24 |         "sliding_window": null,
25 |         "tie_word_embeddings": false,
26 |         "torch_dtype": "bfloat16",
27 |         "transformers_version": "4.36.0.dev0",
28 |         "use_cache": true,
29 |         "vocab_size": 32000
30 |       },
31 |     "model_configs": {
32 |         "gpu_num": 8,
33 |         "run_layer": 32,
34 |         "allocate_kv_data_batch": 3072,
35 |         "frame_page_size": 16,
36 |         "max_batch_size": 6144,
37 |         "gpu_mem": 68719476736,
38 |         "page_mem_size": 32768
39 |     },
40 |     "pipeline_configs": {
41 |         "gemm_op_tag": [
42 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
47 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
48 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 4096,
51 |         "nanobatch_1_size": 2048,
52 |         "kqv1_size": 1024,
53 |         "kqv3_size": 1024
54 |     },
55 |     "serve_configs": {
56 |         "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_mixtral_8-7B/",
59 |         "hf_path": "../../../hf/hub/models--mistralai--Mixtral-8x7B-Instruct-v0.1/snapshots/41bd4c9e7e4fb318ca40e721131d4933966c2cc1",
60 |         "pipeline_type": "PLLM"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/mixtral-8-7B/correct_40G/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 100,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/mixtral-8-7B/correct_40G/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 100,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NANOBATCH_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/mixtral-8-7B/correct_40G/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 100,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NON_OVERLAP_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/mixtral-8-7B/fewer_layers/1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 4,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 500,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/mixtral-8-7B/fewer_layers/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 4,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NANOBATCH_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/mixtral-8-7B/fewer_layers/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 4,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NON_OVERLAP_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/mixtral-8-7B/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |         "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
44 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
45 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
46 | 		"128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
47 |         ],
48 |         "global_batch_size": 1024,
49 |         "nanobatch_1_size": 384,
50 |         "kqv1_size": 384,
51 |         "kqv3_size": 640
52 |     },
53 |     "serve_configs": {
54 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
55 |         "actual_gpu_num": 1,
56 |         "weight_path": "./nanoflow_weight_8B/",
57 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
58 |         "pipeline_type": "NANOBATCH_LOCAL"
59 |     }
60 |   }
61 |      


--------------------------------------------------------------------------------
/pipeline/config_all/mixtral-8-7B/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "LlamaForCausalLM"
 5 |         ],
 6 |         "attention_bias": false,
 7 |         "attention_dropout": 0.0,
 8 |         "bos_token_id": 128000,
 9 |         "eos_token_id": 128009,
10 |         "hidden_act": "silu",
11 |         "hidden_size": 4096,
12 |         "initializer_range": 0.02,
13 |         "intermediate_size": 14336,
14 |         "max_position_embeddings": 8192,
15 |         "model_type": "llama",
16 |         "num_attention_heads": 32,
17 |         "num_hidden_layers": 32,
18 |         "num_key_value_heads": 8,
19 |         "pretraining_tp": 1,
20 |         "rms_norm_eps": 1e-05,
21 |         "rope_scaling": null,
22 |         "rope_theta": 500000.0,
23 |         "tie_word_embeddings": false,
24 |         "torch_dtype": "bfloat16",
25 |         "transformers_version": "4.40.0.dev0",
26 |         "use_cache": true,
27 |         "vocab_size": 128256
28 |     },
29 |     "model_configs": {
30 |         "gpu_num": 1,
31 |         "run_layer": 32,
32 |         "allocate_kv_data_batch": 480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
41 |         "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
42 |         "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 
43 |         "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor"
44 |         ],
45 |         "global_batch_size": 1024,
46 |         "nanobatch_1_size": 384,
47 |         "kqv1_size": 384,
48 |         "kqv3_size": 640
49 |     },
50 |     "serve_configs": {
51 |         "model": "meta-llama/Meta-Llama-3-8B-Instruct",
52 |         "actual_gpu_num": 1,
53 |         "weight_path": "./nanoflow_weight_8B/",
54 |         "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
55 |         "pipeline_type": "NON_OVERLAP_LOCAL"
56 |     }
57 |   }
58 |      


--------------------------------------------------------------------------------
/pipeline/config_all/qwen2-72B/2048.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |           "Qwen2ForCausalLM"
 5 |         ],
 6 |         "attention_dropout": 0.0,
 7 |         "bos_token_id": 151643,
 8 |         "eos_token_id": 151645,
 9 |         "hidden_act": "silu",
10 |         "hidden_size": 8192,
11 |         "initializer_range": 0.02,
12 |         "intermediate_size": 29568,
13 |         "max_position_embeddings": 32768,
14 |         "max_window_layers": 80,
15 |         "model_type": "qwen2",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "rms_norm_eps": 1e-06,
20 |         "rope_theta": 1000000.0,
21 |         "sliding_window": 131072,
22 |         "tie_word_embeddings": false,
23 |         "torch_dtype": "bfloat16",
24 |         "transformers_version": "4.40.1",
25 |         "use_cache": true,
26 |         "use_sliding_window": false,
27 |         "vocab_size": 152064
28 |       },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1382,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |     	"128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "Qwen/Qwen2-72B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_qwen2-72B/",
59 |         "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1",
60 |         "pipeline_type": "KQVBIAS"
61 |     }
62 |   } 
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/qwen2-72B/correct_40G/2048.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |             "Qwen2ForCausalLM"
 5 |           ],
 6 |           "attention_dropout": 0.0,
 7 |           "bos_token_id": 151643,
 8 |           "eos_token_id": 151645,
 9 |           "hidden_act": "silu",
10 |           "hidden_size": 8192,
11 |           "initializer_range": 0.02,
12 |           "intermediate_size": 29568,
13 |           "max_position_embeddings": 32768,
14 |           "max_window_layers": 80,
15 |           "model_type": "qwen2",
16 |           "num_attention_heads": 64,
17 |           "num_hidden_layers": 80,
18 |           "num_key_value_heads": 8,
19 |           "rms_norm_eps": 1e-06,
20 |           "rope_theta": 1000000.0,
21 |           "sliding_window": 131072,
22 |           "tie_word_embeddings": false,
23 |           "torch_dtype": "bfloat16",
24 |           "transformers_version": "4.40.1",
25 |           "use_cache": true,
26 |           "use_sliding_window": false,
27 |           "vocab_size": 152064
28 |       },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 200,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "Qwen/Qwen2-72B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_qwen2-72B/",
59 |         "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1",
60 |         "pipeline_type": "KQVBIAS"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/qwen2-72B/correct_40G/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |           "Qwen2ForCausalLM"
 5 |         ],
 6 |         "attention_dropout": 0.0,
 7 |         "bos_token_id": 151643,
 8 |         "eos_token_id": 151645,
 9 |         "hidden_act": "silu",
10 |         "hidden_size": 8192,
11 |         "initializer_range": 0.02,
12 |         "intermediate_size": 29568,
13 |         "max_position_embeddings": 32768,
14 |         "max_window_layers": 80,
15 |         "model_type": "qwen2",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "rms_norm_eps": 1e-06,
20 |         "rope_theta": 1000000.0,
21 |         "sliding_window": 131072,
22 |         "tie_word_embeddings": false,
23 |         "torch_dtype": "bfloat16",
24 |         "transformers_version": "4.40.1",
25 |         "use_cache": true,
26 |         "use_sliding_window": false,
27 |         "vocab_size": 152064
28 |       },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 200,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "Qwen/Qwen2-72B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_qwen2-72B/",
59 |         "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1",
60 |         "pipeline_type": "NANOBATCH_KQVBIAS"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/qwen2-72B/correct_40G/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |           "Qwen2ForCausalLM"
 5 |         ],
 6 |         "attention_dropout": 0.0,
 7 |         "bos_token_id": 151643,
 8 |         "eos_token_id": 151645,
 9 |         "hidden_act": "silu",
10 |         "hidden_size": 8192,
11 |         "initializer_range": 0.02,
12 |         "intermediate_size": 29568,
13 |         "max_position_embeddings": 32768,
14 |         "max_window_layers": 80,
15 |         "model_type": "qwen2",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "rms_norm_eps": 1e-06,
20 |         "rope_theta": 1000000.0,
21 |         "sliding_window": 131072,
22 |         "tie_word_embeddings": false,
23 |         "torch_dtype": "bfloat16",
24 |         "transformers_version": "4.40.1",
25 |         "use_cache": true,
26 |         "use_sliding_window": false,
27 |         "vocab_size": 152064
28 |       },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 200,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "Qwen/Qwen2-72B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_qwen2-72B/",
59 |         "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1",
60 |         "pipeline_type": "NONOVERLAP_KQVBIAS"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/qwen2-72B/fewer_layers/2048.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |           "Qwen2ForCausalLM"
 5 |         ],
 6 |         "attention_dropout": 0.0,
 7 |         "bos_token_id": 151643,
 8 |         "eos_token_id": 151645,
 9 |         "hidden_act": "silu",
10 |         "hidden_size": 8192,
11 |         "initializer_range": 0.02,
12 |         "intermediate_size": 29568,
13 |         "max_position_embeddings": 32768,
14 |         "max_window_layers": 80,
15 |         "model_type": "qwen2",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 5,
18 |         "num_key_value_heads": 8,
19 |         "rms_norm_eps": 1e-06,
20 |         "rope_theta": 1000000.0,
21 |         "sliding_window": 131072,
22 |         "tie_word_embeddings": false,
23 |         "torch_dtype": "bfloat16",
24 |         "transformers_version": "4.40.1",
25 |         "use_cache": true,
26 |         "use_sliding_window": false,
27 |         "vocab_size": 152064
28 |       },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "Qwen/Qwen2-72B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_qwen2-72B/",
59 |         "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1",
60 |         "pipeline_type": "KQVBIAS"
61 |     }
62 |   }
63 | 


--------------------------------------------------------------------------------
/pipeline/config_all/qwen2-72B/fewer_layers/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |           "Qwen2ForCausalLM"
 5 |         ],
 6 |         "attention_dropout": 0.0,
 7 |         "bos_token_id": 151643,
 8 |         "eos_token_id": 151645,
 9 |         "hidden_act": "silu",
10 |         "hidden_size": 8192,
11 |         "initializer_range": 0.02,
12 |         "intermediate_size": 29568,
13 |         "max_position_embeddings": 32768,
14 |         "max_window_layers": 80,
15 |         "model_type": "qwen2",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 5,
18 |         "num_key_value_heads": 8,
19 |         "rms_norm_eps": 1e-06,
20 |         "rope_theta": 1000000.0,
21 |         "sliding_window": 131072,
22 |         "tie_word_embeddings": false,
23 |         "torch_dtype": "bfloat16",
24 |         "transformers_version": "4.40.1",
25 |         "use_cache": true,
26 |         "use_sliding_window": false,
27 |         "vocab_size": 152064
28 |       },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "Qwen/Qwen2-72B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_qwen2-72B/",
59 |         "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1",
60 |         "pipeline_type": "NANOBATCH_KQVBIAS"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/qwen2-72B/fewer_layers/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |           "Qwen2ForCausalLM"
 5 |         ],
 6 |         "attention_dropout": 0.0,
 7 |         "bos_token_id": 151643,
 8 |         "eos_token_id": 151645,
 9 |         "hidden_act": "silu",
10 |         "hidden_size": 8192,
11 |         "initializer_range": 0.02,
12 |         "intermediate_size": 29568,
13 |         "max_position_embeddings": 32768,
14 |         "max_window_layers": 80,
15 |         "model_type": "qwen2",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 5,
18 |         "num_key_value_heads": 8,
19 |         "rms_norm_eps": 1e-06,
20 |         "rope_theta": 1000000.0,
21 |         "sliding_window": 131072,
22 |         "tie_word_embeddings": false,
23 |         "torch_dtype": "bfloat16",
24 |         "transformers_version": "4.40.1",
25 |         "use_cache": true,
26 |         "use_sliding_window": false,
27 |         "vocab_size": 152064
28 |       },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1480,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "Qwen/Qwen2-72B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_qwen2-72B/",
59 |         "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1",
60 |         "pipeline_type": "NONOVERLAP_KQVBIAS"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/qwen2-72B/nanobatch-only.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |           "Qwen2ForCausalLM"
 5 |         ],
 6 |         "attention_dropout": 0.0,
 7 |         "bos_token_id": 151643,
 8 |         "eos_token_id": 151645,
 9 |         "hidden_act": "silu",
10 |         "hidden_size": 8192,
11 |         "initializer_range": 0.02,
12 |         "intermediate_size": 29568,
13 |         "max_position_embeddings": 32768,
14 |         "max_window_layers": 80,
15 |         "model_type": "qwen2",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "rms_norm_eps": 1e-06,
20 |         "rope_theta": 1000000.0,
21 |         "sliding_window": 131072,
22 |         "tie_word_embeddings": false,
23 |         "torch_dtype": "bfloat16",
24 |         "transformers_version": "4.40.1",
25 |         "use_cache": true,
26 |         "use_sliding_window": false,
27 |         "vocab_size": 152064
28 |       },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1300,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "Qwen/Qwen2-72B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_qwen2-72B/",
59 |         "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1",
60 |         "pipeline_type": "NANOBATCH_KQVBIAS"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/config_all/qwen2-72B/non-overlap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model": {
 3 |         "architectures": [
 4 |           "Qwen2ForCausalLM"
 5 |         ],
 6 |         "attention_dropout": 0.0,
 7 |         "bos_token_id": 151643,
 8 |         "eos_token_id": 151645,
 9 |         "hidden_act": "silu",
10 |         "hidden_size": 8192,
11 |         "initializer_range": 0.02,
12 |         "intermediate_size": 29568,
13 |         "max_position_embeddings": 32768,
14 |         "max_window_layers": 80,
15 |         "model_type": "qwen2",
16 |         "num_attention_heads": 64,
17 |         "num_hidden_layers": 80,
18 |         "num_key_value_heads": 8,
19 |         "rms_norm_eps": 1e-06,
20 |         "rope_theta": 1000000.0,
21 |         "sliding_window": 131072,
22 |         "tie_word_embeddings": false,
23 |         "torch_dtype": "bfloat16",
24 |         "transformers_version": "4.40.1",
25 |         "use_cache": true,
26 |         "use_sliding_window": false,
27 |         "vocab_size": 152064
28 |       },
29 |     "model_configs": {
30 |         "gpu_num": 8,
31 |         "run_layer": 80,
32 |         "allocate_kv_data_batch": 1300,
33 |         "frame_page_size": 16,
34 |         "max_batch_size": 2048,
35 |         "gpu_mem": 68719476736,
36 |         "page_mem_size": 32768
37 |     },
38 |     "pipeline_configs": {
39 |         "gemm_op_tag": [
40 |             "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor",
41 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
42 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
43 |             "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",
44 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
45 |             "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",
46 |             "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",
47 |             "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",
48 |             "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"
49 |         ],
50 |         "global_batch_size": 2048,
51 |         "nanobatch_1_size": 640,
52 |         "kqv1_size": 256,
53 |         "kqv3_size": 768
54 |     },
55 |     "serve_configs": {
56 |         "model": "Qwen/Qwen2-72B-Instruct",
57 |         "actual_gpu_num": 8,
58 |         "weight_path": "./nanoflow_weight_qwen2-72B/",
59 |         "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1",
60 |         "pipeline_type": "NONOVERLAP_KQVBIAS"
61 |     }
62 |   }
63 |      


--------------------------------------------------------------------------------
/pipeline/eval/.gitignore:
--------------------------------------------------------------------------------
1 | *.ret
2 | *.log
3 | */results/*


--------------------------------------------------------------------------------
/pipeline/eval/clean_all.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Run the per-experiment clean.sh inside every eval-* directory located
 3 | # directly under the directory this script is invoked from.
 4 | 
 5 | current_dir=$(pwd)
 6 | 
 7 | # Glob instead of word-splitting `find` output so directory names that
 8 | # contain whitespace are handled correctly.
 9 | for eval_folder in ./eval-*; do
10 |     # Skip non-directories, symlinks, and the literal pattern left by an unmatched glob.
11 |     [[ -d "$eval_folder" && ! -L "$eval_folder" ]] || continue
12 |     echo "Running $eval_folder"
13 | 
14 |     # clean.sh deletes relative to the working directory, so never run it
15 |     # when entering the experiment directory failed.
16 |     cd "$eval_folder" || continue
17 |     ./clean.sh
18 |     cd "$current_dir" || exit 1
19 | done


--------------------------------------------------------------------------------
/pipeline/eval/eval-ablation/clean.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Delete everything matched by ./results/*; -f keeps an empty or missing results directory from being an error.
3 | rm -rf -- ./results/*


--------------------------------------------------------------------------------
/pipeline/eval/eval-fix-offline/clean.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Delete everything matched by ./results/*; -f keeps an empty or missing results directory from being an error.
3 | rm -rf -- ./results/*


--------------------------------------------------------------------------------
/pipeline/eval/eval-fix-offline/run.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import subprocess
 4 | import argparse
 5 | 
 6 | def main():
 7 |     # Ensure the script is called with the correct number of arguments
 8 |     arg_parser = argparse.ArgumentParser()
 9 |     arg_parser.add_argument("--trace_base", type=str, default="../../../datasets/traces", help="The base directory containing the traces.")
10 |     arg_parser.add_argument("--executor_base", type=str, default="../../utils", help="The base directory containing the executor.")  
11 |     arg_parse = arg_parser.parse_args()
12 |     
13 |     current_dir = os.getcwd()
14 |     trace_base = os.path.abspath(arg_parse.trace_base)
15 |     executor_base =  arg_parse.executor_base
16 |     result_base = os.path.join(current_dir, "results")
17 | 
18 |     fix_trace = os.path.join(trace_base, "fixed")
19 | 
20 |     # Create the result directory if it doesn't exist
21 |     os.makedirs(result_base, exist_ok=True)
22 | 
23 |     # Loop through each trace file in the fixed trace directory
24 |     for trace in os.listdir(fix_trace):
25 |         trace_path = os.path.join(fix_trace, trace)
26 |         print(f"Running offline throughput experiment trace: {trace_path}")
27 |         
28 |         base_trace_name = os.path.splitext(trace)[0]  # Get the base name without extension
29 |         parts = base_trace_name.split('-')  # Split by '-'
30 |         input_len = parts[0]
31 |         output_len = parts[1]
32 |         rate = parts[2]
33 | 
34 |         log_file = os.path.join(result_base, f"{input_len}-{output_len}-{rate}.log")
35 |         result_file = os.path.join(result_base, f"{input_len}-{output_len}-{rate}.stat.csv")
36 |         # Check if the output file already exists
37 |         if os.path.isfile(result_file):
38 |             print(f"Offline throughput experiment input_len: {input_len}, output_len: {output_len}, rate: {rate} already exists. Skipping...")
39 |             continue
40 | 
41 |         # Construct the command
42 |         command = [
43 |             "python", "serve_8B.py",
44 |             f"--config_path=../config_all/llama2-70B/2048.json",
45 |             f"--trace_path={trace_path}",
46 |             f"--output_prefix={os.path.join(result_base, f'{input_len}-{output_len}-{rate}')}",
47 |             f"--skip_cycles={2000 if int(output_len) < 10 else 10000}",
48 |             f"--empty_weight=True",
49 |             f"--run_cycles=1500"
50 |         ]
51 | 
52 |         # Execute the command and log the output
53 |         with open(log_file, "w") as log:
54 |             subprocess.run(command, cwd=executor_base, stdout=log, stderr=log)
55 | 
56 | if __name__ == "__main__":
57 |     main()
58 | 


--------------------------------------------------------------------------------
/pipeline/eval/eval-real-offline/clean.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Delete everything matched by ./results/*; -f keeps an empty or missing results directory from being an error.
3 | rm -rf -- ./results/*


--------------------------------------------------------------------------------
/pipeline/eval/eval-real-online-1024/clean.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Delete everything matched by ./results/*; -f keeps an empty or missing results directory from being an error.
3 | rm -rf -- ./results/*


--------------------------------------------------------------------------------
/pipeline/eval/eval-real-online-2048/auto_datapoint.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | def f(x):
 4 |     return x**2 +10 # Example function, replace with your actual function
 5 | 
 6 | def binary_search_f(threshold):
 7 |     low = 1
 8 |     high = 50
 9 |     f0 = f(0)
10 |     intermediate_points = []
11 | 
12 |     while low <= high:
13 |         mid = (low + high) // 2
14 |         f_mid = f(mid)
15 |         intermediate_points.append((mid, f_mid))
16 |         
17 |         if f_mid - f0 > threshold:
18 |             high = mid - 1
19 |         else:
20 |             low = mid + 1
21 | 
22 |     return low, intermediate_points
23 | 
24 | def evaluate_larger_x(starting_x, step_size, threshold):
25 |     x = starting_x
26 |     f0 = f(0)
27 |     larger_x_points = []
28 | 
29 |     while f(x) <= 4 * f0:
30 |         f_x = f(x)
31 |         larger_x_points.append((x, f_x))
32 |         x += step_size
33 | 
34 |     return larger_x_points
35 | 
36 | threshold = 10
37 | result, intermediate_points = binary_search_f(threshold)
38 | 
39 | # Calculate the step size as the range of intermediate points divided by 5
40 | step_size = (max(intermediate_points, key=lambda point: point[0])[0] - min(intermediate_points, key=lambda point: point[0])[0]) // 5
41 | 
42 | # Evaluate larger x values until f(x) > 4 * f(0)
43 | larger_x_points = evaluate_larger_x(result, step_size, 4 * f(0))
44 | 
45 | # Save intermediate points and larger x points to dataframes
46 | df_intermediate = pd.DataFrame(intermediate_points, columns=['x', 'f(x)'])
47 | df_larger_x = pd.DataFrame(larger_x_points, columns=['x', 'f(x)'])
48 | 
49 | # Save the dataframes to CSV files
50 | df_intermediate.to_csv('intermediate_points.csv', index=False)
51 | df_larger_x.to_csv('larger_x_points.csv', index=False)
52 | 
53 | # Print the results
54 | print(f"The first point where f(x) - f(0) > {threshold} is at x = {result}")
55 | print("Intermediate points:")
56 | print(df_intermediate)
57 | print("Larger x points:")
58 | print(df_larger_x)
59 | 


--------------------------------------------------------------------------------
/pipeline/eval/eval-real-online-2048/clean.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Delete everything matched by ./results/*; -f keeps an empty or missing results directory from being an error.
3 | rm -rf -- ./results/*


--------------------------------------------------------------------------------
/pipeline/eval/eval-real-online-768/clean.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Delete everything matched by ./results/*; -f keeps an empty or missing results directory from being an error.
3 | rm -rf -- ./results/*


--------------------------------------------------------------------------------
/pipeline/eval/eval_output_example.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | example_json = {
 3 |     # offline throughput with const input and output len
 4 |     "offline_throughput_const": {
 5 |         "512-512": 1036,
 6 |         "1024-512": 1055,
 7 |         "512-1024": 1043,
 8 |     },
 9 |     # offline throughput with input and output len from real trace
10 |     "offline_throughput_real_trace": {
11 |         "Splitwise": 1048,
12 |         "LMSYS-Chat": 990,
13 |         "ShareGPT": 936,
14 |     },
15 |     # online throughput
16 |     "online_throughput": {
17 |         "Splitwise": {
18 |             "request_rate": [5, 10, 15, 20],
19 |             "normalized_latency": [0.2, 0.4, 0.6, 0.8],
20 |         },
21 |         "LMSYS-Chat": {
22 |             "request_rate": [5, 10, 15, 20],
23 |             "normalized_latency": [0.2, 0.4, 0.6, 0.8],
24 |         },
25 |         "ShareGPT": {
26 |             "request_rate": [5, 10, 15, 20],
27 |             "normalized_latency": [0.2, 0.4, 0.6, 0.8],
28 |         }
29 |     }
30 | }
31 | 
32 | print(json.dumps(example_json, indent=4))


--------------------------------------------------------------------------------
/pipeline/eval/merge_results.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import matplotlib.pyplot as plt
 3 | import os
 4 | import json
 5 | import sys
 6 | 
 7 | # read first argument as resdir
 8 | resdir = sys.argv[1]
 9 | 
10 | json_res = {
11 |     # offline throughput with const input and output len
12 |     "offline_throughput_const": {
13 |         "512-512": 0,
14 |         "1024-512": 0,
15 |         "512-1024": 0,
16 |     },
17 |     # offline throughput with input and output len from real trace
18 |     "offline_throughput_real_trace": {
19 |         "splitwise": 0,
20 |         "lmsys": 0,
21 |         "sharegpt": 0,
22 |     },
23 |     # online throughput
24 |     "online_throughput": {
25 |         "splitwise": {
26 |             "request_rate": [],
27 |             "normalized_latency": [],
28 |         },
29 |         "lmsys": {
30 |             "request_rate": [],
31 |             "normalized_latency": [],
32 |         },
33 |         "sharegpt": {
34 |             "request_rate": [],
35 |             "normalized_latency": [],
36 |         }
37 |     }
38 | }
39 | 
40 | 
41 | # column format: total_time,token_per_second,token_per_second_per_gpu,total_cycle,cycle_time,average_ttft,average_tpot,average_normalize_latency
42 | 
43 | for name in ["512-512", "1024-512", "512-1024"]:
44 |     df = pd.read_csv(f"{resdir}/offline-{name}.csv")
45 |     # get token_per_second_per_gpu
46 |     json_res["offline_throughput_const"][name] = df["token_per_second_per_gpu"][0].astype(float)
47 | 
48 | for name in ["splitwise", "lmsys", "sharegpt"]:
49 |     df = pd.read_csv(f"{resdir}/offline-{name}.csv")
50 |     # get token_per_second_per_gpu
51 |     json_res["offline_throughput_real_trace"][name] = df["token_per_second_per_gpu"][0].astype(float)
52 |     
53 | for name in ["splitwise", "lmsys", "sharegpt"]:
54 |     # list all files start with online-{name}- and end with .csv
55 |     files = [f for f in os.listdir(resdir) if f.startswith(f"online-{name}-") and f.endswith(".csv")]
56 |     for file in files:
57 |         df = pd.read_csv(f"{resdir}/{file}")
58 |         request_rate = int(file.split("-")[2].split(".")[0])
59 |         json_res["online_throughput"][name]["request_rate"].append(request_rate)
60 |         json_res["online_throughput"][name]["normalized_latency"].append(df["average_normalize_latency"][0])
61 |         
62 | print(json_res)
63 | # save json_res to a file
64 | json.dump(json_res, open(f"{resdir}/pllm.json", 'w'))
65 | 


--------------------------------------------------------------------------------
/pipeline/eval/run_all.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Run the dataset generation script, then run.py in every eval-* directory
 3 | # directly under the invocation directory, then the post-processing scripts.
 4 | 
 5 | current_dir=$(pwd)
 6 | 
 7 | # Abort when the datasets directory is missing rather than run ./gen.sh
 8 | # from the wrong place.
 9 | cd ../../datasets/ || exit 1
10 | ./gen.sh
11 | cd "$current_dir" || exit 1
12 | 
13 | # get all folders start with "eval-" in the current directory; a glob keeps
14 | # directory names containing whitespace intact.
15 | for eval_folder in ./eval-*; do
16 |     # Skip non-directories, symlinks, and the literal pattern left by an unmatched glob.
17 |     [[ -d "$eval_folder" && ! -L "$eval_folder" ]] || continue
18 |     echo "Running $eval_folder"
19 | 
20 |     # Skip this experiment rather than run run.py from the wrong directory.
21 |     cd "$eval_folder" || continue
22 |     python run.py --trace_base ../../../datasets/traces  --executor_base ../../utils/
23 |     cd "$current_dir" || exit 1
24 | done
25 | 
26 | python baseline_data.py .
27 | python summarize.py
28 | python plot_all.py


--------------------------------------------------------------------------------
/pipeline/include/computeBound.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cstdint>
 3 | #include <iostream>
 4 | #include <vector>
 5 | 
 6 | 
 7 | #include "gemmShape.cuh"
 8 | #include "pipeline.h"
 9 | #include "networkManager.cuh"
10 | #include "tensorManager.cuh"
11 | 
12 | 
13 | #include "config.h"
14 | #include "vortexData.cuh"
15 | 
16 | extern std::vector<vortexOutputData> outputs;
17 | extern std::vector<int> aggregated_output;
18 | 
19 | 
20 | // Worker bound to one rank. It owns its pipeline object and the std::thread
21 | // object held in `thread` (if one has been created).
22 | class Worker {
23 | public:
24 | 	// Pipeline variants; one is passed to init() below as pipeTy.
25 | 	enum class PipelineType
26 | 	{
27 | 		PLLM,
28 | 		NONOVERLAP,
29 | 		NANOBATCH,
30 | 		PLLMOFFLOAD,
31 | 		KQVBIAS,
32 | 		NONOVERLAP_KQVBIAS,
33 | 		NANOBATCH_KQVBIAS,
34 | 		LOCAL,
35 | 		NON_OVERLAP_LOCAL,
36 | 		NANOBATCH_LOCAL,
37 | 	};
38 | 	static PipelineType PipeTy;
39 | 
40 | private:
41 | 	int rank;
42 | 	int nranks;
43 | 	int vnranks;
44 | 	vortexInitData* input;
45 | 	vortexOutputData* output;
46 | 	std::unique_ptr<PipelineBase> pipeline;
47 | 	std::unique_ptr<std::thread> thread;
48 | 	void thread_entry();
49 | 	// std::thread::join() throws on a thread that was already joined, so only
50 | 	// join while joinable; otherwise join() followed by ~Worker() would terminate.
51 | 	void join_thread() { if (thread && thread->joinable()) thread->join(); }
52 | 
53 | public:
54 | 	Worker(int rank,
55 | 		   int nranks,
56 | 		   int vnranks,
57 | 		   vortexInitData* input,
58 | 		   vortexOutputData* output)
59 | 		: rank(rank)
60 | 		, nranks(nranks)
61 | 		, vnranks(vnranks)
62 | 		, input(input)
63 | 		, output(output) { }
64 | 	void init();
65 | 	void as_thread(int core);
66 | 	void join() { join_thread(); }
67 | 	void run_pipeline();
68 | 	// Passes update_data through TensorManager::update_data_to_gpu for this rank and forwards the result to the pipeline.
69 | 	void run_update(vortexUpdateData* update_data) {
70 | 		vortexUpdateData& gpu_update_data = TensorManager::getInstance().update_data_to_gpu(*update_data, rank);
71 | 		pipeline->update(&gpu_update_data);
72 | 	}
73 | 	void run_config(vortexConfigData* config_data) {
74 | 		pipeline->config(config_data);
75 | 	}
76 | 	vortexOutputData* getOutput() {
77 | 		return output;
78 | 	}
79 | 	~Worker() { join_thread(); }
80 | };
81 | 
82 | 
83 | void run();
84 | // vnranks >= nranks. virtualized ranks will not touch any GPU resources, but will use random data to participate in collective communication buffers.
85 | void init(int nranks, int vnranks, std::vector<vortexInitData>& input, std::vector<vortexOutputData>& output, Worker::PipelineType pipeTy = Worker::PipelineType::PLLM);
86 | inline void init(int nranks, std::vector<vortexInitData>& input, std::vector<vortexOutputData>& output, Worker::PipelineType pipeTy = Worker::PipelineType::PLLM) {
87 | 	init(nranks, nranks, input, output, pipeTy);
88 | }
89 | 
90 | void update(int nranks, std::vector<vortexUpdateData>& update);
91 | void finalize();
92 | void run_async();
93 | void run_async_wait();
94 | void config(int nranks, std::vector<vortexConfigData>& config);


--------------------------------------------------------------------------------
/pipeline/include/cutlassGemmWrapper.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "cutlassGemmWrapperImpl.cuh"
4 | using ColumnMajor = cutlass::layout::ColumnMajor; // shorthand for cutlass::layout::ColumnMajor
5 | using RowMajor = cutlass::layout::RowMajor; // shorthand for cutlass::layout::RowMajor


--------------------------------------------------------------------------------
/pipeline/include/eventManager.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cuda.h>
 3 | #include "helper.h"
 4 | #include <vector>
 5 | 
 6 | // Creates NUM_EVENTS CUDA events on construction and destroys them on
 7 | // destruction. NOTE(review): presumably `events` is indexed by EVENT_NAME — confirm.
 8 | class EventManager {
 9 | public:
10 | 	enum EVENT_NAME
11 | 	{
12 | 		GEMM_TIMING_START = 0,
13 | 		GEMV_TIMING_START,
14 | 		NET_TIMING_START,
15 | 		GEMV_TIMING_END,
16 | 		GEMM_TIMING_END,
17 | 		GEMM_TIMING_JOIN,
18 | 		O1_FINISH,
19 | 		AG_O1_START,
20 | 		AG_O1_FINISH,
21 | 		O2_FINISH,
22 | 		AG_O2_FINISH,
23 | 		AG_O2_START,
24 | 		UG1_FINISH,
25 | 		D1_FINISH,
26 | 		UG2_FINISH,
27 | 		D2_FINISH,
28 | 		AG_D1_FINISH,
29 | 		KQV1_FINISH,
30 | 		KQV1_ROPE_START,
31 | 		KQV2_FINISH,
32 | 		KQV2_ROPE_START,
33 | 		KQV3_FINISH,
34 | 		KQV3_ROPE_START,
35 | 		KQV4_FINISH,
36 | 		KQV4_ROPE_START,
37 | 		GEMV1_FINISH,
38 | 		GEMV2_FINISH,
39 | 		GEMV3_FINISH,
40 | 		GEMV4_FINISH,
41 | 		AR1_FINISH,
42 | 		AR2_FINISH,
43 | 		AG1_GEMV_FINISH,
44 | 		AG2_GEMV_FINISH,
45 | 		CAPTURE_GEMM_START,
46 | 		CAPTURE_GEMV_END,
47 | 		CAPTURE_NET_END,
48 | 		LN_MODEL1_FINISH,
49 | 		LN_MODEL2_FINISH,
50 | 		LOGITS1_FINISH,
51 | 		LOGITS2_FINISH,
52 | 		AG_LOGITS1_FINISH,
53 | 		AG_LOGITS2_FINISH,
54 | 		NUM
55 | 	};
56 | 	// One more slot than there are named events, so NUM itself is a valid index.
57 | 	constexpr static int NUM_EVENTS = NUM + 1;
58 | 	std::vector<cudaEvent_t> events;
59 | 
60 | 	// Allocate the vector first, then create one CUDA event per slot.
61 | 	EventManager()
62 | 		: events(NUM_EVENTS) {
63 | 		for(cudaEvent_t& event : events) {
64 | 			CUDA_CHECK(cudaEventCreate(&event));
65 | 		}
66 | 	}
67 | 
68 | 	// Destroy every event created by the constructor.
69 | 	~EventManager() {
70 | 		for(cudaEvent_t event : events) {
71 | 			CUDA_CHECK(cudaEventDestroy(event));
72 | 		}
73 | 	}
74 | };


--------------------------------------------------------------------------------
/pipeline/include/gemmFactory.cuh:
--------------------------------------------------------------------------------
1 | #include "cutlassGemmWrapper.cuh"
2 | #include <string>
// Look up `tag` in the generated GEMM factory table and invoke the matching
// factory function. Returns nullptr when no entry is registered for `tag`.
BaseGEMMWrapper* generateGEMM(std::string tag);
4 | 


--------------------------------------------------------------------------------
/pipeline/include/gemmShape.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "cutlassGemmWrapper.cuh"
 3 | #include <string>
 4 | 
 5 | // The canonical name for cutlassGemmWrapper template parameters:
 6 | //  cta_m, cta_n, cta_k, warp_m, warp_n, warp_k, split_k, stages, A_major, B_major, O_major
 7 | // static constexpr std::array gemmConfig = {
 8 | //     "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", // O1
 9 | //     "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", // O2
10 | //     "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor",    // UG1
11 | //     "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",    // D1
12 | //     "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor",    // UG2
13 | //     "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor",    // D2
14 | //     "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",        // KQV1
15 | //     "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor",       // KQV2
16 | //     "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor",       // KQV3
17 | //     "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor",       // KQV4
18 | //     "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor",       // LOGITS1
19 | //     "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor"        // LOGITS2
20 | // };
21 | 
22 | // enum class GEMM_NAME {
23 | //     O1=0,
24 | //     O2,
25 | //     UG1,
26 | //     D1,
27 | //     UG2,
28 | //     D2,
29 | //     KQV1,
30 | //     KQV2,
31 | //     KQV3,
32 | //     KQV4,
33 | //     LOGITS1,
34 | //     LOGITS2,
35 | //     NUM
36 | // };
37 | 
38 | // constexpr int gemmNum = static_cast<int>(GEMM_NAME::NUM);
39 | // constexpr int gemvNum = 4;


--------------------------------------------------------------------------------
/pipeline/include/gemvConfig.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <config.h>
3 | 
4 | // constexpr int GEMV_BATCH_SIZE[] = {0,0,0,0};
5 | // constexpr int GEMV_BLOCK_NUM[] = {10,10,10,10};


--------------------------------------------------------------------------------
/pipeline/include/gemvDependency.cuh:
--------------------------------------------------------------------------------
 1 | #include "config.h"
 2 | 
 3 | #include <cuda_runtime.h>
 4 | 
 5 | 
// Atomically adds `batch` to *flag (despite the name, this accumulates rather than overwrites).
__global__ void setReadyKernel(int * flag, int batch);

// Atomically resets *flag to 0.
__global__ void clearReadyKernel(int * flag);

// Spins on the device until *flag >= desired_batch.
__global__ void waitReadyKernel(int * flag, int desired_batch);
11 | 
12 | class gemvDependency {
13 | public:
14 |     int * device_KQV_ready;
15 |     int * device_GEMV_ready;
16 |     // Constructor
17 |     gemvDependency() {
18 |         cudaMalloc(&device_KQV_ready, sizeof(int));
19 |         cudaMalloc(&device_GEMV_ready, sizeof(int));
20 |         cudaMemset(device_KQV_ready, 0, sizeof(int));
21 |         cudaMemset(device_GEMV_ready, 0, sizeof(int));
22 |     }
23 | 
24 |     // Destructor
25 |     ~gemvDependency() {
26 |         cudaFree(device_KQV_ready);
27 |         cudaFree(device_GEMV_ready);
28 |     }
29 | 
30 |     // Method to clear all flags
31 |     void clearAll(cudaStream_t stream) {
32 |         clearReadyKernel<<<1, 1, 0, stream>>>(device_KQV_ready);
33 |         clearReadyKernel<<<1, 1, 0, stream>>>(device_GEMV_ready);
34 |     }
35 | 
36 |     void incCounter(int* counter, int num, cudaStream_t stream) {
37 |         setReadyKernel<<<1, 1, 0, stream>>>(counter, num);
38 |     }
39 | 
40 |     // Method to block until GEMV is ready
41 |     void blockUntilGEMVReady(cudaStream_t stream, int desired_batch) {
42 |         waitReadyKernel<<<1, 1, 0, stream>>>(device_GEMV_ready, desired_batch);
43 |     }
44 | 
45 | private:
46 |     // Disallow copying and assignment
47 |     gemvDependency(const gemvDependency&) = delete;
48 |     gemvDependency& operator=(const gemvDependency&) = delete;
49 | 
50 | 
51 | };


--------------------------------------------------------------------------------
/pipeline/include/networkManager.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "comm.h"
 4 | #include "config.h"
 5 | #include "vortexData.cuh"
 6 | #include <vector>
 7 | 
 8 | class SimpleThreadSync {
 9 | 	// implement simple thread synchronization methods for c++ std::thread
10 | 	std::mutex mtx;
11 | 	std::condition_variable cv;
12 | 	int count;
13 | 	int total;
14 | 
15 | public:
16 | 	SimpleThreadSync(int total)
17 | 		: total(total)
18 | 		, count(0) { }
19 | 	void barrier() {
20 | 		std::unique_lock<std::mutex> lck(mtx);
21 | 		count++;
22 | 		if(count == total) {
23 | 			count = 0;
24 | 			cv.notify_all();
25 | 		} else {
26 | 			cv.wait(lck);
27 | 		}
28 | 	}
29 | 
30 | 	std::mutex& getMutex() {
31 | 		return mtx;
32 | 	}
33 | };
34 | 
// Operation codes stored in SharedState::op.
// NOTE(review): presumably written by the management thread and read by the workers — confirm against the users of SharedState.
enum class WorkerOp
{
	STOP,
	RUN,
	UPDATE,
	CONFIG,
};
42 | 
// State shared through the global `shared_state` object (non-MPI build).
// The two pointers are non-owning and are not initialised here.
struct SharedState {
	mscclpp::UniqueId uniqueId;
	WorkerOp op;                                   // operation currently requested
	std::vector<vortexUpdateData> * updates_ptr;   // NOTE(review): presumably the payload for WorkerOp::UPDATE — confirm
	std::vector<vortexConfigData> * config_ptr;    // NOTE(review): presumably the payload for WorkerOp::CONFIG — confirm
};
49 | 
50 | #ifdef ENABLE_MPI
// Holds the MPI rank information for this process (MPI build only).
//
// All members start in a defined "not initialised" state so finalize() is
// well defined even if init() was never called, or was compiled without
// ENABLE_NETWORK and therefore never set the flag.
class NetworkManager
{
public:
    int nranks = 0;            // number of MPI ranks, filled in by init()
    int rank = 0;              // rank of this process, filled in by init()
    bool initialized = false;  // true once init() has brought MPI up
    // Bring up MPI and record rank / nranks.
    void init(int argc, char** argv);
    // Shut MPI down if init() brought it up.
    void finalize();

};
61 | 
62 | extern std::shared_ptr<NetworkManager> netmgr;
63 | #else
64 | extern SharedState shared_state;
65 | // sync between nranks workers
66 | extern std::unique_ptr<SimpleThreadSync> worker_sync;
67 | // sync between nranks workers and the management/main thread
68 | extern std::unique_ptr<SimpleThreadSync> global_sync;
69 | #endif
70 | 
71 | 


--------------------------------------------------------------------------------
/pipeline/include/offloadKernel.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cuda.h>
 3 | #include "cutlass/cutlass.h"
 4 | #include <cuda_runtime.h>
 5 | #include "cuda_fp16.h"
 6 | #include "config.h"
 7 | 
 8 | 
// Copies KV-cache pages between the paged buffer `kv_data` and the buffer
// passed as `output` (named `host_ptr` in the definition).
// For each of the `finished_req_num` entries of `finished_index`, the pages
// listed in kv_indices[kv_indptr[idx] .. kv_indptr[idx + 1]) are copied.
// host_to_gpu == true copies from `output` into `kv_data`; false copies the other way.
// `page_mem_size` is divided by sizeof(half) in the definition, i.e. it is treated as a size in bytes.
__global__ void moveKVcacheKernel(int finished_req_num, int32_t * finished_index,
                                         int32_t* kv_indptr, int32_t* kv_indices, half* output, half* kv_data, int page_mem_size, bool host_to_gpu = true);


--------------------------------------------------------------------------------
/pipeline/include/sleep.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
// Busy-waits on the device until at least `us` microseconds have elapsed.
__global__ void cudaSleep(int us);


--------------------------------------------------------------------------------
/pipeline/src/gemvDependency.cu:
--------------------------------------------------------------------------------
 1 | #include "gemvDependency.cuh"
 2 | 
// Atomically adds `batch` to *flag. Note that this accumulates: the flag is a
// running counter, not a value that gets overwritten.
__global__ void setReadyKernel(int * flag, int batch) {
    // use atomic add 
    atomicAdd(flag, batch);
}
 7 | 
 8 | __global__ void clearReadyKernel(int * flag) {
 9 |     // set to 0
10 |     int k = atomicExch(flag, 0);
11 |     // printf("clearReadyKernel: %d\n", k);
12 | }
13 | 
14 | __global__ void waitReadyKernel(int * flag, int desired_batch) {
15 |     int t =0;
16 |     t = atomicAdd(flag, 0);
17 |     while (t < desired_batch) {
18 |         // printf("batch %d is not ready\n", t);
19 |         t = atomicAdd(flag, 0);
20 |     }
21 |     // printf("batch %d is ready\n", t);
22 | }


--------------------------------------------------------------------------------
/pipeline/src/generate-gemm/.gitignore:
--------------------------------------------------------------------------------
1 | *.gen
2 | *.cu
3 | *.cuh
4 | 


--------------------------------------------------------------------------------
/pipeline/src/generate-gemm/Makefile:
--------------------------------------------------------------------------------
# Files produced by this directory's generation step.
GENERATED := gemmFactory.cu cutlassGemmExternDeclearation.cuh
# Default target: make sure the generated sources exist and are up to date.
all: $(GENERATED)
# Regenerate when the generator script or the factory template changes.
# The recipe first removes every .cu/.cuh file in this directory, then runs
# the generator (presumably genGEMM.py rewrites all of them — confirm).
$(GENERATED): genGEMM.py gemmFactory.in
	rm -f *.cu *.cuh
	python3 genGEMM.py
6 | 


--------------------------------------------------------------------------------
/pipeline/src/generate-gemm/gemmFactory.in:
--------------------------------------------------------------------------------
 1 | #include "gemmFactory.cuh"
 2 | 
 3 | #include <unordered_map>
 4 | 
 5 | #include "cutlassGemmExternDeclearation.cuh"
 6 | 
 7 | std::unordered_map<std::string, BaseGEMMWrapper*(*)()> GEMMGenMap = {
 8 | @map_entries@
 9 | };
10 | BaseGEMMWrapper* generateGEMM(std::string tag) {
11 |   auto it = GEMMGenMap.find(tag);
12 |   if (it == GEMMGenMap.end())
13 |     return nullptr;
14 |   else
15 |     return it->second();
16 | }
17 | 


--------------------------------------------------------------------------------
/pipeline/src/networkManager.cu:
--------------------------------------------------------------------------------
 1 | #include "networkManager.cuh"
 2 | #include "config.h"
 3 | #include <iostream>
 4 | 
 5 | #ifdef ENABLE_MPI
 6 | std::shared_ptr<NetworkManager> netmgr;
 7 | 
// Initialise MPI and record this process's rank and the total rank count.
// Everything happens only when ENABLE_NETWORK is defined; without it the
// function body is empty and `initialized` is left untouched.
// The greeting is emitted twice on purpose or by accident: once through
// spdlog and once on std::cout.
void NetworkManager::init(int argc, char** argv) {
#ifdef ENABLE_NETWORK
	// Initialize the MPI environment
	MPI_Init(&argc, &argv);
	// Get the number of processes
	MPI_Comm_size(MPI_COMM_WORLD, &nranks);
	// Get the rank of the process
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	spdlog::info("Hello world from rank {} out of {} ranks", rank, nranks);

	// Print off a hello world message
	std::cout << "Hello world from rank " << rank << " out of " << nranks << " ranks" << std::endl;

	initialized = true;
#endif
}
25 | 
26 | void NetworkManager::finalize() {
27 | #ifdef ENABLE_NETWORK
28 | 	if (initialized) {
29 | 		MPI_Finalize();
30 | 	}
31 | #endif
32 | }
33 | #else // ENABLE_MPI
34 | 
35 | SharedState shared_state;
36 | std::unique_ptr<SimpleThreadSync> worker_sync;
37 | std::unique_ptr<SimpleThreadSync> global_sync;
38 | 
39 | #endif // ENABLE_MPI
40 | 


--------------------------------------------------------------------------------
/pipeline/src/offloadKernel.cu:
--------------------------------------------------------------------------------
 1 | #include "offloadKernel.cuh"
 2 | #include <cstdio>
 3 | 
 4 | __device__ void pageCopy(half* input, half* output, int page_mem_size){
 5 | 
 6 |     int copyIter = page_mem_size / sizeof(float4) / blockDim.x;
 7 |     // printf("copyIter: %d\n", copyIter);
 8 |     float4* input4 = (float4*)input;
 9 |     float4* output4 = (float4*)output;
10 | 
11 |     for (int i = 0; i < copyIter; i++){
12 |         output4[i * blockDim.x + threadIdx.x] = input4[i * blockDim.x + threadIdx.x];
13 |     }
14 | }
15 | 
16 | __global__ void moveKVcacheKernel(int finished_req_num, int32_t * finished_index, 
17 |                                        int32_t* kv_indptr, int32_t* kv_indices, half* host_ptr, half* kv_data, int page_mem_size, bool host_to_gpu){
18 |     page_mem_size /= sizeof(half);
19 |     for (int i = 0; i < finished_req_num; i++){
20 |         int idx = finished_index[i];
21 |         int start = kv_indptr[idx];
22 |         int end = kv_indptr[idx + 1];
23 | 
24 |         for (int j = start + blockIdx.x; j < end; j += gridDim.x){
25 |             int page_idx = kv_indices[j];
26 |             half* page = kv_data + page_idx * page_mem_size;
27 |             half* host_page = host_ptr + j * page_mem_size;
28 |             // printf("page_idx: %d\n", page_idx);
29 |             if (host_to_gpu)
30 |                 pageCopy(host_page, page, page_mem_size);
31 |             else
32 |                 pageCopy(page, host_page, page_mem_size);
33 |         }
34 |     }
35 | }


--------------------------------------------------------------------------------
/pipeline/src/run.sh:
--------------------------------------------------------------------------------
1 | mpirun -np 4 --allow-run-as-root sh -c "nsys profile --force-overwrite true  --stat=true -o output_\$OMPI_COMM_WORLD_RANK.prof ../build/test_compute > output_\$OMPI_COMM_WORLD_RANK.txt 2>error_\$OMPI_COMM_WORLD_RANK.txt"


--------------------------------------------------------------------------------
/pipeline/src/sleep.cu:
--------------------------------------------------------------------------------
 1 | #include <cuda.h>
 2 | #include <chrono>
 3 | #include <cuda/std/chrono>
 4 | 
 5 | __global__ void cudaSleep(int us) {
 6 |     auto start = cuda::std::chrono::high_resolution_clock::now();
 7 |     while (cuda::std::chrono::duration_cast<cuda::std::chrono::microseconds>(cuda::std::chrono::high_resolution_clock::now() - start).count() < us);
 8 |     {
 9 |        
10 |     }
11 | }


--------------------------------------------------------------------------------
/pipeline/src/tensorLogger.cu:
--------------------------------------------------------------------------------
1 | #include <string>
2 | #include <cutlass/half.h>
3 | #include "spdlog/spdlog.h"
4 | #include "tensor.cuh"
5 | #include "tensorLogger.cuh"
6 | #include <cuda_fp16.h>
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/pipeline/src/test_dual.cu:
--------------------------------------------------------------------------------
 1 | #include "dualWrapper.cuh"
 2 | #include <iostream>
 3 | #include "spdlog/sinks/basic_file_sink.h"
 4 | #include <string>
 5 | 
// Smoke test for DualWrapper: runs one 128x128x128 problem on deterministic
// input data and prints the three output tensors to stdout.
// Host and device buffers are never released; the process exits right after.
int main() {
    DualWrapper<128, 128, 32, 64, 64, 32, 1, 5, cutlass::layout::RowMajor, cutlass::layout::RowMajor, cutlass::layout::RowMajor> dw;
    int M = 128;
    int N = 128;
    int K = 128;

    dw.set_shape(M, N, K);
    // Seven M*N host buffers, all filled with the same ramp (j + k) / 20 / 128.
    // Sizing everything as M*N is only valid because M == N == K here.
    cutlass::half_t *host_tensors[7];
    for (int i = 0; i < 7; i++) {
        host_tensors[i] = new cutlass::half_t[M*N];
        for (int j = 0; j < M; j++) {
            for (int k = 0; k < N; k++) {
                host_tensors[i][j*N+k] = cutlass::half_t(float((j+k))/20/128);

            }
        }
    }

    // Mirror every host buffer on the device.
    // Buffer roles: 0 = A, 1 and 2 = the two weights, 3 = C, 4..6 = D outputs.
    cutlass::half_t *device_tensors[7];
    for (int i = 0; i < 7; i++) {
        cudaMalloc(&device_tensors[i], M*N*sizeof(cutlass::half_t));
        cudaMemcpy(device_tensors[i], host_tensors[i], M*N*sizeof(cutlass::half_t), cudaMemcpyHostToDevice);
    }
    vortexWeight b1, b2;
    b1.ptr = (half* )device_tensors[1];
    b1.N = N;
    b1.K = K;
    b2.ptr = (half* )device_tensors[2];
    b2.N = N;
    b2.K = K;
    
    dw.set_weight(b1,b2);


    pllmTensor<cutlass::half_t> a = pllmTensor<cutlass::half_t>(device_tensors[0], M, K, PllmLayout::ROW_MAJOR);
    pllmTensor<cutlass::half_t> c = pllmTensor<cutlass::half_t>(device_tensors[3], M, N, PllmLayout::ROW_MAJOR);
    pllmTensor<cutlass::half_t> d0 = pllmTensor<cutlass::half_t>(device_tensors[4], M, N, PllmLayout::ROW_MAJOR);
    pllmTensor<cutlass::half_t> d1 = pllmTensor<cutlass::half_t>(device_tensors[5], M, N, PllmLayout::ROW_MAJOR);
    pllmTensor<cutlass::half_t> d2 = pllmTensor<cutlass::half_t>(device_tensors[6], M, N, PllmLayout::ROW_MAJOR);

    dw.setA(a);
    dw.setC(c);
    dw.setD(d0, d1, d2);
    dw.init();
    // NOTE(review): set_weight is called again after init(); presumably init() needs the weights re-applied — confirm.
    dw.set_weight(b1,b2);
    dw.setStream(0);
    // Log the run into dual.txt through a dedicated spdlog file logger.
    std::string private_file_name = "dual.txt";
	auto private_logger = spdlog::basic_logger_mt("private_logger", private_file_name, true);
    dw.run().log(private_logger);
    // Wait for the run to finish before reading the results back.
    cudaDeviceSynchronize();

    // copy back d0, d1, d2
    for (int i = 4; i < 7; i++) {
        cudaMemcpy(host_tensors[i], device_tensors[i], M*N*sizeof(cutlass::half_t), cudaMemcpyDeviceToHost);
    }

    // Print each output tensor row by row, with a blank line between tensors.
    for (int i = 4; i < 7; i++) {
        for (int j = 0; j < M; j++) {
            for (int k = 0; k < N; k++) {
                std::cout << host_tensors[i][j*N+k] << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
    
    return 0;
}


--------------------------------------------------------------------------------
/pipeline/utils/.gitignore:
--------------------------------------------------------------------------------
1 | *.pt
2 | *.log
3 | *.csv
4 | *.req_*
5 | *.schedule
6 | 


--------------------------------------------------------------------------------
/pipeline/utils/gen_req.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | prompt = sys.argv[1]
 4 | decode_len = int(sys.argv[2])
 5 | request_rate = int(sys.argv[3])
 6 | output_name = sys.argv[4]
 7 | prefill_len = -1
 8 | 
 9 | # get first word of prompt
10 | first_word = prompt.split(' ')[0]
11 | 
12 | if request_rate == 0:
13 |     request_interval = 0
14 | else:
15 |     request_interval = 1 / request_rate
16 | 
17 | with open(f"{output_name}", "w") as f:
18 |     for i in range(100000):
19 |         f.write(f"{i},{prefill_len},{decode_len},{request_interval*i}, {prompt}\n")


--------------------------------------------------------------------------------
/pipeline/utils/listToCSV.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import re
 3 | import argparse
 4 | 
 5 | def list_to_csv(input_file, output_file):
 6 |     with open(input_file, 'r') as f:
 7 |         data = f.read()
 8 | 
 9 |     # Use regex to split data into cycles (both regular and skip cycles)
10 |     cycle_pattern = r'(------------------ (?:Cycle|Skip Cycle) (\d+) ------------------)'
11 |     cycles = re.split(cycle_pattern, data)[1:]
12 | 
13 |     rows = []
14 |     for i in range(0, len(cycles), 3):
15 |         cycle_type = cycles[i].strip()  # Cycle type (Cycle or Skip Cycle)
16 |         cycle_number = cycles[i + 1].strip()  # Cycle number
17 |         fields = cycles[i + 2].strip().splitlines()  # Fields within the cycle
18 | 
19 |         # Determine if it's a skip cycle
20 |         is_skip_cycle = 'Skip' in cycle_type
21 | 
22 |         # Parse fields and values
23 |         field_values = {'Cycle': cycle_number, 'is_skip_cycle': is_skip_cycle}
24 |         for field in fields:
25 |             key, value = map(str.strip, field.split(':', 1))
26 |             field_values[key] = value
27 | 
28 |         rows.append(field_values)
29 | 
30 |     # Get all possible field names
31 |     all_fields = set()
32 |     for row in rows:
33 |         all_fields.update(row.keys())
34 | 
35 |     # Write to CSV
36 |     with open(output_file, 'w', newline='') as csvfile:
37 |         writer = csv.DictWriter(csvfile, fieldnames=sorted(all_fields))
38 |         writer.writeheader()
39 |         for row in rows:
40 |             writer.writerow(row)
41 | 
def main():
    """Command-line entry point: convert the given log file to CSV.

    Takes two positional arguments, the input log path and the output CSV
    path, and passes them to list_to_csv.
    """
    cli = argparse.ArgumentParser(description='Parse cycle logs and convert them to CSV.')
    positional_args = (
        ('input_file', 'Path to the input log file'),
        ('output_file', 'Path to the output CSV file'),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, type=str, help=arg_help)

    parsed = cli.parse_args()
    list_to_csv(parsed.input_file, parsed.output_file)
53 | 
54 | if __name__ == "__main__":
55 |     main()
56 | 


--------------------------------------------------------------------------------
/pipeline/utils/plotSchedule.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import matplotlib.pyplot as plt
 3 | import argparse
 4 | 
 5 | def plotSchedule(schedule_csv):
 6 |     
 7 |     df = pd.read_csv(schedule_csv)
 8 |     # schedule_csv = "_"+schedule_csv
 9 |     plt.plot(df['Cycle'], df['memory usage %'], marker='o', markersize=1, linestyle='-', color='b')
10 |     plt.title('Memory Usage % Across Cycles')
11 |     plt.xlabel('Cycle')
12 |     plt.ylabel('Memory Usage %')
13 |     plt.grid(True)
14 |     plt.savefig(f'{schedule_csv}.memory_usage.png')
15 |     
16 |     # plot decode effective bsz
17 |     plt.figure()
18 |     plt.plot(df['Cycle'], df['decode effective bsz'], marker='o', linestyle='-', color='g')
19 |     plt.title('Decode Effective Batch Size Across Cycles')
20 |     plt.xlabel('Cycle')
21 |     plt.ylabel('Decode Effective Batch Size')
22 |     plt.grid(True)
23 |     plt.savefig(f'{schedule_csv}.decode_effective_bsz.png')
24 |     
25 |     # plot prefill effective bsz
26 |     plt.figure()
27 |     plt.plot(df['Cycle'], df['prefill effective bsz'], marker='o', linestyle='-', color='r')
28 |     plt.title('Prefill Effective Batch Size Across Cycles')
29 |     plt.xlabel('Cycle')
30 |     plt.ylabel('Prefill Effective Batch Size')
31 |     plt.grid(True)
32 |     plt.savefig(f'{schedule_csv}.prefill_effective_bsz.png')
33 |     
34 |     # plot both decode and prefill effective bsz in one plot
35 |     plt.figure()
36 |     plt.plot(df['Cycle'], df['decode effective bsz'], marker='o', linestyle='-', color='g', label='Decode Effective Batch Size')
37 |     plt.plot(df['Cycle'], df['prefill effective bsz'], marker='o', linestyle='-', color='r', label='Prefill Effective Batch Size')
38 |     plt.title('Decode and Prefill Effective Batch Size Across Cycles')
39 |     plt.xlabel('Cycle')
40 |     plt.ylabel('Effective Batch Size')
41 |     plt.legend()
42 |     plt.grid(True)
43 |     plt.savefig(f'{schedule_csv}.effective_bsz.png')
44 |     
45 | 
if __name__ == '__main__':
    # plotSchedule('512_large_kv_cache.schedule.csv')
    # plotSchedule('512.schedule.csv')
    # plotSchedule('740.schedule.csv')
    arg_parser = argparse.ArgumentParser()
    # The option is mandatory: without it plotSchedule would receive None and
    # fail inside pandas instead of printing a usage message.
    arg_parser.add_argument("--schedule_csv", type=str, required=True, help="path to schedule csv file")
    args = arg_parser.parse_args()
    plotSchedule(args.schedule_csv)


--------------------------------------------------------------------------------
/pipeline/utils/plot_trend.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | filename = sys.argv[1]
 4 | 
 5 | import pandas as pd
 6 | import matplotlib.pyplot as plt
 7 | 
 8 | df = pd.read_csv(filename, header=None, names=['prefill', 'decode'])
 9 | 
10 | df["sum"] = df["prefill"] + df["decode"]
11 | 
12 | df = df.drop(columns=["decode"])
13 | 
14 | df.plot()
15 | 
16 | # set y limit to 0 to 2100
17 | plt.ylim(0, 2100)
18 | 
19 | plt.savefig(f"{filename}.png")


--------------------------------------------------------------------------------
/pipeline/utils/request_info.py:
--------------------------------------------------------------------------------
 1 | from collections import deque
class NewRequestInfo:
    """
    Request info for incoming request
    NOTE (Yilong): add support for offloading / onloading KV-Cache

    Only annotations are declared here; no constructor is defined, so
    instances start without any of these attributes set.
    """
    # Identifier of the request.
    req_idx: int
    # Prompt as a list of ints (presumably token ids — confirm).
    prompt: list[int]
    # NOTE(review): presumably the number of tokens to generate — confirm.
    output_len : int
    # NOTE(review): presumably the arrival time of the request — confirm unit and clock.
    start_time: float
class NewRequestQueue:
    """
    FIFO buffer for incoming requests, backed by a collections.deque.

    No explicit locking is used; any thread-safety comes from the deque's own
    append / popleft operations.
    """
    def __init__(self) -> None:
        self._queue = deque()

    @property
    def size(self) -> int:
        """Number of requests currently buffered."""
        pending = self._queue
        return len(pending)

    def put(self, req: NewRequestInfo):
        """Add a request at the back of the queue."""
        self._queue.append(req)

    def get(self) -> NewRequestInfo:
        """Remove and return the oldest request; asserts the queue is not empty."""
        pending = self._queue
        assert pending, "Queue is empty"
        oldest = pending.popleft()
        return oldest

    def clear(self) -> None:
        """Drop every buffered request."""
        self._queue.clear()
32 | 
class FlyRequestInfo:
    """
    Bookkeeping record for a request that is currently being served.
    NOTE (Yilong): add support for offloading / onloading KV-Cache

    Every constructor argument is stored unchanged under the attribute of the
    same name. `kv_cache` must provide a release() method, which finish()
    calls.
    """

    def __init__(self, req_idx: int, input: list[int], output: list[int], prompt: list[int], request_comein_time: float, 
                 chunked_prefill: bool, kv_cache, encode_latency: float, 
                 decode_start_at: float, decode_latency: float, output_len: int, input_len: int):
        # Identity and arrival.
        self.req_idx = req_idx
        self.request_comein_time = request_comein_time

        # Token data.
        self.prompt = prompt
        self.input = input
        self.output = output
        self.input_len = input_len
        self.output_len = output_len

        # Execution state.
        self.chunked_prefill = chunked_prefill
        self.kv_cache = kv_cache

        # Timing.
        self.encode_latency = encode_latency
        self.decode_start_at = decode_start_at
        self.decode_latency = decode_latency

    def finish(self) -> None:
        """Release the KV cache held by this request."""
        cache = self.kv_cache
        cache.release()
57 | 
58 |     


--------------------------------------------------------------------------------
/serve.sh:
--------------------------------------------------------------------------------
 1 | current_dir=$(pwd)
 2 | parentdir="$(dirname "$current_dir")"
 3 | mkdir -p $parentdir/hf
 4 | 
 5 | export HF_HOME=$parentdir/hf
 6 | HF_HOME=$parentdir/hf
 7 | #check if token is cached
 8 | if [ ! -f $HF_HOME/token ]; then
 9 |     echo "Please login to Hugging Face to cache your token."
10 |     huggingface-cli login
11 | fi
12 | 
13 | 
14 | cd pipeline/utils
15 | read -e -p "Prompt [default: The University of Washington is located]: " -i "The University of Washington is located" prompt
16 | read -e -p "Decode length [default: 100]: " -i "100" decode_length
17 | read -e -p "Output file [default: trace.csv]: " -i "trace.csv" output_file
18 | 
19 | # Prompt for model selection and map the selection to a specific model path
20 | echo "Select model:"
21 | echo "1) llama2-70B"
22 | echo "2) llama3-70B"
23 | echo "3) llama3.1-70B"
24 | echo "4) llama3-8B"
25 | echo "5) llama3.1-8B"
26 | echo "6) Qwen2-72B"
27 | 
28 | read -p "Enter the number corresponding to your model choice: " model_choice
29 | 
30 | case $model_choice in
31 |     1)
32 |         config_path="../config_all/llama2-70B/2048.json"
33 |         ;;
34 |     2)
35 |         config_path="../config_all/llama3-70B/2048.json"
36 |         ;;
37 |     3)
38 |         config_path="../config_all/llama3.1-70B/2048.json"
39 |         ;;
40 |     4)
41 |         config_path="../config_all/llama3-8B/correct_40G/1024.json"
42 |         ;;
43 |     5)
44 |         config_path="../config_all/llama3.1-8B/1024.json"
45 |         ;;
46 |     6)
47 |         config_path="../config_all/qwen2-72B/2048.json"
48 |         ;;
49 |     *)
50 |         echo "Invalid choice. Defaulting to llama3-8B."
51 |         config_path="../config_all/llama3-8B/1024.json"
52 |         ;;
53 | esac
54 | 
55 | 
56 | python gen_req.py "${prompt}" ${decode_length} 0 ${output_file}
57 | 
58 | python serve_8B.py -t ${output_file} -c ${config_path} -r 200
59 | output_file_base="${output_file%.csv}"
60 | cat ${output_file_base}.req_words


--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
 1 | # init submodule dependencies
 2 | git submodule init
 3 | git submodule update
 4 | 
 5 | # install dependencies
 6 | apt update
 7 | apt install python3
 8 | pip3 install cmake
 9 | apt install libopenmpi-dev
10 | apt install wget
11 | pip install torch
12 | apt install libspdlog-dev
13 | apt-get install libglib2.0-0
14 | apt install pigz
15 | pip install wget
16 | pip install pandas
17 | pip install seaborn
18 | pip install mypy
19 | pip install transformers
20 | pip install --upgrade pydantic
21 | pip install sentencepiece
22 | apt-get install git-lfs
23 | apt-get install python3-pybind11
24 | apt-get install nlohmann-json3-dev
25 | 
26 | # fix pybind header compile error
27 | sed -i '446,486s/^/\/\//' /usr/include/pybind11/detail/type_caster_base.h
28 | 
29 | # install cmake 3.29.0
30 | cd ..
31 | wget https://github.com/Kitware/CMake/releases/download/v3.29.0-rc2/cmake-3.29.0-rc2-linux-x86_64.sh
32 | chmod +x cmake-3.29.0-rc2-linux-x86_64.sh
33 | ./cmake-3.29.0-rc2-linux-x86_64.sh --prefix=/usr/local --exclude-subdir
34 | 
35 | 
36 | # install nsight
37 | NSIGHT="NsightSystems-linux-cli-public-2023.4.1.97-3355750.deb"
38 | if [[ ! -f "$NSIGHT" ]]; then
39 |   wget https://developer.download.nvidia.com/devtools/nsight-systems/$NSIGHT
40 |   dpkg -i ./$NSIGHT
41 | fi
42 | 
43 | cd Nanoflow
44 | 
45 | 
46 | 
47 | 
48 | # build mscclpp
49 | cd 3rdparty/mscclpp
50 | git reset --hard cdaf3aea3d767ba65dd3b08984d76bd50615f92e
51 | 
52 | cd ../..
53 | for repo in mscclpp; do
54 |   cat 3rdparty/patches/${repo}/*.patch | patch -p1 -d 3rdparty/${repo}
55 | done
56 | 
57 | cd 3rdparty/mscclpp
58 | 
59 | mkdir -p build
60 | cd build
61 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/mscclpp -DBUILD_PYTHON_BINDINGS=OFF ..
62 | make -j mscclpp mscclpp_static
63 | make install/fast
64 | cd ../../../
65 | 
66 | # fix spdlog v1.14.0 + cuda 12.1 compatibility bug
67 | for repo in spdlog; do
68 |   cat 3rdparty/patches/${repo}/*.patch | patch -p1 -d 3rdparty/${repo}
69 | done
70 | 
71 | 
72 | cd pipeline
73 | 
74 | # download and trace visualizer
75 | cd utils
76 | curl -LO https://get.perfetto.dev/trace_processor
77 | chmod +x ./trace_processor
78 | cd ..
79 | 
80 | # generate gemm lib
81 | cd src/generate-gemm
82 | python3 genGEMM.py
83 | cd ../../
84 | 
85 | # build pllm
86 | mkdir -p build
87 | cd build
88 | cmake ..
89 | make -j 256
90 | 
91 | # set up libstdc++.so.6 directory
92 | 
93 | export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
94 | 
95 | ./test_compute ../config_all/llama3-8B/1024.json
96 | 
97 | 


--------------------------------------------------------------------------------