├── .gitignore ├── .gitmodules ├── 3rdparty └── patches │ ├── cutlass │ └── 0001-Expose-internal-GEMM-fields-for-pllm.patch │ ├── mscclpp │ ├── mscclpp_assert_fail_workaround.patch │ └── no_rlimit_in_docker.patch │ └── spdlog │ └── cuda12.1_fmt_error_fix.patch ├── LICENSE ├── README.md ├── datasets ├── .gitignore ├── gen.sh ├── traces │ ├── lmsys.csv │ ├── sharegpt.csv │ └── splitwise.csv └── utils │ ├── gen_const_len_req.py │ ├── generate_fix_trace.sh │ ├── generate_real_trace.sh │ ├── prepare_dataset.py │ ├── prepare_real_data.py │ ├── prepare_splitwise_data.py │ ├── prepare_synthetic_data.py │ ├── preprocess_lmsys.py │ ├── preprocess_sharegpt.py │ └── utils.py ├── figures ├── NanoflowLogo.png ├── OfflineThroughput.png ├── SampleOutput.png ├── SystemDesign.png ├── async-schedule.png ├── feasibility.png ├── online-latency.png ├── pipeline.gif └── serve.png ├── gemv ├── .gitignore ├── CMakeLists.txt ├── include │ ├── attention │ │ ├── decode.cuh │ │ ├── handler.cuh │ │ └── prefill.cuh │ ├── attention_impl.cuh │ ├── decode_attention_decl.cuh │ ├── prefill_attention_decl.cuh │ └── small_blk_utils.cuh ├── python │ ├── generate_batch_paged_decode_inst.py │ ├── generate_batch_paged_prefill_inst.py │ └── literal_map.py └── src │ ├── bench_batch_decode.cu │ ├── bench_batch_prefill.cu │ ├── cpu_reference.h │ ├── test_batch_decode.cu │ ├── test_batch_prefill.cu │ └── utils.h ├── groundtruth ├── detokenize.py ├── test.py └── tokenize1.py ├── installAnaconda.sh ├── modelDownload.sh ├── perf.sh ├── pipeline ├── CMakeLists.txt ├── config_all │ ├── .gitignore │ ├── llama2-70B │ │ ├── 1024.json │ │ ├── 2048.json │ │ ├── 768.json │ │ ├── correct_40G │ │ │ ├── 2048.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── fewer_layers │ │ │ ├── 1024.json │ │ │ ├── 2048.json │ │ │ ├── 768.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── nanobatch-only.json │ │ ├── non-overlap.json │ │ └── pllm-offload.json │ ├── llama3-70B │ │ ├── 1024.json │ │ ├── 2048.json │ │ ├── 768.json │ │ ├── correct_40G │ │ │ ├── 2048.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── fewer_layers │ │ │ ├── 1024.json │ │ │ ├── 2048.json │ │ │ ├── 768.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── llama3-8B │ │ ├── 1024.json │ │ ├── correct_40G │ │ │ ├── 1024.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── fewer_layers │ │ │ ├── 1024.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── llama3.1-70B │ │ ├── 1024.json │ │ ├── 2048.json │ │ ├── 768.json │ │ ├── correct_40G │ │ │ ├── 2048.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── fewer_layers │ │ │ ├── 1024.json │ │ │ ├── 2048.json │ │ │ ├── 768.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── llama3.1-8B │ │ ├── 1024.json │ │ ├── correct_40G │ │ │ ├── 1024.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── fewer_layers │ │ │ ├── 1024.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── mixtral-8-7B │ │ ├── 1024.json │ │ ├── correct_40G │ │ │ ├── 1024.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── fewer_layers │ │ │ ├── 1024.json │ │ │ ├── nanobatch-only.json │ │ │ └── non-overlap.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ └── qwen2-72B │ │ ├── 2048.json │ │ ├── correct_40G │ │ ├── 2048.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ │ ├── fewer_layers │ │ ├── 2048.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json ├── eval │ ├── .gitignore │ ├── baseline_data.py │ ├── calculate_speedup.py │ ├── clean_all.sh │ ├── eval-ablation │ │ ├── clean.sh │ │ └── run.py │ ├── eval-fix-offline │ │ ├── clean.sh │ │ └── run.py │ ├── eval-real-offline │ │ ├── clean.sh │ │ └── run.py │ ├── eval-real-online-1024 │ │ ├── clean.sh │ │ └── run.py │ ├── eval-real-online-2048 │ │ ├── auto_datapoint.py │ │ ├── clean.sh │ │ └── run.py │ ├── eval-real-online-768 │ │ ├── clean.sh │ │ └── run.py │ ├── eval.sh │ ├── eval_output_example.py │ ├── merge_results.py │ ├── plot_all.py │ ├── run_all.sh │ ├── run_offline.sh │ └── summarize.py ├── include │ ├── allocManager.cuh │ ├── comm.h │ ├── computeBound.cuh │ ├── config.h │ ├── cutlassGemmWrapper.cuh │ ├── cutlassGemmWrapperImpl.cuh │ ├── dualWrapper.cuh │ ├── eventManager.cuh │ ├── gemmFactory.cuh │ ├── gemmShape.cuh │ ├── gemvConfig.cuh │ ├── gemvDependency.cuh │ ├── gemvWrapper.cuh │ ├── helper.h │ ├── netWrapper.cuh │ ├── networkManager.cuh │ ├── offloadKernel.cuh │ ├── operatorWrapper.cuh │ ├── otherWrapper.cuh │ ├── pipeline.h │ ├── rms_norm.cuh │ ├── sleep.cuh │ ├── small_cuda_operator.cuh │ ├── tensor.cuh │ ├── tensorLogger.cuh │ ├── tensorManager.cuh │ └── vortexData.cuh ├── perf │ └── test.py ├── src │ ├── comm.cu │ ├── comm_test.cu │ ├── computeBound.cu │ ├── computeMain.cu │ ├── gemvDependency.cu │ ├── generate-gemm │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── gemmFactory.in │ │ └── genGEMM.py │ ├── load_config.cu │ ├── networkManager.cu │ ├── offloadKernel.cu │ ├── pipeline.cu │ ├── pipeline_local.cu │ ├── pipeline_nonoverlap.cu │ ├── pipeline_nonoverlap_local.cu │ ├── pipeline_nonoverlap_nanobatch.cu │ ├── pybind.cu │ ├── pythonProfiling.py │ ├── run.sh │ ├── sleep.cu │ ├── small_cuda_operator.cu │ ├── tensorLogger.cu │ ├── test.py │ ├── test_dual.cu │ └── vortexData.cu └── utils │ ├── .gitignore │ ├── frontend.py │ ├── gen_req.py │ ├── kv_cache.py │ ├── listToCSV.py │ ├── plotSchedule.py │ ├── plot_trend.py │ ├── pybindUtil.py │ ├── request_info.py │ ├── scheduler.py │ ├── serve.py │ ├── serve_8B.py │ ├── serve_8B_3_1.py │ ├── weightLoader.py │ ├── weightSaver.py │ └── weightSaver_3_1.py ├── serve.sh └── setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.ii 3 | *.gpu 4 | *.ptx 5 | *.cubin 6 | *.fatbin 7 | *.out 8 | *.txt 9 | !CMakeLists.txt 10 | *.csv 11 | /temp 12 | *.a 13 | *.o 14 | *.nsys-rep 15 | *.sqlite 16 | /cutlassProfile/tools 17 | *.sass 18 | cutlass_profiler 19 | build 20 | .vscode 21 | *.pdf 22 | *.png 23 | __pycache__ 24 | *.prof 25 | new-small-gemv/src/generated/ 26 | pipeline/src/generated/ 27 | /detailedGemmKernelPerf/CMakeFiles/ 28 | *.json 29 | *.json.gz 30 | *.cmake 31 | *.stat 32 | trace_processor 33 | *.tar.gz 34 | !pipeline/config/* 35 | Anaconda3-2024.02-1-Linux-x86_64.sh 36 | *.log 37 | *.csv 38 | *.req_* 39 | *.schedule 40 | !figures/* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/flashinfer"] 2 | path = 3rdparty/flashinfer 3 | url = https://github.com/happierpig/flashinfer-ai.git 4 | 5 | [submodule "3rdparty/nvbench"] 6 | path = 3rdparty/nvbench 7 | url = https://github.com/NVIDIA/nvbench.git 8 | [submodule "3rdparty/gtest"] 9 | path = 3rdparty/gtest 10 | url = https://github.com/google/googletest.git 11 | [submodule "3rdparty/mscclpp"] 12 | path = 3rdparty/mscclpp 13 | url = https://github.com/microsoft/mscclpp.git 14 | [submodule "3rdparty/spdlog"] 15 | path = 3rdparty/spdlog 16 | url = https://github.com/gabime/spdlog.git 17 | [submodule "3rdparty/cutlass"] 18 | path = 3rdparty/cutlass 19 | url = https://github.com/NVIDIA/cutlass.git 20 | -------------------------------------------------------------------------------- /3rdparty/patches/cutlass/0001-Expose-internal-GEMM-fields-for-pllm.patch: -------------------------------------------------------------------------------- 1 | From 550e5c626f9f86ac8100155d767f52fc8c4dc815 Mon Sep 17 00:00:00 2001 2 | From: Alkaid 3 | Date: Tue, 26 Mar 2024 14:50:39 -0700 4 | Subject: [PATCH] Expose internal GEMM fields for pllm 5 | 6 | --- 7 | include/cutlass/gemm/device/gemm.h | 2 -- 8 | 1 file changed, 2 deletions(-) 9 | 10 | diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h 11 | index f0226354..805e3d2c 100644 12 | --- a/include/cutlass/gemm/device/gemm.h 13 | +++ b/include/cutlass/gemm/device/gemm.h 14 | @@ -346,8 +346,6 @@ class Gemm { 15 | } 16 | }; 17 | 18 | -private: 19 | - 20 | /// Kernel parameters object 21 | typename GemmKernel::Params params_; 22 | 23 | -- 24 | 2.44.0 25 | 26 | -------------------------------------------------------------------------------- /3rdparty/patches/mscclpp/mscclpp_assert_fail_workaround.patch: -------------------------------------------------------------------------------- 1 | diff --git a/include/mscclpp/poll_device.hpp b/include/mscclpp/poll_device.hpp 2 | index 9ad116f..33325c8 100644 3 | --- a/include/mscclpp/poll_device.hpp 4 | +++ b/include/mscclpp/poll_device.hpp 5 | @@ -14,10 +14,10 @@ 6 | #define __assert_fail(__assertion, __file, __line, __function) ; 7 | #else // !defined(NDEBUG) 8 | #if defined(MSCCLPP_DEVICE_HIP) 9 | -extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, 10 | +extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, 11 | const char *__function); 12 | #else // !defined(MSCCLPP_DEVICE_HIP) 13 | -extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, 14 | +extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, 15 | const char *__function) __THROW; 16 | #endif // !defined(MSCCLPP_DEVICE_HIP) 17 | #endif // NDEBUG 18 | -------------------------------------------------------------------------------- /3rdparty/patches/mscclpp/no_rlimit_in_docker.patch: -------------------------------------------------------------------------------- 1 | For Runpod (i.e., docker), it does not allow rlimit. 2 | diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc 3 | index c9cea10..ed3e956 100644 4 | --- a/src/bootstrap/bootstrap.cc 5 | +++ b/src/bootstrap/bootstrap.cc 6 | @@ -278,7 +278,6 @@ void TcpBootstrap::Impl::bootstrapRoot() { 7 | 8 | std::memset(rankAddresses.data(), 0, sizeof(SocketAddress) * nRanks_); 9 | std::memset(rankAddressesRoot.data(), 0, sizeof(SocketAddress) * nRanks_); 10 | - setFilesLimit(); 11 | 12 | TRACE(MSCCLPP_INIT, "BEGIN"); 13 | /* Receive addresses from all ranks */ 14 | -------------------------------------------------------------------------------- /3rdparty/patches/spdlog/cuda12.1_fmt_error_fix.patch: -------------------------------------------------------------------------------- 1 | fmt upstream repo fixed this issue (https://github.com/fmtlib/fmt/pull/1818). 2 | But spdlog hasn't updated fmt for years (https://github.com/gabime/spdlog/issues/1662) 3 | Observed to impact cuda 12.1 but not 12.4 4 | diff --git a/include/spdlog/fmt/bundled/core.h b/include/spdlog/fmt/bundled/core.h 5 | index b51c1406..27b8c3f6 100644 6 | --- a/include/spdlog/fmt/bundled/core.h 7 | +++ b/include/spdlog/fmt/bundled/core.h 8 | @@ -241,7 +241,7 @@ 9 | # if defined(__cpp_nontype_template_args) && \ 10 | ((FMT_GCC_VERSION >= 903 && FMT_CPLUSPLUS >= 201709L) || \ 11 | __cpp_nontype_template_args >= 201911L) && \ 12 | - !defined(__NVCOMPILER) && !defined(__LCC__) 13 | + !defined(__NVCOMPILER) && !defined(__LCC__) && !defined(__NVCC__) 14 | # define FMT_USE_NONTYPE_TEMPLATE_ARGS 1 15 | # else 16 | # define FMT_USE_NONTYPE_TEMPLATE_ARGS 0 17 | 18 | -------------------------------------------------------------------------------- /datasets/.gitignore: -------------------------------------------------------------------------------- 1 | !/traces/lmsys.csv 2 | !/traces/splitwise.csv 3 | !/traces/sharegpt.csv 4 | -------------------------------------------------------------------------------- /datasets/gen.sh: -------------------------------------------------------------------------------- 1 | cd ./utils 2 | ./generate_fix_trace.sh 3 | 4 | rates=$(seq 0 50) 5 | for rate in $rates; do 6 | echo "Generating traces for rate: $rate" 7 | bash ./generate_real_trace.sh $rate & 8 | done 9 | 10 | wait 11 | 12 | echo "All traces generated" 13 | -------------------------------------------------------------------------------- /datasets/utils/gen_const_len_req.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | prefill_len = int(sys.argv[1]) 4 | decode_len = int(sys.argv[2]) 5 | request_rate = int(sys.argv[3]) 6 | output_prefix = sys.argv[4] 7 | 8 | if request_rate == 0: 9 | request_interval = 0 10 | else: 11 | request_interval = 1 / request_rate 12 | 13 | with open(f"{output_prefix}/{prefill_len}-{decode_len}-{request_rate}.csv", 'w') as f: 14 | for i in range(100000): 15 | f.write(f"{i},{prefill_len},{decode_len},{request_interval*i}\n") -------------------------------------------------------------------------------- /datasets/utils/generate_fix_trace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | tracedir=../traces 3 | mkdir -p ${tracedir} 4 | fixTraceDir=${tracedir}/fixed 5 | mkdir -p ${fixTraceDir} 6 | 7 | 8 | input_output_pairs=( 9 | "512 512" 10 | "1024 512" 11 | "512 1024" 12 | "128 1024" 13 | "512 2" 14 | ) 15 | 16 | for pair in "${input_output_pairs[@]}"; do 17 | 18 | read input_len output_len <<< "$pair" 19 | echo "Generating trace for input_len: $input_len, output_len: $output_len" 20 | python3 gen_const_len_req.py $input_len $output_len 0 ${fixTraceDir} 21 | 22 | done -------------------------------------------------------------------------------- /datasets/utils/generate_real_trace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | tracedir=../traces 4 | splitwise_output_dir=${tracedir}/splitwise 5 | lmsys_output_dir=${tracedir}/lmsys 6 | sharegpt_output_dir=${tracedir}/sharegpt 7 | mkdir -p ${splitwise_output_dir} 8 | mkdir -p ${lmsys_output_dir} 9 | mkdir -p ${sharegpt_output_dir} 10 | 11 | minute=5 12 | num_requests=15000 13 | 14 | # get the argument as rate 15 | rate=$1 16 | 17 | # splitwise 18 | filename="${splitwise_output_dir}/splitwise-rate-${rate}-${minute}min-reqs-${num_requests}-exp-delay.csv" 19 | safe_filename=$(echo "$filename" | sed 's/\([0-9]\)\.\([0-9]\)/\1_\2/') 20 | python3 ./prepare_dataset.py \ 21 | --output ${safe_filename} \ 22 | --request-rate ${rate} \ 23 | --time-delay-dist exponential_dist \ 24 | --tokenizer lmsys/longchat-13b-16k\ 25 | splitwise \ 26 | --num-requests 17563 \ 27 | --trace-path ${tracedir}/splitwise.csv \ 28 | --mode length 29 | # lmsys 30 | filename="${lmsys_output_dir}/lmsys-rate-${rate}-${minute}min-reqs-${num_requests}-exp-delay.csv" 31 | safe_filename=$(echo "$filename" | sed 's/\([0-9]\)\.\([0-9]\)/\1_\2/') 32 | python3 ./prepare_dataset.py \ 33 | --output ${safe_filename} \ 34 | --request-rate ${rate} \ 35 | --time-delay-dist exponential_dist \ 36 | --tokenizer lmsys/longchat-13b-16k\ 37 | splitwise \ 38 | --num-requests 50000 \ 39 | --trace-path ${tracedir}/lmsys.csv \ 40 | --mode length 41 | # sharegpt 42 | filename="${sharegpt_output_dir}/sharegpt-rate-${rate}-${minute}min-reqs-${num_requests}-exp-delay.csv" 43 | safe_filename=$(echo "$filename" | sed 's/\([0-9]\)\.\([0-9]\)/\1_\2/') 44 | python3 ./prepare_dataset.py \ 45 | --output ${safe_filename} \ 46 | --request-rate ${rate} \ 47 | --time-delay-dist exponential_dist \ 48 | --tokenizer lmsys/longchat-13b-16k\ 49 | splitwise \ 50 | --num-requests 50000 \ 51 | --trace-path ${tracedir}/sharegpt.csv \ 52 | --mode length -------------------------------------------------------------------------------- /datasets/utils/prepare_real_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import click 4 | from utils import dataset_dump, get_list_of_delays 5 | 6 | 7 | @click.command() 8 | @click.option("--dataset", 9 | required=True, 10 | type=str, 11 | help='Dataset path used for the test.') 12 | @click.option( 13 | "--num-requests", 14 | type=int, 15 | default=None, 16 | help= 17 | 'Number of requests to be generated. Default is dataset length. Will be capped to min(dataset, num_requests).' 18 | ) 19 | @click.option( 20 | "--op-tokens-per-word", 21 | type=float, 22 | default=1.3, 23 | help= 24 | 'Specify op tokens/word ratio. Useful to have model generate exactly as many tokens as needed by the dataset.' 25 | ) 26 | @click.option("--max-input-len", 27 | type=int, 28 | default=500000, 29 | help='Specify max input length.') 30 | @click.pass_obj 31 | def dataset(root_args, **kwargs): 32 | """Prepare dataset from real dataset.""" 33 | prompt_cnt = 0 34 | input_ids = [] 35 | output_lens = [] 36 | ratio = [] 37 | 38 | with open(kwargs['dataset'], 'r') as f: 39 | data_dict = json.load(f) 40 | 41 | if kwargs['num_requests'] is None: 42 | kwargs['num_requests'] = len(data_dict) 43 | else: 44 | kwargs['num_requests'] = min(kwargs['num_requests'], len(data_dict)) 45 | 46 | for req in data_dict: 47 | prompt = req['input'] + ' ' + req['instruction'] 48 | output = req['output'] 49 | line = root_args.tokenizer.encode(prompt) 50 | if len(line) > kwargs['max_input_len']: 51 | continue 52 | 53 | prompt_cnt += 1 54 | if prompt_cnt > kwargs['num_requests']: 55 | break 56 | 57 | input_ids.append(line) 58 | output_lens.append( 59 | int(len(output.split(' ')) * kwargs['op_tokens_per_word'])) 60 | 61 | prompt_tokens = len(line) 62 | prompt_words = len(prompt.split()) 63 | ratio.append(prompt_tokens / prompt_words) 64 | 65 | delays = get_list_of_delays(root_args.time_delay_dist, 66 | root_args.mean_time_bet_reqs, len(input_ids), 67 | root_args.random_seed) 68 | 69 | dataset_dump( 70 | input_ids, output_lens, delays, { 71 | "workload_type": "dataset", 72 | "tokenizer": root_args.tokenizer.__class__.__name__, 73 | "num_requests": kwargs['num_requests'], 74 | "delay_distr": root_args.time_delay_dist, 75 | "request_rate": root_args.request_rate 76 | }, root_args.output) -------------------------------------------------------------------------------- /datasets/utils/prepare_synthetic_data.py: -------------------------------------------------------------------------------- 1 | import click 2 | from utils import (dataset_dump, gen_random_tokens, get_list_of_delays, 3 | get_norm_dist_tokens) 4 | 5 | 6 | @click.command() 7 | @click.option("--num-requests", 8 | required=True, 9 | type=int, 10 | help='Number of requests to be generated') 11 | @click.option('--input-mean', 12 | required=True, 13 | type=int, 14 | help='normal dist mean for input tokens') 15 | @click.option('--input-stdev', 16 | required=True, 17 | type=int, 18 | help='normal dist stdev for input tokens') 19 | @click.option('--output-mean', 20 | required=True, 21 | type=int, 22 | help='normal dist mean for output tokens') 23 | @click.option('--output-stdev', 24 | required=True, 25 | type=int, 26 | help='normal dist stdev for output tokens') 27 | @click.pass_obj 28 | def token_norm_dist(root_args, **kwargs): 29 | """Prepare dataset by generating random tokens.""" 30 | input_ids = [] 31 | input_lens = [] 32 | output_lens = [] 33 | 34 | input_lens = get_norm_dist_tokens(kwargs['input_mean'], 35 | kwargs['input_stdev'], 36 | kwargs['num_requests'], 37 | root_args.random_seed) 38 | 39 | num_reqs = len(input_lens) 40 | output_lens = get_norm_dist_tokens(kwargs['output_mean'], 41 | kwargs['output_stdev'], num_reqs, 42 | root_args.random_seed) 43 | delays = get_list_of_delays(root_args.time_delay_dist, 44 | root_args.mean_time_bet_reqs, num_reqs, 45 | root_args.random_seed) 46 | 47 | input_ids = gen_random_tokens(input_lens, root_args.tokenizer, 48 | root_args.random_seed) 49 | 50 | dataset_dump( 51 | input_ids, output_lens, delays, { 52 | "workload_type": "token-norm-dist", 53 | "input_mean": kwargs['input_mean'], 54 | "input_stdev": kwargs['input_stdev'], 55 | "output_mean": kwargs['output_mean'], 56 | "output_stdev": kwargs['output_stdev'], 57 | "num_requests": kwargs['num_requests'], 58 | "delay_distr": root_args.time_delay_dist, 59 | "request_rate": root_args.request_rate, 60 | "tokenize_vocabsize": root_args.tokenizer.vocab_size 61 | }, root_args.output) -------------------------------------------------------------------------------- /datasets/utils/preprocess_lmsys.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoTokenizer 3 | from tqdm import tqdm 4 | 5 | 6 | f = open("lmsys.csv", "w") 7 | f.write("TIMESTAMP,ContextTokens,GeneratedTokens\n") 8 | 9 | num_samples = 100000 10 | 11 | # If the dataset is gated/private, make sure you have run huggingface-cli login 12 | dataset = load_dataset("lmsys/lmsys-chat-1m") 13 | 14 | # sample num_samples samples from the dataset randomly 15 | dataset = dataset.shuffle(seed=42) 16 | dataset = dataset["train"] 17 | 18 | tokenizer = AutoTokenizer.from_pretrained("lmsys/longchat-13b-16k") 19 | 20 | collected_samples = 0 21 | for i in tqdm(range(len(dataset))): 22 | if len(dataset[i]["conversation"]) < 2: 23 | continue 24 | if dataset[i]["conversation"][0]["role"] != "user" or dataset[i]["conversation"][1]["role"] != "assistant": 25 | continue 26 | input_tokens = tokenizer.encode(dataset[i]["conversation"][0]['content']) 27 | output_tokens = tokenizer.encode(dataset[i]["conversation"][1]['content']) 28 | f.write(f"XXX,{len(input_tokens)},{len(output_tokens)}\n") 29 | collected_samples += 1 30 | if collected_samples >= num_samples: 31 | break 32 | 33 | # ContextTokens 92.20294 34 | # GeneratedTokens 207.39722 35 | # dtype: float64 -------------------------------------------------------------------------------- /figures/NanoflowLogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/NanoflowLogo.png -------------------------------------------------------------------------------- /figures/OfflineThroughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/OfflineThroughput.png -------------------------------------------------------------------------------- /figures/SampleOutput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/SampleOutput.png -------------------------------------------------------------------------------- /figures/SystemDesign.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/SystemDesign.png -------------------------------------------------------------------------------- /figures/async-schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/async-schedule.png -------------------------------------------------------------------------------- /figures/feasibility.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/feasibility.png -------------------------------------------------------------------------------- /figures/online-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/online-latency.png -------------------------------------------------------------------------------- /figures/pipeline.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/pipeline.gif -------------------------------------------------------------------------------- /figures/serve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/d6b381e58110a8b5d08cfabd4a55c0d5d0ebef57/figures/serve.png -------------------------------------------------------------------------------- /gemv/.gitignore: -------------------------------------------------------------------------------- 1 | /src/generated/* -------------------------------------------------------------------------------- /gemv/include/attention_impl.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024 by FlashInfer team. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #ifndef FLASHINFER_ATTENTION_IMPL_CUH_ 17 | #define FLASHINFER_ATTENTION_IMPL_CUH_ 18 | 19 | #include "flashinfer/attention/cascade.cuh" 20 | 21 | #include "attention/decode.cuh" 22 | #include "attention/prefill.cuh" 23 | 24 | #endif // FLASHINFER_ATTENTION_IMPL_CUH_ 25 | -------------------------------------------------------------------------------- /gemv/include/small_blk_utils.cuh: -------------------------------------------------------------------------------- 1 | #ifndef FLASHINFER_SMALL_BLK_UTILS 2 | #define FLASHINFER_SMALL_BLK_UTILS 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace flashinfer 12 | { 13 | /*! 14 | * \brief The type of launch. Whether use small blk to run the kernel or not. 15 | */ 16 | enum class LaunchType 17 | { 18 | // Use all blk to launch, default flashinfer 19 | AllBlk = 0, 20 | // Use constrained sm to run. Need specify how many sm. 21 | SmallBlk = 1, 22 | }; 23 | 24 | inline std::string LaunchTypeToString(const LaunchType& lType) { 25 | switch (lType) { 26 | case LaunchType::AllBlk: 27 | return "All"; 28 | case LaunchType::SmallBlk: 29 | return "Small"; 30 | default: 31 | return "Unknown"; 32 | } 33 | } 34 | 35 | } // namespace flashinfer 36 | 37 | #define DISPATCH_LAUNCH(ltype, LTYPE, ...) \ 38 | switch(ltype) { \ 39 | case LaunchType::AllBlk: { \ 40 | constexpr LaunchType LTYPE = LaunchType::AllBlk; \ 41 | __VA_ARGS__ \ 42 | break; \ 43 | } \ 44 | case LaunchType::SmallBlk: { \ 45 | constexpr LaunchType LTYPE = LaunchType::SmallBlk; \ 46 | __VA_ARGS__ \ 47 | break; \ 48 | } \ 49 | default: { \ 50 | std::ostringstream err_msg; \ 51 | err_msg << "Unsupported launch type: " << int(ltype); \ 52 | throw std::invalid_argument(err_msg.str()); \ 53 | } \ 54 | } 55 | 56 | #endif // FLASHINFER_SMALL_BLK_UTILS -------------------------------------------------------------------------------- /gemv/python/generate_batch_paged_decode_inst.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 by FlashInfer team. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import sys 18 | import re 19 | from literal_map import ( 20 | kv_layout_literal, 21 | pos_encoding_mode_literal, 22 | dtype_literal, 23 | idtype_literal, 24 | launchtype_literal, 25 | ) 26 | from pathlib import Path 27 | 28 | 29 | def get_cu_file_str( 30 | group_size, head_dim, kv_layout, pos_encoding_mode, dtype_in, dtype_out, idtype, ltype 31 | ): 32 | content = """#include 33 | 34 | namespace flashinfer {{ 35 | 36 | constexpr PageStorage page_storage = PageStorage::kIndices; 37 | 38 | template cudaError_t BatchDecodeWithPagedKVCacheDispatched<{group_size}, {head_dim}, page_storage, {kv_layout}, {pos_encoding_mode}, {dtype_in}, {dtype_out}, {idtype}, {ltype}>( 39 | {dtype_in}* q, {idtype}* q_offset, 40 | paged_kv_t paged_kv, 41 | kv_partition_info_t<{idtype}> kv_partition_info, 42 | {dtype_out}* o, {dtype_out}* tmp, float* lse, 43 | size_t sm_blk, float sm_scale, float rope_scale, 44 | float rope_theta, cudaStream_t stream); 45 | 46 | }} 47 | """.format( 48 | kv_layout=kv_layout_literal[int(kv_layout)], 49 | group_size=group_size, 50 | head_dim=head_dim, 51 | pos_encoding_mode=pos_encoding_mode_literal[int(pos_encoding_mode)], 52 | dtype_in=dtype_literal[dtype_in], 53 | dtype_out=dtype_literal[dtype_out], 54 | idtype=idtype_literal[idtype], 55 | ltype=launchtype_literal[ltype], 56 | ) 57 | return content 58 | 59 | 60 | if __name__ == "__main__": 61 | pattern = ( 62 | r"batch_paged_decode_group_([0-9]+)_head_([0-9]+)_layout_([0-9]+)_posenc_([0-9]+)_" 63 | r"dtypein_([a-z0-9]+)_dtypeout_([a-z0-9]+)_idtype_([a-z0-9]+)_launchtype_([a-z]+)\.cu" 64 | ) 65 | 66 | compiled_pattern = re.compile(pattern) 67 | path = Path(sys.argv[1]) 68 | fname = path.name 69 | match = compiled_pattern.match(fname) 70 | with open(path, "w") as f: 71 | f.write(get_cu_file_str(*match.groups())) 72 | -------------------------------------------------------------------------------- /gemv/python/literal_map.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 by FlashInfer team. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | kv_layout_literal = { 18 | 0: "QKVLayout::kNHD", 19 | 1: "QKVLayout::kHND", 20 | } 21 | 22 | pos_encoding_mode_literal = { 23 | 0: "PosEncodingMode::kNone", 24 | 1: "PosEncodingMode::kRoPELlama", 25 | 2: "PosEncodingMode::kALiBi", 26 | } 27 | 28 | dtype_literal = { 29 | "f16": "half", 30 | "bf16": "nv_bfloat16", 31 | "e4m3": "__nv_fp8_e4m3", 32 | "e5m2": "__nv_fp8_e5m2", 33 | } 34 | 35 | idtype_literal = { 36 | "i32": "int32_t", 37 | "u32": "uint32_t", 38 | "i64": "int64_t", 39 | "u64": "uint64_t", 40 | } 41 | 42 | bool_literal = { 43 | 0: "false", 44 | 1: "true", 45 | } 46 | 47 | launchtype_literal = { 48 | "all": "LaunchType::AllBlk", 49 | "small": "LaunchType::SmallBlk", 50 | } -------------------------------------------------------------------------------- /groundtruth/detokenize.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | 4 | # Set the HF_HOME environment variable 5 | os.environ['HF_HOME'] = '/code/hf' 6 | from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaModel, LlamaForCausalLM 7 | # import torch 8 | 9 | 10 | 11 | model_name_or_path = "meta-llama/Llama-2-70b-chat-hf" 12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 13 | 14 | 15 | # Create a partial model with a fraction of layers, e.g., the first 12 layers 16 | 17 | 18 | # Prepare input 19 | input_text = "Hi, " 20 | inputs = tokenizer(input_text, return_tensors="pt") 21 | input_ids = inputs['input_ids'] 22 | input_ids = input_ids 23 | 24 | output_text = tokenizer.decode([1, 29915, 29885, 1811], skip_special_tokens=True) 25 | print(output_text) 26 | -------------------------------------------------------------------------------- /groundtruth/test.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | 5 | # Set the HF_HOME environment variable 6 | from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaModel, LlamaForCausalLM 7 | import torch 8 | # import torch 9 | 10 | 11 | 12 | model_name_or_path = "meta-llama/Llama-2-70b-chat-hf" 13 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 14 | 15 | # Load the full model 16 | model = LlamaForCausalLM.from_pretrained(model_name_or_path) 17 | 18 | 19 | # Create a partial model with a fraction of layers, e.g., the first 12 layers 20 | 21 | 22 | # Prepare input 23 | input_text = "Hi, I'm" 24 | inputs = tokenizer(input_text, return_tensors="pt") 25 | input_ids = inputs['input_ids'] 26 | print(input_ids.size()) 27 | 28 | # Perform inference and decode the output 29 | output = model.generate( 30 | input_ids, 31 | max_new_tokens=5, 32 | do_sample=True, # Activate sampling 33 | top_k=1, # Use max sampling (greedy sampling with randomness) 34 | temperature=1.0 # Optional: Control randomness. Higher values give more diversity. 35 | ) 36 | print(output) 37 | output_text = tokenizer.decode(output[0], skip_special_tokens=True) 38 | print(output_text) 39 | print("input_ids", input_ids) 40 | -------------------------------------------------------------------------------- /groundtruth/tokenize1.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | 4 | # Set the HF_HOME environment variable 5 | os.environ['HF_HOME'] = '/code/hf' 6 | from transformers import AutoTokenizer 7 | # import torch 8 | 9 | 10 | 11 | model_name_or_path = "meta-llama/Llama-2-70b-chat-hf" 12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 13 | 14 | 15 | # Create a partial model with a fraction of layers, e.g., the first 12 layers 16 | 17 | 18 | # Prepare input 19 | input_text = "To plan the visit to Seattle, you need to " 20 | inputs = tokenizer(input_text, return_tensors="pt") 21 | 22 | print(inputs) -------------------------------------------------------------------------------- /installAnaconda.sh: -------------------------------------------------------------------------------- 1 | wget https://repo.anaconda.com/archive/Anaconda3-2024.02-1-Linux-x86_64.sh 2 | chmod +x ./Anaconda3-2024.02-1-Linux-x86_64.sh 3 | ./Anaconda3-2024.02-1-Linux-x86_64.sh 4 | 5 | echo "export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH" >> ~/.bashrc -------------------------------------------------------------------------------- /modelDownload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | current_dir=$(pwd) 4 | parentdir="$(dirname "$current_dir")" 5 | mkdir -p $parentdir/hf 6 | 7 | export HF_HOME=$parentdir/hf 8 | 9 | huggingface-cli login 10 | 11 | cd groundtruth 12 | python test.py 13 | 14 | cd ../pipeline/utils 15 | python weightSaver.py $parentdir/hf 16 | 17 | -------------------------------------------------------------------------------- /perf.sh: -------------------------------------------------------------------------------- 1 | cd datasets 2 | ./gen.sh 3 | cd .. 4 | cd pipeline/eval 5 | ./run_all.sh 6 | -------------------------------------------------------------------------------- /pipeline/config_all/.gitignore: -------------------------------------------------------------------------------- 1 | !*.json -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 80, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 40 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 42 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 44 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 256, 51 | "kqv3_size": 256 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "PLLM" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 80, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 2048, 49 | "nanobatch_1_size": 640, 50 | "kqv1_size": 256, 51 | "kqv3_size": 768 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "PLLM" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/768.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 80, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor", 40 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 41 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 42 | "64_64_64_32_32_64_2_5_RowMajor_RowMajor_RowMajor", 43 | "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor", 44 | "64_128_64_32_64_64_3_3_RowMajor_RowMajor_RowMajor", 45 | "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 768, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 128, 51 | "kqv3_size": 128 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "PLLM" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/correct_40G/2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 80, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 400, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 2048, 49 | "nanobatch_1_size": 640, 50 | "kqv1_size": 256, 51 | "kqv3_size": 768 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "PLLM" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 80, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 400, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 2048, 49 | "nanobatch_1_size": 640, 50 | "kqv1_size": 256, 51 | "kqv3_size": 768 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "NANOBATCH" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 80, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 400, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 42 | ], 43 | "global_batch_size": 2048, 44 | "nanobatch_1_size": 640, 45 | "kqv1_size": 256, 46 | "kqv3_size": 768 47 | }, 48 | "serve_configs": { 49 | "model": "meta-llama/Llama-2-70b-chat-hf", 50 | "actual_gpu_num": 8, 51 | "weight_path": "./nanoflow_weight/", 52 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 53 | "pipeline_type": "NON_OVERLAP" 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/fewer_layers/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 5, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 40 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 42 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 44 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 256, 51 | "kqv3_size": 256 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "PLLM" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/fewer_layers/2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 5, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 2048, 49 | "nanobatch_1_size": 640, 50 | "kqv1_size": 256, 51 | "kqv3_size": 768 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "PLLM" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/fewer_layers/768.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 5, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor", 40 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 41 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 42 | "64_64_64_32_32_64_2_5_RowMajor_RowMajor_RowMajor", 43 | "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor", 44 | "64_128_64_32_64_64_3_3_RowMajor_RowMajor_RowMajor", 45 | "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 768, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 128, 51 | "kqv3_size": 128 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "PLLM" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 5, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 2048, 49 | "nanobatch_1_size": 640, 50 | "kqv1_size": 256, 51 | "kqv3_size": 768 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "NANOBATCH" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 5, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 2048, 49 | "nanobatch_1_size": 640, 50 | "kqv1_size": 256, 51 | "kqv3_size": 768 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "NON_OVERLAP" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 80, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 2048, 49 | "nanobatch_1_size": 640, 50 | "kqv1_size": 256, 51 | "kqv3_size": 768 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "NANOBATCH" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 80, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 2048, 49 | "nanobatch_1_size": 640, 50 | "kqv1_size": 256, 51 | "kqv3_size": 768 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "NON_OVERLAP" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama2-70B/pllm-offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 8192, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 28672, 12 | "max_position_embeddings": 4096, 13 | "model_type": "llama", 14 | "num_attention_heads": 64, 15 | "num_hidden_layers": 80, 16 | "num_key_value_heads": 8, 17 | "pretraining_tp": 1, 18 | "rms_norm_eps": 1e-05, 19 | "rope_theta": 10000.0, 20 | "rope_scaling": null, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.31.0.dev0", 24 | "use_cache": true, 25 | "vocab_size": 32000 26 | }, 27 | "model_configs": { 28 | "gpu_num": 8, 29 | "run_layer": 80, 30 | "allocate_kv_data_batch": 1480, 31 | "frame_page_size": 16, 32 | "max_batch_size": 2048, 33 | "gpu_mem": 68719476736, 34 | "page_mem_size": 32768 35 | }, 36 | "pipeline_configs": { 37 | "gemm_op_tag": [ 38 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 39 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 2048, 49 | "nanobatch_1_size": 640, 50 | "kqv1_size": 256, 51 | "kqv3_size": 768 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Llama-2-70b-chat-hf", 55 | "actual_gpu_num": 8, 56 | "weight_path": "./nanoflow_weight/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080", 58 | "pipeline_type": "PLLM_OFFLOAD" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 44 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 47 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 48 | "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 1024, 51 | "nanobatch_1_size": 384, 52 | "kqv1_size": 256, 53 | "kqv3_size": 256 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "PLLM" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "PLLM" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/768.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor", 42 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 43 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 44 | "64_64_64_32_32_64_2_5_RowMajor_RowMajor_RowMajor", 45 | "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor", 46 | "64_128_64_32_64_64_3_3_RowMajor_RowMajor_RowMajor", 47 | "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor", 48 | "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 768, 51 | "nanobatch_1_size": 384, 52 | "kqv1_size": 128, 53 | "kqv3_size": 128 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "PLLM" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/correct_40G/2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 400, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "PLLM" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 400, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "NANOBATCH" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 400, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "NON_OVERLAP" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/fewer_layers/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 5, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 44 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 46 | "64_128_64_32_64_64_2_3_RowMajor_RowMajor_RowMajor", 47 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 48 | "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 1024, 51 | "nanobatch_1_size": 384, 52 | "kqv1_size": 256, 53 | "kqv3_size": 256 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "PLLM" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/fewer_layers/2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 5, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "PLLM" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/fewer_layers/768.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 5, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_4_RowMajor_RowMajor_RowMajor", 42 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 43 | "128_256_64_64_64_64_1_3_RowMajor_RowMajor_RowMajor", 44 | "64_64_64_32_32_64_2_5_RowMajor_RowMajor_RowMajor", 45 | "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor", 46 | "64_128_64_32_64_64_3_3_RowMajor_RowMajor_RowMajor", 47 | "64_256_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor", 48 | "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 768, 51 | "nanobatch_1_size": 384, 52 | "kqv1_size": 128, 53 | "kqv3_size": 128 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "PLLM" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 5, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "NANOBATCH" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 5, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "NON_OVERLAP" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "NANOBATCH" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-70B/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 8192, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 28672, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "meta-llama/Meta-Llama-3-70B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_70B_3/", 59 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/1480bb72e06591eb87b0ebe2c8853127f9697bae", 60 | "pipeline_type": "NON_OVERLAP" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-8B/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-8B/correct_40G/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 100, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-8B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 100, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NANOBATCH_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-8B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 100, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NON_OVERLAP_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-8B/fewer_layers/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 4, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 500, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-8B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 4, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NANOBATCH_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-8B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 4, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NON_OVERLAP_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-8B/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NANOBATCH_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3-8B/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 43 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 44 | ], 45 | "global_batch_size": 1024, 46 | "nanobatch_1_size": 384, 47 | "kqv1_size": 384, 48 | "kqv3_size": 640 49 | }, 50 | "serve_configs": { 51 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 52 | "actual_gpu_num": 1, 53 | "weight_path": "./nanoflow_weight_8B/", 54 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 55 | "pipeline_type": "NON_OVERLAP_LOCAL" 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3.1-8B/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": [ 10 | 128001, 11 | 128008, 12 | 128009 13 | ], 14 | "hidden_act": "silu", 15 | "hidden_size": 4096, 16 | "initializer_range": 0.02, 17 | "intermediate_size": 14336, 18 | "max_position_embeddings": 131072, 19 | "mlp_bias": false, 20 | "model_type": "llama", 21 | "num_attention_heads": 32, 22 | "num_hidden_layers": 32, 23 | "num_key_value_heads": 8, 24 | "pretraining_tp": 1, 25 | "rms_norm_eps": 1e-05, 26 | "rope_scaling": { 27 | "factor": 8.0, 28 | "low_freq_factor": 1.0, 29 | "high_freq_factor": 4.0, 30 | "original_max_position_embeddings": 8192, 31 | "rope_type": "llama3" 32 | }, 33 | "rope_theta": 500000.0, 34 | "tie_word_embeddings": false, 35 | "torch_dtype": "bfloat16", 36 | "transformers_version": "4.42.3", 37 | "use_cache": true, 38 | "vocab_size": 128256 39 | }, 40 | "model_configs": { 41 | "gpu_num": 1, 42 | "run_layer": 32, 43 | "allocate_kv_data_batch": 480, 44 | "frame_page_size": 16, 45 | "max_batch_size": 2048, 46 | "gpu_mem": 68719476736, 47 | "page_mem_size": 32768 48 | }, 49 | "pipeline_configs": { 50 | "gemm_op_tag": [ 51 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 52 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 53 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 54 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 55 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 56 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 57 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 58 | ], 59 | "global_batch_size": 1024, 60 | "nanobatch_1_size": 384, 61 | "kqv1_size": 384, 62 | "kqv3_size": 640 63 | }, 64 | "serve_configs": { 65 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 66 | "actual_gpu_num": 1, 67 | "weight_path": "./nanoflow_weight_8B_3_1/", 68 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f", 69 | "pipeline_type": "LOCAL" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3.1-8B/correct_40G/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": [ 10 | 128001, 11 | 128008, 12 | 128009 13 | ], 14 | "hidden_act": "silu", 15 | "hidden_size": 4096, 16 | "initializer_range": 0.02, 17 | "intermediate_size": 14336, 18 | "max_position_embeddings": 131072, 19 | "mlp_bias": false, 20 | "model_type": "llama", 21 | "num_attention_heads": 32, 22 | "num_hidden_layers": 32, 23 | "num_key_value_heads": 8, 24 | "pretraining_tp": 1, 25 | "rms_norm_eps": 1e-05, 26 | "rope_scaling": { 27 | "factor": 8.0, 28 | "low_freq_factor": 1.0, 29 | "high_freq_factor": 4.0, 30 | "original_max_position_embeddings": 8192, 31 | "rope_type": "llama3" 32 | }, 33 | "rope_theta": 500000.0, 34 | "tie_word_embeddings": false, 35 | "torch_dtype": "bfloat16", 36 | "transformers_version": "4.42.3", 37 | "use_cache": true, 38 | "vocab_size": 128256 39 | }, 40 | "model_configs": { 41 | "gpu_num": 1, 42 | "run_layer": 32, 43 | "allocate_kv_data_batch": 100, 44 | "frame_page_size": 16, 45 | "max_batch_size": 2048, 46 | "gpu_mem": 68719476736, 47 | "page_mem_size": 32768 48 | }, 49 | "pipeline_configs": { 50 | "gemm_op_tag": [ 51 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 52 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 53 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 54 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 55 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 56 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 57 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 58 | ], 59 | "global_batch_size": 1024, 60 | "nanobatch_1_size": 384, 61 | "kqv1_size": 384, 62 | "kqv3_size": 640 63 | }, 64 | "serve_configs": { 65 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 66 | "actual_gpu_num": 1, 67 | "weight_path": "./nanoflow_weight_8B_3_1/", 68 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f", 69 | "pipeline_type": "LOCAL" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3.1-8B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": [ 10 | 128001, 11 | 128008, 12 | 128009 13 | ], 14 | "hidden_act": "silu", 15 | "hidden_size": 4096, 16 | "initializer_range": 0.02, 17 | "intermediate_size": 14336, 18 | "max_position_embeddings": 131072, 19 | "mlp_bias": false, 20 | "model_type": "llama", 21 | "num_attention_heads": 32, 22 | "num_hidden_layers": 32, 23 | "num_key_value_heads": 8, 24 | "pretraining_tp": 1, 25 | "rms_norm_eps": 1e-05, 26 | "rope_scaling": { 27 | "factor": 8.0, 28 | "low_freq_factor": 1.0, 29 | "high_freq_factor": 4.0, 30 | "original_max_position_embeddings": 8192, 31 | "rope_type": "llama3" 32 | }, 33 | "rope_theta": 500000.0, 34 | "tie_word_embeddings": false, 35 | "torch_dtype": "bfloat16", 36 | "transformers_version": "4.42.3", 37 | "use_cache": true, 38 | "vocab_size": 128256 39 | }, 40 | "model_configs": { 41 | "gpu_num": 1, 42 | "run_layer": 32, 43 | "allocate_kv_data_batch": 100, 44 | "frame_page_size": 16, 45 | "max_batch_size": 2048, 46 | "gpu_mem": 68719476736, 47 | "page_mem_size": 32768 48 | }, 49 | "pipeline_configs": { 50 | "gemm_op_tag": [ 51 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 52 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 53 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 54 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 55 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 56 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 57 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 58 | ], 59 | "global_batch_size": 1024, 60 | "nanobatch_1_size": 384, 61 | "kqv1_size": 384, 62 | "kqv3_size": 640 63 | }, 64 | "serve_configs": { 65 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 66 | "actual_gpu_num": 1, 67 | "weight_path": "./nanoflow_weight_8B_3_1/", 68 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f", 69 | "pipeline_type": "NANOBATCH_LOCAL" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3.1-8B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": [ 10 | 128001, 11 | 128008, 12 | 128009 13 | ], 14 | "hidden_act": "silu", 15 | "hidden_size": 4096, 16 | "initializer_range": 0.02, 17 | "intermediate_size": 14336, 18 | "max_position_embeddings": 131072, 19 | "mlp_bias": false, 20 | "model_type": "llama", 21 | "num_attention_heads": 32, 22 | "num_hidden_layers": 32, 23 | "num_key_value_heads": 8, 24 | "pretraining_tp": 1, 25 | "rms_norm_eps": 1e-05, 26 | "rope_scaling": { 27 | "factor": 8.0, 28 | "low_freq_factor": 1.0, 29 | "high_freq_factor": 4.0, 30 | "original_max_position_embeddings": 8192, 31 | "rope_type": "llama3" 32 | }, 33 | "rope_theta": 500000.0, 34 | "tie_word_embeddings": false, 35 | "torch_dtype": "bfloat16", 36 | "transformers_version": "4.42.3", 37 | "use_cache": true, 38 | "vocab_size": 128256 39 | }, 40 | "model_configs": { 41 | "gpu_num": 1, 42 | "run_layer": 32, 43 | "allocate_kv_data_batch": 100, 44 | "frame_page_size": 16, 45 | "max_batch_size": 2048, 46 | "gpu_mem": 68719476736, 47 | "page_mem_size": 32768 48 | }, 49 | "pipeline_configs": { 50 | "gemm_op_tag": [ 51 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 52 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 53 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 54 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 55 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 56 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 57 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 58 | ], 59 | "global_batch_size": 1024, 60 | "nanobatch_1_size": 384, 61 | "kqv1_size": 384, 62 | "kqv3_size": 640 63 | }, 64 | "serve_configs": { 65 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 66 | "actual_gpu_num": 1, 67 | "weight_path": "./nanoflow_weight_8B_3_1/", 68 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f", 69 | "pipeline_type": "NON_OVERLAP_LOCAL" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3.1-8B/fewer_layers/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": [ 10 | 128001, 11 | 128008, 12 | 128009 13 | ], 14 | "hidden_act": "silu", 15 | "hidden_size": 4096, 16 | "initializer_range": 0.02, 17 | "intermediate_size": 14336, 18 | "max_position_embeddings": 131072, 19 | "mlp_bias": false, 20 | "model_type": "llama", 21 | "num_attention_heads": 32, 22 | "num_hidden_layers": 4, 23 | "num_key_value_heads": 8, 24 | "pretraining_tp": 1, 25 | "rms_norm_eps": 1e-05, 26 | "rope_scaling": { 27 | "factor": 8.0, 28 | "low_freq_factor": 1.0, 29 | "high_freq_factor": 4.0, 30 | "original_max_position_embeddings": 8192, 31 | "rope_type": "llama3" 32 | }, 33 | "rope_theta": 500000.0, 34 | "tie_word_embeddings": false, 35 | "torch_dtype": "bfloat16", 36 | "transformers_version": "4.42.3", 37 | "use_cache": true, 38 | "vocab_size": 128256 39 | }, 40 | "model_configs": { 41 | "gpu_num": 1, 42 | "run_layer": 32, 43 | "allocate_kv_data_batch": 480, 44 | "frame_page_size": 16, 45 | "max_batch_size": 2048, 46 | "gpu_mem": 68719476736, 47 | "page_mem_size": 32768 48 | }, 49 | "pipeline_configs": { 50 | "gemm_op_tag": [ 51 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 52 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 53 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 54 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 55 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 56 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 57 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 58 | ], 59 | "global_batch_size": 1024, 60 | "nanobatch_1_size": 384, 61 | "kqv1_size": 384, 62 | "kqv3_size": 640 63 | }, 64 | "serve_configs": { 65 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 66 | "actual_gpu_num": 1, 67 | "weight_path": "./nanoflow_weight_8B_3_1/", 68 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f", 69 | "pipeline_type": "LOCAL" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3.1-8B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": [ 10 | 128001, 11 | 128008, 12 | 128009 13 | ], 14 | "hidden_act": "silu", 15 | "hidden_size": 4096, 16 | "initializer_range": 0.02, 17 | "intermediate_size": 14336, 18 | "max_position_embeddings": 131072, 19 | "mlp_bias": false, 20 | "model_type": "llama", 21 | "num_attention_heads": 32, 22 | "num_hidden_layers": 4, 23 | "num_key_value_heads": 8, 24 | "pretraining_tp": 1, 25 | "rms_norm_eps": 1e-05, 26 | "rope_scaling": { 27 | "factor": 8.0, 28 | "low_freq_factor": 1.0, 29 | "high_freq_factor": 4.0, 30 | "original_max_position_embeddings": 8192, 31 | "rope_type": "llama3" 32 | }, 33 | "rope_theta": 500000.0, 34 | "tie_word_embeddings": false, 35 | "torch_dtype": "bfloat16", 36 | "transformers_version": "4.42.3", 37 | "use_cache": true, 38 | "vocab_size": 128256 39 | }, 40 | "model_configs": { 41 | "gpu_num": 1, 42 | "run_layer": 32, 43 | "allocate_kv_data_batch": 480, 44 | "frame_page_size": 16, 45 | "max_batch_size": 2048, 46 | "gpu_mem": 68719476736, 47 | "page_mem_size": 32768 48 | }, 49 | "pipeline_configs": { 50 | "gemm_op_tag": [ 51 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 52 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 53 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 54 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 55 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 56 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 57 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 58 | ], 59 | "global_batch_size": 1024, 60 | "nanobatch_1_size": 384, 61 | "kqv1_size": 384, 62 | "kqv3_size": 640 63 | }, 64 | "serve_configs": { 65 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 66 | "actual_gpu_num": 1, 67 | "weight_path": "./nanoflow_weight_8B_3_1/", 68 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f", 69 | "pipeline_type": "NANOBATCH_LOCAL" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3.1-8B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": [ 10 | 128001, 11 | 128008, 12 | 128009 13 | ], 14 | "hidden_act": "silu", 15 | "hidden_size": 4096, 16 | "initializer_range": 0.02, 17 | "intermediate_size": 14336, 18 | "max_position_embeddings": 131072, 19 | "mlp_bias": false, 20 | "model_type": "llama", 21 | "num_attention_heads": 32, 22 | "num_hidden_layers": 4, 23 | "num_key_value_heads": 8, 24 | "pretraining_tp": 1, 25 | "rms_norm_eps": 1e-05, 26 | "rope_scaling": { 27 | "factor": 8.0, 28 | "low_freq_factor": 1.0, 29 | "high_freq_factor": 4.0, 30 | "original_max_position_embeddings": 8192, 31 | "rope_type": "llama3" 32 | }, 33 | "rope_theta": 500000.0, 34 | "tie_word_embeddings": false, 35 | "torch_dtype": "bfloat16", 36 | "transformers_version": "4.42.3", 37 | "use_cache": true, 38 | "vocab_size": 128256 39 | }, 40 | "model_configs": { 41 | "gpu_num": 1, 42 | "run_layer": 32, 43 | "allocate_kv_data_batch": 480, 44 | "frame_page_size": 16, 45 | "max_batch_size": 2048, 46 | "gpu_mem": 68719476736, 47 | "page_mem_size": 32768 48 | }, 49 | "pipeline_configs": { 50 | "gemm_op_tag": [ 51 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 52 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 53 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 54 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 55 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 56 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 57 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 58 | ], 59 | "global_batch_size": 1024, 60 | "nanobatch_1_size": 384, 61 | "kqv1_size": 384, 62 | "kqv3_size": 640 63 | }, 64 | "serve_configs": { 65 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 66 | "actual_gpu_num": 1, 67 | "weight_path": "./nanoflow_weight_8B_3_1/", 68 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f", 69 | "pipeline_type": "NON_OVERLAP_LOCAL" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3.1-8B/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": [ 10 | 128001, 11 | 128008, 12 | 128009 13 | ], 14 | "hidden_act": "silu", 15 | "hidden_size": 4096, 16 | "initializer_range": 0.02, 17 | "intermediate_size": 14336, 18 | "max_position_embeddings": 131072, 19 | "mlp_bias": false, 20 | "model_type": "llama", 21 | "num_attention_heads": 32, 22 | "num_hidden_layers": 32, 23 | "num_key_value_heads": 8, 24 | "pretraining_tp": 1, 25 | "rms_norm_eps": 1e-05, 26 | "rope_scaling": { 27 | "factor": 8.0, 28 | "low_freq_factor": 1.0, 29 | "high_freq_factor": 4.0, 30 | "original_max_position_embeddings": 8192, 31 | "rope_type": "llama3" 32 | }, 33 | "rope_theta": 500000.0, 34 | "tie_word_embeddings": false, 35 | "torch_dtype": "bfloat16", 36 | "transformers_version": "4.42.3", 37 | "use_cache": true, 38 | "vocab_size": 128256 39 | }, 40 | "model_configs": { 41 | "gpu_num": 1, 42 | "run_layer": 32, 43 | "allocate_kv_data_batch": 480, 44 | "frame_page_size": 16, 45 | "max_batch_size": 2048, 46 | "gpu_mem": 68719476736, 47 | "page_mem_size": 32768 48 | }, 49 | "pipeline_configs": { 50 | "gemm_op_tag": [ 51 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 52 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 53 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 54 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 55 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 56 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 57 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 58 | ], 59 | "global_batch_size": 1024, 60 | "nanobatch_1_size": 384, 61 | "kqv1_size": 384, 62 | "kqv3_size": 640 63 | }, 64 | "serve_configs": { 65 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 66 | "actual_gpu_num": 1, 67 | "weight_path": "./nanoflow_weight_8B_3_1/", 68 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f", 69 | "pipeline_type": "NANOBATCH_LOCAL" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pipeline/config_all/llama3.1-8B/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": [ 10 | 128001, 11 | 128008, 12 | 128009 13 | ], 14 | "hidden_act": "silu", 15 | "hidden_size": 4096, 16 | "initializer_range": 0.02, 17 | "intermediate_size": 14336, 18 | "max_position_embeddings": 131072, 19 | "mlp_bias": false, 20 | "model_type": "llama", 21 | "num_attention_heads": 32, 22 | "num_hidden_layers": 32, 23 | "num_key_value_heads": 8, 24 | "pretraining_tp": 1, 25 | "rms_norm_eps": 1e-05, 26 | "rope_scaling": { 27 | "factor": 8.0, 28 | "low_freq_factor": 1.0, 29 | "high_freq_factor": 4.0, 30 | "original_max_position_embeddings": 8192, 31 | "rope_type": "llama3" 32 | }, 33 | "rope_theta": 500000.0, 34 | "tie_word_embeddings": false, 35 | "torch_dtype": "bfloat16", 36 | "transformers_version": "4.42.3", 37 | "use_cache": true, 38 | "vocab_size": 128256 39 | }, 40 | "model_configs": { 41 | "gpu_num": 1, 42 | "run_layer": 32, 43 | "allocate_kv_data_batch": 480, 44 | "frame_page_size": 16, 45 | "max_batch_size": 2048, 46 | "gpu_mem": 68719476736, 47 | "page_mem_size": 32768 48 | }, 49 | "pipeline_configs": { 50 | "gemm_op_tag": [ 51 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 52 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 53 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 54 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 55 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 56 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 57 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 58 | ], 59 | "global_batch_size": 1024, 60 | "nanobatch_1_size": 384, 61 | "kqv1_size": 384, 62 | "kqv3_size": 640 63 | }, 64 | "serve_configs": { 65 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 66 | "actual_gpu_num": 1, 67 | "weight_path": "./nanoflow_weight_8B_3_1/", 68 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f", 69 | "pipeline_type": "NON_OVERLAP_LOCAL" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pipeline/config_all/mixtral-8-7B/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "MixtralForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 1, 8 | "eos_token_id": 2, 9 | "hidden_act": "silu", 10 | "hidden_size": 4096, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 14336, 13 | "max_position_embeddings": 32768, 14 | "model_type": "mixtral", 15 | "num_attention_heads": 32, 16 | "num_experts_per_tok": 2, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "num_local_experts": 8, 20 | "output_router_logits": false, 21 | "rms_norm_eps": 1e-05, 22 | "rope_theta": 1000000.0, 23 | "router_aux_loss_coef": 0.02, 24 | "sliding_window": null, 25 | "tie_word_embeddings": false, 26 | "torch_dtype": "bfloat16", 27 | "transformers_version": "4.36.0.dev0", 28 | "use_cache": true, 29 | "vocab_size": 32000 30 | }, 31 | "model_configs": { 32 | "gpu_num": 8, 33 | "run_layer": 32, 34 | "allocate_kv_data_batch": 3072, 35 | "frame_page_size": 16, 36 | "max_batch_size": 6144, 37 | "gpu_mem": 68719476736, 38 | "page_mem_size": 32768 39 | }, 40 | "pipeline_configs": { 41 | "gemm_op_tag": [ 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 47 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 4096, 51 | "nanobatch_1_size": 2048, 52 | "kqv1_size": 1024, 53 | "kqv3_size": 1024 54 | }, 55 | "serve_configs": { 56 | "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_mixtral_8-7B/", 59 | "hf_path": "../../../hf/hub/models--mistralai--Mixtral-8x7B-Instruct-v0.1/snapshots/41bd4c9e7e4fb318ca40e721131d4933966c2cc1", 60 | "pipeline_type": "PLLM" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/mixtral-8-7B/correct_40G/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 100, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/mixtral-8-7B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 100, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NANOBATCH_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/mixtral-8-7B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 100, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NON_OVERLAP_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/mixtral-8-7B/fewer_layers/1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 4, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 500, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/mixtral-8-7B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 4, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NANOBATCH_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/mixtral-8-7B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 4, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NON_OVERLAP_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/mixtral-8-7B/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 44 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 46 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 47 | ], 48 | "global_batch_size": 1024, 49 | "nanobatch_1_size": 384, 50 | "kqv1_size": 384, 51 | "kqv3_size": 640 52 | }, 53 | "serve_configs": { 54 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 55 | "actual_gpu_num": 1, 56 | "weight_path": "./nanoflow_weight_8B/", 57 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 58 | "pipeline_type": "NANOBATCH_LOCAL" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pipeline/config_all/mixtral-8-7B/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "attention_bias": false, 7 | "attention_dropout": 0.0, 8 | "bos_token_id": 128000, 9 | "eos_token_id": 128009, 10 | "hidden_act": "silu", 11 | "hidden_size": 4096, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 14336, 14 | "max_position_embeddings": 8192, 15 | "model_type": "llama", 16 | "num_attention_heads": 32, 17 | "num_hidden_layers": 32, 18 | "num_key_value_heads": 8, 19 | "pretraining_tp": 1, 20 | "rms_norm_eps": 1e-05, 21 | "rope_scaling": null, 22 | "rope_theta": 500000.0, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "bfloat16", 25 | "transformers_version": "4.40.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 128256 28 | }, 29 | "model_configs": { 30 | "gpu_num": 1, 31 | "run_layer": 32, 32 | "allocate_kv_data_batch": 480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 43 | "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor" 44 | ], 45 | "global_batch_size": 1024, 46 | "nanobatch_1_size": 384, 47 | "kqv1_size": 384, 48 | "kqv3_size": 640 49 | }, 50 | "serve_configs": { 51 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 52 | "actual_gpu_num": 1, 53 | "weight_path": "./nanoflow_weight_8B/", 54 | "hf_path": "../../../hf/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa", 55 | "pipeline_type": "NON_OVERLAP_LOCAL" 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /pipeline/config_all/qwen2-72B/2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "Qwen2ForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 151643, 8 | "eos_token_id": 151645, 9 | "hidden_act": "silu", 10 | "hidden_size": 8192, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 29568, 13 | "max_position_embeddings": 32768, 14 | "max_window_layers": 80, 15 | "model_type": "qwen2", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "rms_norm_eps": 1e-06, 20 | "rope_theta": 1000000.0, 21 | "sliding_window": 131072, 22 | "tie_word_embeddings": false, 23 | "torch_dtype": "bfloat16", 24 | "transformers_version": "4.40.1", 25 | "use_cache": true, 26 | "use_sliding_window": false, 27 | "vocab_size": 152064 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1382, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "Qwen/Qwen2-72B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_qwen2-72B/", 59 | "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1", 60 | "pipeline_type": "KQVBIAS" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/qwen2-72B/correct_40G/2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "Qwen2ForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 151643, 8 | "eos_token_id": 151645, 9 | "hidden_act": "silu", 10 | "hidden_size": 8192, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 29568, 13 | "max_position_embeddings": 32768, 14 | "max_window_layers": 80, 15 | "model_type": "qwen2", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "rms_norm_eps": 1e-06, 20 | "rope_theta": 1000000.0, 21 | "sliding_window": 131072, 22 | "tie_word_embeddings": false, 23 | "torch_dtype": "bfloat16", 24 | "transformers_version": "4.40.1", 25 | "use_cache": true, 26 | "use_sliding_window": false, 27 | "vocab_size": 152064 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 200, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "Qwen/Qwen2-72B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_qwen2-72B/", 59 | "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1", 60 | "pipeline_type": "KQVBIAS" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/qwen2-72B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "Qwen2ForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 151643, 8 | "eos_token_id": 151645, 9 | "hidden_act": "silu", 10 | "hidden_size": 8192, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 29568, 13 | "max_position_embeddings": 32768, 14 | "max_window_layers": 80, 15 | "model_type": "qwen2", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "rms_norm_eps": 1e-06, 20 | "rope_theta": 1000000.0, 21 | "sliding_window": 131072, 22 | "tie_word_embeddings": false, 23 | "torch_dtype": "bfloat16", 24 | "transformers_version": "4.40.1", 25 | "use_cache": true, 26 | "use_sliding_window": false, 27 | "vocab_size": 152064 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 200, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "Qwen/Qwen2-72B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_qwen2-72B/", 59 | "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1", 60 | "pipeline_type": "NANOBATCH_KQVBIAS" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/qwen2-72B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "Qwen2ForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 151643, 8 | "eos_token_id": 151645, 9 | "hidden_act": "silu", 10 | "hidden_size": 8192, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 29568, 13 | "max_position_embeddings": 32768, 14 | "max_window_layers": 80, 15 | "model_type": "qwen2", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "rms_norm_eps": 1e-06, 20 | "rope_theta": 1000000.0, 21 | "sliding_window": 131072, 22 | "tie_word_embeddings": false, 23 | "torch_dtype": "bfloat16", 24 | "transformers_version": "4.40.1", 25 | "use_cache": true, 26 | "use_sliding_window": false, 27 | "vocab_size": 152064 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 200, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "Qwen/Qwen2-72B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_qwen2-72B/", 59 | "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1", 60 | "pipeline_type": "NONOVERLAP_KQVBIAS" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/qwen2-72B/fewer_layers/2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "Qwen2ForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 151643, 8 | "eos_token_id": 151645, 9 | "hidden_act": "silu", 10 | "hidden_size": 8192, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 29568, 13 | "max_position_embeddings": 32768, 14 | "max_window_layers": 80, 15 | "model_type": "qwen2", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 5, 18 | "num_key_value_heads": 8, 19 | "rms_norm_eps": 1e-06, 20 | "rope_theta": 1000000.0, 21 | "sliding_window": 131072, 22 | "tie_word_embeddings": false, 23 | "torch_dtype": "bfloat16", 24 | "transformers_version": "4.40.1", 25 | "use_cache": true, 26 | "use_sliding_window": false, 27 | "vocab_size": 152064 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "Qwen/Qwen2-72B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_qwen2-72B/", 59 | "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1", 60 | "pipeline_type": "KQVBIAS" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/qwen2-72B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "Qwen2ForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 151643, 8 | "eos_token_id": 151645, 9 | "hidden_act": "silu", 10 | "hidden_size": 8192, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 29568, 13 | "max_position_embeddings": 32768, 14 | "max_window_layers": 80, 15 | "model_type": "qwen2", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 5, 18 | "num_key_value_heads": 8, 19 | "rms_norm_eps": 1e-06, 20 | "rope_theta": 1000000.0, 21 | "sliding_window": 131072, 22 | "tie_word_embeddings": false, 23 | "torch_dtype": "bfloat16", 24 | "transformers_version": "4.40.1", 25 | "use_cache": true, 26 | "use_sliding_window": false, 27 | "vocab_size": 152064 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "Qwen/Qwen2-72B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_qwen2-72B/", 59 | "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1", 60 | "pipeline_type": "NANOBATCH_KQVBIAS" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/qwen2-72B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "Qwen2ForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 151643, 8 | "eos_token_id": 151645, 9 | "hidden_act": "silu", 10 | "hidden_size": 8192, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 29568, 13 | "max_position_embeddings": 32768, 14 | "max_window_layers": 80, 15 | "model_type": "qwen2", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 5, 18 | "num_key_value_heads": 8, 19 | "rms_norm_eps": 1e-06, 20 | "rope_theta": 1000000.0, 21 | "sliding_window": 131072, 22 | "tie_word_embeddings": false, 23 | "torch_dtype": "bfloat16", 24 | "transformers_version": "4.40.1", 25 | "use_cache": true, 26 | "use_sliding_window": false, 27 | "vocab_size": 152064 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1480, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "Qwen/Qwen2-72B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_qwen2-72B/", 59 | "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1", 60 | "pipeline_type": "NONOVERLAP_KQVBIAS" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/qwen2-72B/nanobatch-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "Qwen2ForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 151643, 8 | "eos_token_id": 151645, 9 | "hidden_act": "silu", 10 | "hidden_size": 8192, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 29568, 13 | "max_position_embeddings": 32768, 14 | "max_window_layers": 80, 15 | "model_type": "qwen2", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "rms_norm_eps": 1e-06, 20 | "rope_theta": 1000000.0, 21 | "sliding_window": 131072, 22 | "tie_word_embeddings": false, 23 | "torch_dtype": "bfloat16", 24 | "transformers_version": "4.40.1", 25 | "use_cache": true, 26 | "use_sliding_window": false, 27 | "vocab_size": 152064 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1300, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "Qwen/Qwen2-72B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_qwen2-72B/", 59 | "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1", 60 | "pipeline_type": "NANOBATCH_KQVBIAS" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/config_all/qwen2-72B/non-overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "architectures": [ 4 | "Qwen2ForCausalLM" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 151643, 8 | "eos_token_id": 151645, 9 | "hidden_act": "silu", 10 | "hidden_size": 8192, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 29568, 13 | "max_position_embeddings": 32768, 14 | "max_window_layers": 80, 15 | "model_type": "qwen2", 16 | "num_attention_heads": 64, 17 | "num_hidden_layers": 80, 18 | "num_key_value_heads": 8, 19 | "rms_norm_eps": 1e-06, 20 | "rope_theta": 1000000.0, 21 | "sliding_window": 131072, 22 | "tie_word_embeddings": false, 23 | "torch_dtype": "bfloat16", 24 | "transformers_version": "4.40.1", 25 | "use_cache": true, 26 | "use_sliding_window": false, 27 | "vocab_size": 152064 28 | }, 29 | "model_configs": { 30 | "gpu_num": 8, 31 | "run_layer": 80, 32 | "allocate_kv_data_batch": 1300, 33 | "frame_page_size": 16, 34 | "max_batch_size": 2048, 35 | "gpu_mem": 68719476736, 36 | "page_mem_size": 32768 37 | }, 38 | "pipeline_configs": { 39 | "gemm_op_tag": [ 40 | "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", 41 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 42 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 43 | "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", 44 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 45 | "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", 46 | "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", 47 | "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", 48 | "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" 49 | ], 50 | "global_batch_size": 2048, 51 | "nanobatch_1_size": 640, 52 | "kqv1_size": 256, 53 | "kqv3_size": 768 54 | }, 55 | "serve_configs": { 56 | "model": "Qwen/Qwen2-72B-Instruct", 57 | "actual_gpu_num": 8, 58 | "weight_path": "./nanoflow_weight_qwen2-72B/", 59 | "hf_path": "../../../hf/hub/models--Qwen--Qwen2-72B-Instruct/snapshots/fddbbd7b69a1fd7cf9b659203b37ae3eb89059e1", 60 | "pipeline_type": "NONOVERLAP_KQVBIAS" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pipeline/eval/.gitignore: -------------------------------------------------------------------------------- 1 | *.ret 2 | *.log 3 | */results/* -------------------------------------------------------------------------------- /pipeline/eval/clean_all.sh: -------------------------------------------------------------------------------- 1 | current_dir=$(pwd) 2 | 3 | # get all folders start with "eval-" in the current directory 4 | 5 | eval_folders=$(find . -maxdepth 1 -type d -name "eval-*") 6 | 7 | for eval_folder in ${eval_folders[@]}; do 8 | echo "Running $eval_folder" 9 | cd $eval_folder 10 | 11 | ./clean.sh 12 | 13 | cd $current_dir 14 | done -------------------------------------------------------------------------------- /pipeline/eval/eval-ablation/clean.sh: -------------------------------------------------------------------------------- 1 | rm -r ./results/* -------------------------------------------------------------------------------- /pipeline/eval/eval-fix-offline/clean.sh: -------------------------------------------------------------------------------- 1 | rm -r ./results/* -------------------------------------------------------------------------------- /pipeline/eval/eval-fix-offline/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import argparse 5 | 6 | def main(): 7 | # Ensure the script is called with the correct number of arguments 8 | arg_parser = argparse.ArgumentParser() 9 | arg_parser.add_argument("--trace_base", type=str, default="../../../datasets/traces", help="The base directory containing the traces.") 10 | arg_parser.add_argument("--executor_base", type=str, default="../../utils", help="The base directory containing the executor.") 11 | arg_parse = arg_parser.parse_args() 12 | 13 | current_dir = os.getcwd() 14 | trace_base = os.path.abspath(arg_parse.trace_base) 15 | executor_base = arg_parse.executor_base 16 | result_base = os.path.join(current_dir, "results") 17 | 18 | fix_trace = os.path.join(trace_base, "fixed") 19 | 20 | # Create the result directory if it doesn't exist 21 | os.makedirs(result_base, exist_ok=True) 22 | 23 | # Loop through each trace file in the fixed trace directory 24 | for trace in os.listdir(fix_trace): 25 | trace_path = os.path.join(fix_trace, trace) 26 | print(f"Running offline throughput experiment trace: {trace_path}") 27 | 28 | base_trace_name = os.path.splitext(trace)[0] # Get the base name without extension 29 | parts = base_trace_name.split('-') # Split by '-' 30 | input_len = parts[0] 31 | output_len = parts[1] 32 | rate = parts[2] 33 | 34 | log_file = os.path.join(result_base, f"{input_len}-{output_len}-{rate}.log") 35 | result_file = os.path.join(result_base, f"{input_len}-{output_len}-{rate}.stat.csv") 36 | # Check if the output file already exists 37 | if os.path.isfile(result_file): 38 | print(f"Offline throughput experiment input_len: {input_len}, output_len: {output_len}, rate: {rate} already exists. Skipping...") 39 | continue 40 | 41 | # Construct the command 42 | command = [ 43 | "python", "serve_8B.py", 44 | f"--config_path=../config_all/llama2-70B/2048.json", 45 | f"--trace_path={trace_path}", 46 | f"--output_prefix={os.path.join(result_base, f'{input_len}-{output_len}-{rate}')}", 47 | f"--skip_cycles={2000 if int(output_len) < 10 else 10000}", 48 | f"--empty_weight=True", 49 | f"--run_cycles=1500" 50 | ] 51 | 52 | # Execute the command and log the output 53 | with open(log_file, "w") as log: 54 | subprocess.run(command, cwd=executor_base, stdout=log, stderr=log) 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /pipeline/eval/eval-real-offline/clean.sh: -------------------------------------------------------------------------------- 1 | rm -r ./results/* -------------------------------------------------------------------------------- /pipeline/eval/eval-real-online-1024/clean.sh: -------------------------------------------------------------------------------- 1 | rm -r ./results/* -------------------------------------------------------------------------------- /pipeline/eval/eval-real-online-2048/auto_datapoint.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | def f(x): 4 | return x**2 +10 # Example function, replace with your actual function 5 | 6 | def binary_search_f(threshold): 7 | low = 1 8 | high = 50 9 | f0 = f(0) 10 | intermediate_points = [] 11 | 12 | while low <= high: 13 | mid = (low + high) // 2 14 | f_mid = f(mid) 15 | intermediate_points.append((mid, f_mid)) 16 | 17 | if f_mid - f0 > threshold: 18 | high = mid - 1 19 | else: 20 | low = mid + 1 21 | 22 | return low, intermediate_points 23 | 24 | def evaluate_larger_x(starting_x, step_size, threshold): 25 | x = starting_x 26 | f0 = f(0) 27 | larger_x_points = [] 28 | 29 | while f(x) <= 4 * f0: 30 | f_x = f(x) 31 | larger_x_points.append((x, f_x)) 32 | x += step_size 33 | 34 | return larger_x_points 35 | 36 | threshold = 10 37 | result, intermediate_points = binary_search_f(threshold) 38 | 39 | # Calculate the step size as the range of intermediate points divided by 5 40 | step_size = (max(intermediate_points, key=lambda point: point[0])[0] - min(intermediate_points, key=lambda point: point[0])[0]) // 5 41 | 42 | # Evaluate larger x values until f(x) > 4 * f(0) 43 | larger_x_points = evaluate_larger_x(result, step_size, 4 * f(0)) 44 | 45 | # Save intermediate points and larger x points to dataframes 46 | df_intermediate = pd.DataFrame(intermediate_points, columns=['x', 'f(x)']) 47 | df_larger_x = pd.DataFrame(larger_x_points, columns=['x', 'f(x)']) 48 | 49 | # Save the dataframes to CSV files 50 | df_intermediate.to_csv('intermediate_points.csv', index=False) 51 | df_larger_x.to_csv('larger_x_points.csv', index=False) 52 | 53 | # Print the results 54 | print(f"The first point where f(x) - f(0) > {threshold} is at x = {result}") 55 | print("Intermediate points:") 56 | print(df_intermediate) 57 | print("Larger x points:") 58 | print(df_larger_x) 59 | -------------------------------------------------------------------------------- /pipeline/eval/eval-real-online-2048/clean.sh: -------------------------------------------------------------------------------- 1 | rm -r ./results/* -------------------------------------------------------------------------------- /pipeline/eval/eval-real-online-768/clean.sh: -------------------------------------------------------------------------------- 1 | rm -r ./results/* -------------------------------------------------------------------------------- /pipeline/eval/eval_output_example.py: -------------------------------------------------------------------------------- 1 | import json 2 | example_json = { 3 | # offline throughput with const input and output len 4 | "offline_throughput_const": { 5 | "512-512": 1036, 6 | "1024-512": 1055, 7 | "512-1024": 1043, 8 | }, 9 | # offline throughput with input and output len from real trace 10 | "offline_throughput_real_trace": { 11 | "Splitwise": 1048, 12 | "LMSYS-Chat": 990, 13 | "ShareGPT": 936, 14 | }, 15 | # online throughput 16 | "online_throughput": { 17 | "Splitwise": { 18 | "request_rate": [5, 10, 15, 20], 19 | "normalized_latency": [0.2, 0.4, 0.6, 0.8], 20 | }, 21 | "LMSYS-Chat": { 22 | "request_rate": [5, 10, 15, 20], 23 | "normalized_latency": [0.2, 0.4, 0.6, 0.8], 24 | }, 25 | "ShareGPT": { 26 | "request_rate": [5, 10, 15, 20], 27 | "normalized_latency": [0.2, 0.4, 0.6, 0.8], 28 | } 29 | } 30 | } 31 | 32 | print(json.dumps(example_json, indent=4)) -------------------------------------------------------------------------------- /pipeline/eval/merge_results.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import os 4 | import json 5 | import sys 6 | 7 | # read first argument as resdir 8 | resdir = sys.argv[1] 9 | 10 | json_res = { 11 | # offline throughput with const input and output len 12 | "offline_throughput_const": { 13 | "512-512": 0, 14 | "1024-512": 0, 15 | "512-1024": 0, 16 | }, 17 | # offline throughput with input and output len from real trace 18 | "offline_throughput_real_trace": { 19 | "splitwise": 0, 20 | "lmsys": 0, 21 | "sharegpt": 0, 22 | }, 23 | # online throughput 24 | "online_throughput": { 25 | "splitwise": { 26 | "request_rate": [], 27 | "normalized_latency": [], 28 | }, 29 | "lmsys": { 30 | "request_rate": [], 31 | "normalized_latency": [], 32 | }, 33 | "sharegpt": { 34 | "request_rate": [], 35 | "normalized_latency": [], 36 | } 37 | } 38 | } 39 | 40 | 41 | # column format: total_time,token_per_second,token_per_second_per_gpu,total_cycle,cycle_time,average_ttft,average_tpot,average_normalize_latency 42 | 43 | for name in ["512-512", "1024-512", "512-1024"]: 44 | df = pd.read_csv(f"{resdir}/offline-{name}.csv") 45 | # get token_per_second_per_gpu 46 | json_res["offline_throughput_const"][name] = df["token_per_second_per_gpu"][0].astype(float) 47 | 48 | for name in ["splitwise", "lmsys", "sharegpt"]: 49 | df = pd.read_csv(f"{resdir}/offline-{name}.csv") 50 | # get token_per_second_per_gpu 51 | json_res["offline_throughput_real_trace"][name] = df["token_per_second_per_gpu"][0].astype(float) 52 | 53 | for name in ["splitwise", "lmsys", "sharegpt"]: 54 | # list all files start with online-{name}- and end with .csv 55 | files = [f for f in os.listdir(resdir) if f.startswith(f"online-{name}-") and f.endswith(".csv")] 56 | for file in files: 57 | df = pd.read_csv(f"{resdir}/{file}") 58 | request_rate = int(file.split("-")[2].split(".")[0]) 59 | json_res["online_throughput"][name]["request_rate"].append(request_rate) 60 | json_res["online_throughput"][name]["normalized_latency"].append(df["average_normalize_latency"][0]) 61 | 62 | print(json_res) 63 | # save json_res to a file 64 | json.dump(json_res, open(f"{resdir}/pllm.json", 'w')) 65 | -------------------------------------------------------------------------------- /pipeline/eval/run_all.sh: -------------------------------------------------------------------------------- 1 | current_dir=$(pwd) 2 | 3 | # get all folders start with "eval-" in the current directory 4 | 5 | cd ../../datasets/ 6 | ./gen.sh 7 | cd $current_dir 8 | 9 | eval_folders=$(find . -maxdepth 1 -type d -name "eval-*") 10 | 11 | for eval_folder in ${eval_folders[@]}; do 12 | echo "Running $eval_folder" 13 | cd $eval_folder 14 | 15 | python run.py --trace_base ../../../datasets/traces --executor_base ../../utils/ 16 | 17 | cd $current_dir 18 | done 19 | 20 | python baseline_data.py . 21 | python summarize.py 22 | python plot_all.py -------------------------------------------------------------------------------- /pipeline/include/computeBound.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | #include "gemmShape.cuh" 8 | #include "pipeline.h" 9 | #include "networkManager.cuh" 10 | #include "tensorManager.cuh" 11 | 12 | 13 | #include "config.h" 14 | #include "vortexData.cuh" 15 | 16 | extern std::vector outputs; 17 | extern std::vector aggregated_output; 18 | 19 | 20 | class Worker { 21 | public: 22 | enum class PipelineType 23 | { 24 | PLLM, 25 | NONOVERLAP, 26 | NANOBATCH, 27 | PLLMOFFLOAD, 28 | KQVBIAS, 29 | NONOVERLAP_KQVBIAS, 30 | NANOBATCH_KQVBIAS, 31 | LOCAL, 32 | NON_OVERLAP_LOCAL, 33 | NANOBATCH_LOCAL, 34 | }; 35 | static PipelineType PipeTy; 36 | 37 | private: 38 | int rank; 39 | int nranks; 40 | int vnranks; 41 | vortexInitData* input; 42 | vortexOutputData* output; 43 | std::unique_ptr pipeline; 44 | std::unique_ptr thread; 45 | void thread_entry(); 46 | 47 | public: 48 | Worker(int rank, 49 | int nranks, 50 | int vnranks, 51 | vortexInitData* input, 52 | vortexOutputData* output) 53 | : rank(rank) 54 | , nranks(nranks) 55 | , vnranks(vnranks) 56 | , input(input) 57 | , output(output) { } 58 | void init(); 59 | void as_thread(int core); 60 | void join() { if (thread) thread->join(); } 61 | void run_pipeline(); 62 | void run_update(vortexUpdateData* update_data) { 63 | vortexUpdateData& gpu_update_data = TensorManager::getInstance().update_data_to_gpu(*update_data, rank); 64 | pipeline->update(&gpu_update_data); 65 | } 66 | void run_config(vortexConfigData* config_data) { 67 | pipeline->config(config_data); 68 | } 69 | vortexOutputData* getOutput() { 70 | return output; 71 | } 72 | ~Worker() { if (thread) thread->join(); } 73 | }; 74 | 75 | 76 | void run(); 77 | // vnranks >= nranks. virtualized ranks will not touch any GPU resources, but will take use random data to participate in collective communication buffers. 78 | void init(int nranks, int vnranks, std::vector& input, std::vector& output, Worker::PipelineType pipeTy = Worker::PipelineType::PLLM); 79 | inline void init(int nranks, std::vector& input, std::vector& output, Worker::PipelineType pipeTy = Worker::PipelineType::PLLM) { 80 | init(nranks, nranks, input, output, pipeTy); 81 | } 82 | 83 | void update(int nranks, std::vector& update); 84 | void finalize(); 85 | void run_async(); 86 | void run_async_wait(); 87 | void config(int nranks, std::vector& config); -------------------------------------------------------------------------------- /pipeline/include/cutlassGemmWrapper.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cutlassGemmWrapperImpl.cuh" 4 | using ColumnMajor = cutlass::layout::ColumnMajor; 5 | using RowMajor = cutlass::layout::RowMajor; -------------------------------------------------------------------------------- /pipeline/include/eventManager.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "helper.h" 4 | #include 5 | 6 | class EventManager { 7 | public: 8 | enum EVENT_NAME 9 | { 10 | GEMM_TIMING_START = 0, 11 | GEMV_TIMING_START, 12 | NET_TIMING_START, 13 | GEMV_TIMING_END, 14 | GEMM_TIMING_END, 15 | GEMM_TIMING_JOIN, 16 | O1_FINISH, 17 | AG_O1_START, 18 | AG_O1_FINISH, 19 | O2_FINISH, 20 | AG_O2_FINISH, 21 | AG_O2_START, 22 | UG1_FINISH, 23 | D1_FINISH, 24 | UG2_FINISH, 25 | D2_FINISH, 26 | AG_D1_FINISH, 27 | KQV1_FINISH, 28 | KQV1_ROPE_START, 29 | KQV2_FINISH, 30 | KQV2_ROPE_START, 31 | KQV3_FINISH, 32 | KQV3_ROPE_START, 33 | KQV4_FINISH, 34 | KQV4_ROPE_START, 35 | GEMV1_FINISH, 36 | GEMV2_FINISH, 37 | GEMV3_FINISH, 38 | GEMV4_FINISH, 39 | AR1_FINISH, 40 | AR2_FINISH, 41 | AG1_GEMV_FINISH, 42 | AG2_GEMV_FINISH, 43 | CAPTURE_GEMM_START, 44 | CAPTURE_GEMV_END, 45 | CAPTURE_NET_END, 46 | LN_MODEL1_FINISH, 47 | LN_MODEL2_FINISH, 48 | LOGITS1_FINISH, 49 | LOGITS2_FINISH, 50 | AG_LOGITS1_FINISH, 51 | AG_LOGITS2_FINISH, 52 | NUM 53 | }; 54 | constexpr static int NUM_EVENTS = NUM + 1; 55 | std::vector events; 56 | 57 | EventManager() 58 | : events(NUM_EVENTS) { 59 | for(int i = 0; i < NUM_EVENTS; i++) { 60 | CUDA_CHECK(cudaEventCreate(&events[i])); 61 | } 62 | } 63 | 64 | ~EventManager() { 65 | for(int i = 0; i < NUM_EVENTS; i++) { 66 | CUDA_CHECK(cudaEventDestroy(events[i])); 67 | } 68 | } 69 | }; -------------------------------------------------------------------------------- /pipeline/include/gemmFactory.cuh: -------------------------------------------------------------------------------- 1 | #include "cutlassGemmWrapper.cuh" 2 | #include 3 | BaseGEMMWrapper* generateGEMM(std::string tag); 4 | -------------------------------------------------------------------------------- /pipeline/include/gemmShape.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cutlassGemmWrapper.cuh" 3 | #include 4 | 5 | // The canonical name for cutlassGemmWrapper template parameters: 6 | // cta_m, cta_n, cta_k, warp_m, warp_n, warp_k, split_k, stages, A_major, B_major, O_major 7 | // static constexpr std::array gemmConfig = { 8 | // "128_128_32_64_64_32_3_5_RowMajor_RowMajor_RowMajor", // O1 9 | // "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", // O2 10 | // "128_128_32_64_64_32_3_4_RowMajor_RowMajor_RowMajor", // UG1 11 | // "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", // D1 12 | // "128_256_32_64_64_32_1_3_RowMajor_RowMajor_RowMajor", // UG2 13 | // "128_128_32_64_64_32_1_4_RowMajor_RowMajor_RowMajor", // D2 14 | // "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", // KQV1 15 | // "128_128_32_64_64_32_2_5_RowMajor_RowMajor_RowMajor", // KQV2 16 | // "128_128_32_64_64_32_1_5_RowMajor_RowMajor_RowMajor", // KQV3 17 | // "128_64_64_64_32_64_2_3_RowMajor_RowMajor_RowMajor", // KQV4 18 | // "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor", // LOGITS1 19 | // "128_256_32_64_64_32_2_3_RowMajor_RowMajor_RowMajor" // LOGITS2 20 | // }; 21 | 22 | // enum class GEMM_NAME { 23 | // O1=0, 24 | // O2, 25 | // UG1, 26 | // D1, 27 | // UG2, 28 | // D2, 29 | // KQV1, 30 | // KQV2, 31 | // KQV3, 32 | // KQV4, 33 | // LOGITS1, 34 | // LOGITS2, 35 | // NUM 36 | // }; 37 | 38 | // constexpr int gemmNum = static_cast(GEMM_NAME::NUM); 39 | // constexpr int gemvNum = 4; -------------------------------------------------------------------------------- /pipeline/include/gemvConfig.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | // constexpr int GEMV_BATCH_SIZE[] = {0,0,0,0}; 5 | // constexpr int GEMV_BLOCK_NUM[] = {10,10,10,10}; -------------------------------------------------------------------------------- /pipeline/include/gemvDependency.cuh: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | 3 | #include 4 | 5 | 6 | __global__ void setReadyKernel(int * flag, int batch); 7 | 8 | __global__ void clearReadyKernel(int * flag); 9 | 10 | __global__ void waitReadyKernel(int * flag, int desired_batch); 11 | 12 | class gemvDependency { 13 | public: 14 | int * device_KQV_ready; 15 | int * device_GEMV_ready; 16 | // Constructor 17 | gemvDependency() { 18 | cudaMalloc(&device_KQV_ready, sizeof(int)); 19 | cudaMalloc(&device_GEMV_ready, sizeof(int)); 20 | cudaMemset(device_KQV_ready, 0, sizeof(int)); 21 | cudaMemset(device_GEMV_ready, 0, sizeof(int)); 22 | } 23 | 24 | // Destructor 25 | ~gemvDependency() { 26 | cudaFree(device_KQV_ready); 27 | cudaFree(device_GEMV_ready); 28 | } 29 | 30 | // Method to clear all flags 31 | void clearAll(cudaStream_t stream) { 32 | clearReadyKernel<<<1, 1, 0, stream>>>(device_KQV_ready); 33 | clearReadyKernel<<<1, 1, 0, stream>>>(device_GEMV_ready); 34 | } 35 | 36 | void incCounter(int* counter, int num, cudaStream_t stream) { 37 | setReadyKernel<<<1, 1, 0, stream>>>(counter, num); 38 | } 39 | 40 | // Method to block until GEMV is ready 41 | void blockUntilGEMVReady(cudaStream_t stream, int desired_batch) { 42 | waitReadyKernel<<<1, 1, 0, stream>>>(device_GEMV_ready, desired_batch); 43 | } 44 | 45 | private: 46 | // Disallow copying and assignment 47 | gemvDependency(const gemvDependency&) = delete; 48 | gemvDependency& operator=(const gemvDependency&) = delete; 49 | 50 | 51 | }; -------------------------------------------------------------------------------- /pipeline/include/networkManager.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "comm.h" 4 | #include "config.h" 5 | #include "vortexData.cuh" 6 | #include 7 | 8 | class SimpleThreadSync { 9 | // implement simple thread synchronization methods for c++ std::thread 10 | std::mutex mtx; 11 | std::condition_variable cv; 12 | int count; 13 | int total; 14 | 15 | public: 16 | SimpleThreadSync(int total) 17 | : total(total) 18 | , count(0) { } 19 | void barrier() { 20 | std::unique_lock lck(mtx); 21 | count++; 22 | if(count == total) { 23 | count = 0; 24 | cv.notify_all(); 25 | } else { 26 | cv.wait(lck); 27 | } 28 | } 29 | 30 | std::mutex& getMutex() { 31 | return mtx; 32 | } 33 | }; 34 | 35 | enum class WorkerOp 36 | { 37 | STOP, 38 | RUN, 39 | UPDATE, 40 | CONFIG, 41 | }; 42 | 43 | struct SharedState { 44 | mscclpp::UniqueId uniqueId; 45 | WorkerOp op; 46 | std::vector * updates_ptr; 47 | std::vector * config_ptr; 48 | }; 49 | 50 | #ifdef ENABLE_MPI 51 | class NetworkManager 52 | { 53 | public: 54 | int nranks; 55 | int rank; 56 | bool initialized; 57 | void init(int argc, char** argv); 58 | void finalize(); 59 | 60 | }; 61 | 62 | extern std::shared_ptr netmgr; 63 | #else 64 | extern SharedState shared_state; 65 | // sync between nranks workers 66 | extern std::unique_ptr worker_sync; 67 | // sync between nranks workers and the management/main thread 68 | extern std::unique_ptr global_sync; 69 | #endif 70 | 71 | -------------------------------------------------------------------------------- /pipeline/include/offloadKernel.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "cutlass/cutlass.h" 4 | #include 5 | #include "cuda_fp16.h" 6 | #include "config.h" 7 | 8 | 9 | __global__ void moveKVcacheKernel(int finished_req_num, int32_t * finished_index, 10 | int32_t* kv_indptr, int32_t* kv_indices, half* output, half* kv_data, int page_mem_size, bool host_to_gpu = true); -------------------------------------------------------------------------------- /pipeline/include/sleep.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | __global__ void cudaSleep(int us); -------------------------------------------------------------------------------- /pipeline/src/gemvDependency.cu: -------------------------------------------------------------------------------- 1 | #include "gemvDependency.cuh" 2 | 3 | __global__ void setReadyKernel(int * flag, int batch) { 4 | // use atomic add 5 | atomicAdd(flag, batch); 6 | } 7 | 8 | __global__ void clearReadyKernel(int * flag) { 9 | // set to 0 10 | int k = atomicExch(flag, 0); 11 | // printf("clearReadyKernel: %d\n", k); 12 | } 13 | 14 | __global__ void waitReadyKernel(int * flag, int desired_batch) { 15 | int t =0; 16 | t = atomicAdd(flag, 0); 17 | while (t < desired_batch) { 18 | // printf("batch %d is not ready\n", t); 19 | t = atomicAdd(flag, 0); 20 | } 21 | // printf("batch %d is ready\n", t); 22 | } -------------------------------------------------------------------------------- /pipeline/src/generate-gemm/.gitignore: -------------------------------------------------------------------------------- 1 | *.gen 2 | *.cu 3 | *.cuh 4 | -------------------------------------------------------------------------------- /pipeline/src/generate-gemm/Makefile: -------------------------------------------------------------------------------- 1 | GENERATED := gemmFactory.cu cutlassGemmExternDeclearation.cuh 2 | all: $(GENERATED) 3 | $(GENERATED): genGEMM.py gemmFactory.in 4 | rm -f *.cu *.cuh 5 | python3 genGEMM.py 6 | -------------------------------------------------------------------------------- /pipeline/src/generate-gemm/gemmFactory.in: -------------------------------------------------------------------------------- 1 | #include "gemmFactory.cuh" 2 | 3 | #include 4 | 5 | #include "cutlassGemmExternDeclearation.cuh" 6 | 7 | std::unordered_map GEMMGenMap = { 8 | @map_entries@ 9 | }; 10 | BaseGEMMWrapper* generateGEMM(std::string tag) { 11 | auto it = GEMMGenMap.find(tag); 12 | if (it == GEMMGenMap.end()) 13 | return nullptr; 14 | else 15 | return it->second(); 16 | } 17 | -------------------------------------------------------------------------------- /pipeline/src/networkManager.cu: -------------------------------------------------------------------------------- 1 | #include "networkManager.cuh" 2 | #include "config.h" 3 | #include 4 | 5 | #ifdef ENABLE_MPI 6 | std::shared_ptr netmgr; 7 | 8 | void NetworkManager::init(int argc, char** argv) { 9 | #ifdef ENABLE_NETWORK 10 | // Initialize the MPI environment 11 | MPI_Init(&argc, &argv); 12 | // Get the number of processes 13 | MPI_Comm_size(MPI_COMM_WORLD, &nranks); 14 | // Get the rank of the process 15 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 16 | 17 | spdlog::info("Hello world from rank {} out of {} ranks", rank, nranks); 18 | 19 | // Print off a hello world message 20 | std::cout << "Hello world from rank " << rank << " out of " << nranks << " ranks" << std::endl; 21 | 22 | initialized = true; 23 | #endif 24 | } 25 | 26 | void NetworkManager::finalize() { 27 | #ifdef ENABLE_NETWORK 28 | if (initialized) { 29 | MPI_Finalize(); 30 | } 31 | #endif 32 | } 33 | #else // ENABLE_MPI 34 | 35 | SharedState shared_state; 36 | std::unique_ptr worker_sync; 37 | std::unique_ptr global_sync; 38 | 39 | #endif // ENABLE_MPI 40 | -------------------------------------------------------------------------------- /pipeline/src/offloadKernel.cu: -------------------------------------------------------------------------------- 1 | #include "offloadKernel.cuh" 2 | #include 3 | 4 | __device__ void pageCopy(half* input, half* output, int page_mem_size){ 5 | 6 | int copyIter = page_mem_size / sizeof(float4) / blockDim.x; 7 | // printf("copyIter: %d\n", copyIter); 8 | float4* input4 = (float4*)input; 9 | float4* output4 = (float4*)output; 10 | 11 | for (int i = 0; i < copyIter; i++){ 12 | output4[i * blockDim.x + threadIdx.x] = input4[i * blockDim.x + threadIdx.x]; 13 | } 14 | } 15 | 16 | __global__ void moveKVcacheKernel(int finished_req_num, int32_t * finished_index, 17 | int32_t* kv_indptr, int32_t* kv_indices, half* host_ptr, half* kv_data, int page_mem_size, bool host_to_gpu){ 18 | page_mem_size /= sizeof(half); 19 | for (int i = 0; i < finished_req_num; i++){ 20 | int idx = finished_index[i]; 21 | int start = kv_indptr[idx]; 22 | int end = kv_indptr[idx + 1]; 23 | 24 | for (int j = start + blockIdx.x; j < end; j += gridDim.x){ 25 | int page_idx = kv_indices[j]; 26 | half* page = kv_data + page_idx * page_mem_size; 27 | half* host_page = host_ptr + j * page_mem_size; 28 | // printf("page_idx: %d\n", page_idx); 29 | if (host_to_gpu) 30 | pageCopy(host_page, page, page_mem_size); 31 | else 32 | pageCopy(page, host_page, page_mem_size); 33 | } 34 | } 35 | } -------------------------------------------------------------------------------- /pipeline/src/run.sh: -------------------------------------------------------------------------------- 1 | mpirun -np 4 --allow-run-as-root sh -c "nsys profile --force-overwrite true --stat=true -o output_\$OMPI_COMM_WORLD_RANK.prof ../build/test_compute > output_\$OMPI_COMM_WORLD_RANK.txt 2>error_\$OMPI_COMM_WORLD_RANK.txt" -------------------------------------------------------------------------------- /pipeline/src/sleep.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __global__ void cudaSleep(int us) { 6 | auto start = cuda::std::chrono::high_resolution_clock::now(); 7 | while (cuda::std::chrono::duration_cast(cuda::std::chrono::high_resolution_clock::now() - start).count() < us); 8 | { 9 | 10 | } 11 | } -------------------------------------------------------------------------------- /pipeline/src/tensorLogger.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "spdlog/spdlog.h" 4 | #include "tensor.cuh" 5 | #include "tensorLogger.cuh" 6 | #include 7 | 8 | 9 | -------------------------------------------------------------------------------- /pipeline/src/test_dual.cu: -------------------------------------------------------------------------------- 1 | #include "dualWrapper.cuh" 2 | #include 3 | #include "spdlog/sinks/basic_file_sink.h" 4 | #include 5 | 6 | int main() { 7 | DualWrapper<128, 128, 32, 64, 64, 32, 1, 5, cutlass::layout::RowMajor, cutlass::layout::RowMajor, cutlass::layout::RowMajor> dw; 8 | int M = 128; 9 | int N = 128; 10 | int K = 128; 11 | 12 | dw.set_shape(M, N, K); 13 | cutlass::half_t *host_tensors[7]; 14 | for (int i = 0; i < 7; i++) { 15 | host_tensors[i] = new cutlass::half_t[M*N]; 16 | for (int j = 0; j < M; j++) { 17 | for (int k = 0; k < N; k++) { 18 | host_tensors[i][j*N+k] = cutlass::half_t(float((j+k))/20/128); 19 | 20 | } 21 | } 22 | } 23 | 24 | cutlass::half_t *device_tensors[7]; 25 | for (int i = 0; i < 7; i++) { 26 | cudaMalloc(&device_tensors[i], M*N*sizeof(cutlass::half_t)); 27 | cudaMemcpy(device_tensors[i], host_tensors[i], M*N*sizeof(cutlass::half_t), cudaMemcpyHostToDevice); 28 | } 29 | vortexWeight b1, b2; 30 | b1.ptr = (half* )device_tensors[1]; 31 | b1.N = N; 32 | b1.K = K; 33 | b2.ptr = (half* )device_tensors[2]; 34 | b2.N = N; 35 | b2.K = K; 36 | 37 | dw.set_weight(b1,b2); 38 | 39 | 40 | pllmTensor a = pllmTensor(device_tensors[0], M, K, PllmLayout::ROW_MAJOR); 41 | pllmTensor c = pllmTensor(device_tensors[3], M, N, PllmLayout::ROW_MAJOR); 42 | pllmTensor d0 = pllmTensor(device_tensors[4], M, N, PllmLayout::ROW_MAJOR); 43 | pllmTensor d1 = pllmTensor(device_tensors[5], M, N, PllmLayout::ROW_MAJOR); 44 | pllmTensor d2 = pllmTensor(device_tensors[6], M, N, PllmLayout::ROW_MAJOR); 45 | 46 | dw.setA(a); 47 | dw.setC(c); 48 | dw.setD(d0, d1, d2); 49 | dw.init(); 50 | dw.set_weight(b1,b2); 51 | dw.setStream(0); 52 | std::string private_file_name = "dual.txt"; 53 | auto private_logger = spdlog::basic_logger_mt("private_logger", private_file_name, true); 54 | dw.run().log(private_logger); 55 | cudaDeviceSynchronize(); 56 | 57 | // copy back d0, d1, d2 58 | for (int i = 4; i < 7; i++) { 59 | cudaMemcpy(host_tensors[i], device_tensors[i], M*N*sizeof(cutlass::half_t), cudaMemcpyDeviceToHost); 60 | } 61 | 62 | for (int i = 4; i < 7; i++) { 63 | for (int j = 0; j < M; j++) { 64 | for (int k = 0; k < N; k++) { 65 | std::cout << host_tensors[i][j*N+k] << " "; 66 | } 67 | std::cout << std::endl; 68 | } 69 | std::cout << std::endl; 70 | } 71 | 72 | return 0; 73 | } -------------------------------------------------------------------------------- /pipeline/utils/.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | *.log 3 | *.csv 4 | *.req_* 5 | *.schedule 6 | -------------------------------------------------------------------------------- /pipeline/utils/gen_req.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | prompt = sys.argv[1] 4 | decode_len = int(sys.argv[2]) 5 | request_rate = int(sys.argv[3]) 6 | output_name = sys.argv[4] 7 | prefill_len = -1 8 | 9 | # get first word of prompt 10 | first_word = prompt.split(' ')[0] 11 | 12 | if request_rate == 0: 13 | request_interval = 0 14 | else: 15 | request_interval = 1 / request_rate 16 | 17 | with open(f"{output_name}", "w") as f: 18 | for i in range(100000): 19 | f.write(f"{i},{prefill_len},{decode_len},{request_interval*i}, {prompt}\n") -------------------------------------------------------------------------------- /pipeline/utils/listToCSV.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | import argparse 4 | 5 | def list_to_csv(input_file, output_file): 6 | with open(input_file, 'r') as f: 7 | data = f.read() 8 | 9 | # Use regex to split data into cycles (both regular and skip cycles) 10 | cycle_pattern = r'(------------------ (?:Cycle|Skip Cycle) (\d+) ------------------)' 11 | cycles = re.split(cycle_pattern, data)[1:] 12 | 13 | rows = [] 14 | for i in range(0, len(cycles), 3): 15 | cycle_type = cycles[i].strip() # Cycle type (Cycle or Skip Cycle) 16 | cycle_number = cycles[i + 1].strip() # Cycle number 17 | fields = cycles[i + 2].strip().splitlines() # Fields within the cycle 18 | 19 | # Determine if it's a skip cycle 20 | is_skip_cycle = 'Skip' in cycle_type 21 | 22 | # Parse fields and values 23 | field_values = {'Cycle': cycle_number, 'is_skip_cycle': is_skip_cycle} 24 | for field in fields: 25 | key, value = map(str.strip, field.split(':', 1)) 26 | field_values[key] = value 27 | 28 | rows.append(field_values) 29 | 30 | # Get all possible field names 31 | all_fields = set() 32 | for row in rows: 33 | all_fields.update(row.keys()) 34 | 35 | # Write to CSV 36 | with open(output_file, 'w', newline='') as csvfile: 37 | writer = csv.DictWriter(csvfile, fieldnames=sorted(all_fields)) 38 | writer.writeheader() 39 | for row in rows: 40 | writer.writerow(row) 41 | 42 | def main(): 43 | # Initialize the argument parser 44 | parser = argparse.ArgumentParser(description='Parse cycle logs and convert them to CSV.') 45 | parser.add_argument('input_file', type=str, help='Path to the input log file') 46 | parser.add_argument('output_file', type=str, help='Path to the output CSV file') 47 | 48 | # Parse the arguments 49 | args = parser.parse_args() 50 | 51 | # Call the parser function with input and output paths 52 | list_to_csv(args.input_file, args.output_file) 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /pipeline/utils/plotSchedule.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import argparse 4 | 5 | def plotSchedule(schedule_csv): 6 | 7 | df = pd.read_csv(schedule_csv) 8 | # schedule_csv = "_"+schedule_csv 9 | plt.plot(df['Cycle'], df['memory usage %'], marker='o', markersize=1, linestyle='-', color='b') 10 | plt.title('Memory Usage % Across Cycles') 11 | plt.xlabel('Cycle') 12 | plt.ylabel('Memory Usage %') 13 | plt.grid(True) 14 | plt.savefig(f'{schedule_csv}.memory_usage.png') 15 | 16 | # plot decode effective bsz 17 | plt.figure() 18 | plt.plot(df['Cycle'], df['decode effective bsz'], marker='o', linestyle='-', color='g') 19 | plt.title('Decode Effective Batch Size Across Cycles') 20 | plt.xlabel('Cycle') 21 | plt.ylabel('Decode Effective Batch Size') 22 | plt.grid(True) 23 | plt.savefig(f'{schedule_csv}.decode_effective_bsz.png') 24 | 25 | # plot prefill effective bsz 26 | plt.figure() 27 | plt.plot(df['Cycle'], df['prefill effective bsz'], marker='o', linestyle='-', color='r') 28 | plt.title('Prefill Effective Batch Size Across Cycles') 29 | plt.xlabel('Cycle') 30 | plt.ylabel('Prefill Effective Batch Size') 31 | plt.grid(True) 32 | plt.savefig(f'{schedule_csv}.prefill_effective_bsz.png') 33 | 34 | # plot both decode and prefill effective bsz in one plot 35 | plt.figure() 36 | plt.plot(df['Cycle'], df['decode effective bsz'], marker='o', linestyle='-', color='g', label='Decode Effective Batch Size') 37 | plt.plot(df['Cycle'], df['prefill effective bsz'], marker='o', linestyle='-', color='r', label='Prefill Effective Batch Size') 38 | plt.title('Decode and Prefill Effective Batch Size Across Cycles') 39 | plt.xlabel('Cycle') 40 | plt.ylabel('Effective Batch Size') 41 | plt.legend() 42 | plt.grid(True) 43 | plt.savefig(f'{schedule_csv}.effective_bsz.png') 44 | 45 | 46 | if __name__ == '__main__': 47 | # plotSchedule('512_large_kv_cache.schedule.csv') 48 | # plotSchedule('512.schedule.csv') 49 | # plotSchedule('740.schedule.csv') 50 | arg_parser = argparse.ArgumentParser() 51 | arg_parser.add_argument("--schedule_csv", type=str, help="path to schedule csv file") 52 | args = arg_parser.parse_args() 53 | plotSchedule(args.schedule_csv) -------------------------------------------------------------------------------- /pipeline/utils/plot_trend.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | filename = sys.argv[1] 4 | 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | 8 | df = pd.read_csv(filename, header=None, names=['prefill', 'decode']) 9 | 10 | df["sum"] = df["prefill"] + df["decode"] 11 | 12 | df = df.drop(columns=["decode"]) 13 | 14 | df.plot() 15 | 16 | # set y limit to 0 to 2100 17 | plt.ylim(0, 2100) 18 | 19 | plt.savefig(f"{filename}.png") -------------------------------------------------------------------------------- /pipeline/utils/request_info.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | class NewRequestInfo: 3 | """ 4 | Request info for incoming request 5 | NOTE (Yilong): add support for offloading / onloading KV-Cache 6 | """ 7 | req_idx: int 8 | prompt: list[int] 9 | output_len : int 10 | start_time: float 11 | 12 | class NewRequestQueue: 13 | """ 14 | Thread-safe request deque as request buffer. 15 | """ 16 | def __init__(self) -> None: 17 | self._queue = deque() 18 | 19 | @property 20 | def size(self) -> int: 21 | return len(self._queue) 22 | 23 | def put(self, req: NewRequestInfo): 24 | self._queue.append(req) 25 | 26 | def get(self) -> NewRequestInfo: 27 | assert len(self._queue) > 0, "Queue is empty" 28 | return self._queue.popleft() 29 | 30 | def clear(self) -> None: 31 | self._queue.clear() 32 | 33 | class FlyRequestInfo: 34 | """ 35 | Request info for on-the-fly request 36 | NOTE (Yilong): add support for offloading / onloading KV-Cache 37 | """ 38 | 39 | def __init__(self, req_idx: int, input: list[int], output: list[int], prompt: list[int], request_comein_time: float, 40 | chunked_prefill: bool, kv_cache, encode_latency: float, 41 | decode_start_at: float, decode_latency: float, output_len: int, input_len: int): 42 | self.req_idx = req_idx 43 | self.input = input 44 | self.output = output 45 | self.prompt = prompt 46 | self.chunked_prefill = chunked_prefill 47 | self.kv_cache = kv_cache 48 | self.encode_latency = encode_latency 49 | self.decode_start_at = decode_start_at 50 | self.decode_latency = decode_latency 51 | self.output_len = output_len 52 | self.input_len = input_len 53 | self.request_comein_time = request_comein_time 54 | 55 | def finish(self) -> None: 56 | self.kv_cache.release() 57 | 58 | -------------------------------------------------------------------------------- /serve.sh: -------------------------------------------------------------------------------- 1 | current_dir=$(pwd) 2 | parentdir="$(dirname "$current_dir")" 3 | mkdir -p $parentdir/hf 4 | 5 | export HF_HOME=$parentdir/hf 6 | HF_HOME=$parentdir/hf 7 | #check if token is cached 8 | if [ ! -f $HF_HOME/token ]; then 9 | echo "Please login to Hugging Face to cache your token." 10 | huggingface-cli login 11 | fi 12 | 13 | 14 | cd pipeline/utils 15 | read -e -p "Prompt [default: The University of Washington is located]: " -i "The University of Washington is located" prompt 16 | read -e -p "Decode length [default: 100]: " -i "100" decode_length 17 | read -e -p "Output file [default: trace.csv]: " -i "trace.csv" output_file 18 | 19 | # Prompt for model selection and map the selection to a specific model path 20 | echo "Select model:" 21 | echo "1) llama2-70B" 22 | echo "2) llama3-70B" 23 | echo "3) llama3.1-70B" 24 | echo "4) llama3-8B" 25 | echo "5) llama3.1-8B" 26 | echo "6) Qwen2-72B" 27 | 28 | read -p "Enter the number corresponding to your model choice: " model_choice 29 | 30 | case $model_choice in 31 | 1) 32 | config_path="../config_all/llama2-70B/2048.json" 33 | ;; 34 | 2) 35 | config_path="../config_all/llama3-70B/2048.json" 36 | ;; 37 | 3) 38 | config_path="../config_all/llama3.1-70B/2048.json" 39 | ;; 40 | 4) 41 | config_path="../config_all/llama3-8B/correct_40G/1024.json" 42 | ;; 43 | 5) 44 | config_path="../config_all/llama3.1-8B/1024.json" 45 | ;; 46 | 6) 47 | config_path="../config_all/qwen2-72B/2048.json" 48 | ;; 49 | *) 50 | echo "Invalid choice. Defaulting to llama3-8B." 51 | config_path="../config_all/llama3-8B/1024.json" 52 | ;; 53 | esac 54 | 55 | 56 | python gen_req.py "${prompt}" ${decode_length} 0 ${output_file} 57 | 58 | python serve_8B.py -t ${output_file} -c ${config_path} -r 200 59 | output_file_base="${output_file%.csv}" 60 | cat ${output_file_base}.req_words -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | # init submodule dependencies 2 | git submodule init 3 | git submodule update 4 | 5 | # install dependencies 6 | apt update 7 | apt install python3 8 | pip3 install cmake 9 | apt install libopenmpi-dev 10 | apt install wget 11 | pip install torch 12 | apt install libspdlog-dev 13 | apt-get install libglib2.0-0 14 | apt install pigz 15 | pip install wget 16 | pip install pandas 17 | pip install seaborn 18 | pip install mypy 19 | pip install transformers 20 | pip install --upgrade pydantic 21 | pip install sentencepiece 22 | apt-get install git-lfs 23 | apt-get install python3-pybind11 24 | apt-get install nlohmann-json3-dev 25 | 26 | # fix pybind header compile error 27 | sed -i '446,486s/^/\/\//' /usr/include/pybind11/detail/type_caster_base.h 28 | 29 | # install cmake 3.29.0 30 | cd .. 31 | wget https://github.com/Kitware/CMake/releases/download/v3.29.0-rc2/cmake-3.29.0-rc2-linux-x86_64.sh 32 | chmod +x cmake-3.29.0-rc2-linux-x86_64.sh 33 | ./cmake-3.29.0-rc2-linux-x86_64.sh --prefix=/usr/local --exclude-subdir 34 | 35 | 36 | # install nsight 37 | NSIGHT="NsightSystems-linux-cli-public-2023.4.1.97-3355750.deb" 38 | if [[ ! -f "$NSIGHT" ]]; then 39 | wget https://developer.download.nvidia.com/devtools/nsight-systems/$NSIGHT 40 | dpkg -i ./$NSIGHT 41 | fi 42 | 43 | cd Nanoflow 44 | 45 | 46 | 47 | 48 | # build mscclpp 49 | cd 3rdparty/mscclpp 50 | git reset --hard cdaf3aea3d767ba65dd3b08984d76bd50615f92e 51 | 52 | cd ../.. 53 | for repo in mscclpp; do 54 | cat 3rdparty/patches/${repo}/*.patch | patch -p1 -d 3rdparty/${repo} 55 | done 56 | 57 | cd 3rdparty/mscclpp 58 | 59 | mkdir -p build 60 | cd build 61 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/mscclpp -DBUILD_PYTHON_BINDINGS=OFF .. 62 | make -j mscclpp mscclpp_static 63 | make install/fast 64 | cd ../../../ 65 | 66 | # fix spdlog v1.14.0 + cuda 12.1 compatibility bug 67 | for repo in spdlog; do 68 | cat 3rdparty/patches/${repo}/*.patch | patch -p1 -d 3rdparty/${repo} 69 | done 70 | 71 | 72 | cd pipeline 73 | 74 | # download and trace visualizer 75 | cd utils 76 | curl -LO https://get.perfetto.dev/trace_processor 77 | chmod +x ./trace_processor 78 | cd .. 79 | 80 | # generate gemm lib 81 | cd src/generate-gemm 82 | python3 genGEMM.py 83 | cd ../../ 84 | 85 | # build pllm 86 | mkdir -p build 87 | cd build 88 | cmake .. 89 | make -j 256 90 | 91 | # set up libstdc++.so.6 directory 92 | 93 | export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH 94 | 95 | ./test_compute ../config_all/llama3-8B/1024.json 96 | 97 | --------------------------------------------------------------------------------