├── .gitignore ├── CMakeLists.txt ├── README.md ├── docker ├── Dockerfile.llama.cpp ├── Dockerfile.transformers └── requirements.txt ├── eval ├── demo-ds02-batch │ └── transformers-app.py ├── demo-ds03-batch │ └── transformers-app.py ├── demo-nn-lru │ ├── args.py │ ├── plot.ipynb │ ├── runner.py │ └── transformers-deepseek-moe.py └── demo-simulate │ ├── output.csv │ ├── plot.ipynb │ ├── runner.py │ └── transformers-deepseek-moe.py ├── examples ├── full-eval │ ├── plot.ipynb │ ├── runner.py │ └── transformers-app.py ├── profiler │ └── transformers-app.py ├── small-demo-gptq │ ├── transformers-accelerate.py │ ├── transformers-app.py │ └── transformers-um.py ├── small-demo-hqq │ ├── transformers-accelerate.py │ ├── transformers-app.py │ └── transformers-um.py └── small-demo │ ├── Makefile │ ├── transformers-accelerate.py │ ├── transformers-app.py │ └── transformers-um.py ├── install.md ├── pyproject.toml ├── setup.py ├── src ├── cpp_worker │ ├── adapter-llama.cpp │ ├── adapter-llama.hpp │ ├── adapter.cpp │ ├── cache.cpp │ ├── cache.hpp │ ├── cuda_helper_func.cu │ ├── logging.cc │ ├── logging.hpp │ ├── model_loader.cpp │ ├── model_loader.hpp │ ├── predictor.cpp │ ├── predictor.hpp │ ├── prefetcher.cpp │ ├── prefetcher.hpp │ ├── profiler.cpp │ ├── profiler.hpp │ ├── utils.cpp │ ├── utils.hpp │ ├── worker.cpp │ └── worker.hpp └── sparse_llm_cache │ ├── __init__.py │ ├── expert_cache_inject_accelerate.py │ ├── oracle_cache_policy.py │ ├── prefetch │ ├── expert_prefetch.py │ └── oracle_prefetch_policy.py │ ├── profile │ └── profiler.py │ └── utils │ ├── __init__.py │ ├── common_metas.py │ ├── filter.py │ ├── hooks.py │ └── runner_util.py └── tests ├── batch-generate-small-prompt ├── runner.py └── transformers-app.py ├── batch-generate ├── runner.py └── transformers-app.py └── min-cache ├── runner.py └── transformers-app.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | **/*.egg-info 3 | **/*.so 4 | **/*.clangd 5 | 6 | run-logs* 7 | .cache/clangd -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | SITE_NAME(HOSTNAME) 3 | 4 | project(sparce_llm_cache_library CXX CUDA) 5 | 6 | set(CMAKE_BUILD_TYPE "Release") 7 | # set(CMAKE_BUILD_TYPE "Debug") 8 | 9 | if (CMAKE_BUILD_TYPE STREQUAL "Debug") 10 | set(COMMON_FLAGS " -O0 -g ") 11 | endif() 12 | 13 | if (CMAKE_BUILD_TYPE STREQUAL "Release") 14 | set(COMMON_FLAGS " -O3 -g ") 15 | endif() 16 | 17 | set(COMMON_FLAGS " ${COMMON_FLAGS} -Wno-sign-compare -Wno-attributes ") 18 | 19 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 20 | 21 | set(CMAKE_CXX_STANDARD 17) 22 | # no-as-needed is necessary because gurobi95 is not directly used by moe_cache 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS} -fPIC -Wall -fopenmp -march=native -D_GLIBCXX_USE_CXX11_ABI=0 -DUNIT_TEST -DPSTL_USE_PARALLEL_POLICIES=0 -Wl,--no-as-needed") 24 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${COMMON_FLAGS} --compiler-options '-fopenmp -fPIC ' -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80") 25 | message(DEBUG "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") 26 | message(DEBUG "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") 27 | include_directories( 28 | ${PROJECT_SOURCE_DIR}/src/cpp_worker 29 | /usr/include/python3.10 30 | /usr/local/lib/python3.10/dist-packages/torch/include 31 | 
/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include 32 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 33 | ${PROJECT_SOURCE_DIR}/3rdparty/json/single_include 34 | ) 35 | 36 | # source scripts already add the necessary paths to the LD path 37 | link_directories( 38 | # $ENV{GUROBI_HOME}/lib 39 | # $ENV{CONDA_PREFIX}/lib 40 | # /opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8 41 | /usr/local/cuda/lib64 42 | ) 43 | 44 | 45 | file(GLOB_RECURSE CACHE_SRC 46 | src/cpp_worker/*.cc 47 | src/cpp_worker/*.cpp 48 | src/cpp_worker/*.cu) 49 | 50 | # build the moe_cache shared library 51 | add_library(moe_cache 52 | SHARED 53 | ${CACHE_SRC} 54 | ) 55 | target_link_libraries(moe_cache 56 | PUBLIC 57 | cuda 58 | cudart 59 | nvToolsExt 60 | python3.10 61 | ) 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MoE Cache 2 | 3 | ## Install 4 | 5 | Please refer to [./install.md](./install.md) 6 | 7 | ## Prepare the model (transformers) 8 | 9 | ```bash 10 | # Download model from huggingface. It will be stored at ~/.cache/huggingface/hub 11 | huggingface-cli download deepseek-ai/deepseek-moe-16b-chat 12 | huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat 13 | # Patch model implementation to use our cache 14 | cp -r /code/sparse-llm-cache-scripts/huggingface-modules/modules /root/.cache/huggingface/ 15 | # Optionally download tokenizers for the two large models 16 | huggingface-cli download Qwen/Qwen2-57B-A14B-Instruct tokenizer.json tokenizer_config.json vocab.json special_tokens_map.json 17 | huggingface-cli download mistralai/Mixtral-8x7B-Instruct-v0.1 special_tokens_map.json tokenizer.json tokenizer.model tokenizer_config.json 18 | ``` 19 | 20 | ## Prepare the model (llama.cpp) 21 | 22 | ```bash 23 | # convert huggingface safetensors to gguf 24 | python3 /code/llama.cpp/convert_hf_to_gguf.py --outtype f16 --outfile <output-gguf-path> <hf-model-dir> 25 | ##### for example 26 | mkdir -p /code/huggingface-gguf/DeepSeek-V2-Lite-Chat/f16 27 | python3 /code/llama.cpp/convert_hf_to_gguf.py --outtype f16 --outfile /code/huggingface-gguf/DeepSeek-V2-Lite-Chat/f16/main.gguf /code/huggingface/hub/models--deepseek-ai--DeepSeek-V2-Lite-Chat/snapshots/85864749cd611b4353ce1decdb286193298f64c7 28 | 29 | # convert gguf to quantized version 30 | /code/llama.cpp/build/bin/llama-quantize <input-gguf> <output-gguf> <quant-type> 31 | ##### for example 32 | mkdir -p /code/huggingface-gguf/DeepSeek-V2-Lite-Chat/q4_k_m 33 | cd /code/huggingface-gguf/DeepSeek-V2-Lite-Chat 34 | /app/build/bin/llama-quantize ./f16/main.gguf ./q4_k_m/main.gguf q4_k_m 35 | ``` 36 | 37 | ## Prepare dataset 38 | 39 | ```bash 40 | cd /code/sparse-llm-cache-scripts/dataset/chatgpt-prompts-small 41 | bash ./get.sh 42 | python3 to_prompt_list.py 43 | ``` 44 | 45 | ## Run (llama.cpp) 46 | 47 | ```bash 48 | # for example 49 | /code/llama.cpp/build/bin/llama-parallel --model /code/huggingface-gguf/deepseek-moe-16b-chat/q4_k_m/main.gguf \ 50 | --file /code/sparse-llm-cache-scripts/dataset/chatgpt-prompts-small/prompt_list.txt \ 51 | --delay-escape --sequences 3 --ctx-size 512 --gpu-layers 100 --predict 128 --no-cont-batching \ 52 | --moe_cache 1 --num_predict 6 --moe_cache_rate 0.375 --reorder_experts True --early_preempt True \ 53 | --pred_model_path /code/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat/moe-layer-logits 54 | ``` 55 | 56 | ## Run (transformers) 57 | 58 | ### Run small example 59 | 60 | ```bash 61 | cd /code/sparse-llm-cache/examples/small-demo 62 | # run baseline 63 | make 
run-base 64 | # run ours 65 | make run-ours 66 | ``` 67 | 68 | ### Run full eval 69 | 70 | ```bash 71 | cd /code/sparse-llm-cache/examples/full-eval 72 | # run both baseline and ours under various cache rate. This could take hours. 73 | python runner.py run 74 | # parse results to output.csv 75 | python runner.py parse 76 | ``` 77 | 78 | Figures can be plotted using `/code/sparse-llm-cache/examples/full-eval/plot.ipynb` -------------------------------------------------------------------------------- /docker/Dockerfile.llama.cpp: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | # This needs to generally match the container host's environment. 4 | ARG CUDA_VERSION=12.4.1 5 | 6 | # Target the CUDA build image 7 | ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} 8 | 9 | FROM ${BASE_CUDA_DEV_CONTAINER} AS build 10 | 11 | # Unless otherwise specified, we make a fat build. 12 | # ARG CUDA_DOCKER_ARCH=89 13 | ENV LANG=en_US.UTF-8 14 | ENV LC_ALL=C.UTF-8 15 | 16 | RUN apt-get update && \ 17 | apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 zsh curl wget vim ccache 18 | 19 | COPY requirements.txt requirements.txt 20 | 21 | RUN pip install --upgrade pip setuptools wheel pandas \ 22 | && pip install -r requirements.txt \ 23 | && pip install flash_attn optimum ninja 24 | 25 | WORKDIR /code 26 | 27 | # COPY . . 28 | 29 | # Set nvcc architecture 30 | ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} 31 | # Enable CUDA 32 | ENV GGML_CUDA=1 33 | # Enable cURL 34 | ENV LLAMA_CURL=1 35 | 36 | # RUN make -j$(nproc) 37 | 38 | # ENTRYPOINT ["/app/.devops/tools.sh"] 39 | -------------------------------------------------------------------------------- /docker/Dockerfile.transformers: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as cuda-devel-env 2 | FROM ghcr.io/huggingface/text-generation-inference:2.2.0 as base 3 | 4 | ENV LANG=en_US.UTF-8 5 | ENV LC_ALL=C.UTF-8 6 | 7 | RUN chmod 1777 /tmp 8 | 9 | COPY --from=cuda-devel-env /usr/local/cuda-12.1 /usr/local/cuda-12.1 10 | 11 | RUN pip install ninja --no-cache-dir 12 | RUN pip install optimum==1.21.2 gekko==1.2.1 rouge==1.0.1 --no-deps 13 | 14 | RUN apt-get update && apt-get install -y --no-install-recommends \ 15 | vim wget git curl \ 16 | && rm -rf /var/lib/apt/lists/* 17 | 18 | ENV HUGGINGFACE_HUB_CACHE="/root/.cache/huggingface/hub" 19 | 20 | ENTRYPOINT [""] 21 | 22 | ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs 23 | ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64/stubs 24 | 25 | # Final image 26 | FROM base 27 | 28 | RUN mkdir /code 29 | 30 | WORKDIR /code -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | # These requirements include all dependencies for all top-level python scripts 2 | # for llama.cpp. Avoid adding packages here directly. 3 | # 4 | # Package versions must stay compatible across all top-level python scripts. 
5 | # 6 | 7 | numpy~=1.26.4 8 | sentencepiece~=0.2.0 9 | transformers>=4.40.1,<5.0.0 10 | gguf>=0.1.0 11 | protobuf>=4.21.0,<5.0.0 12 | 13 | --extra-index-url https://download.pytorch.org/whl/cu124 14 | torch~=2.4.0 -------------------------------------------------------------------------------- /eval/demo-ds02-batch/transformers-app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # os.environ['CUDA_VISIBLE_DEVICES'] = '1' 4 | # os.environ['CUDA_VISIBLE_DEVICES'] = '3' 5 | # os.environ['SPARSE_CACHE_LOG_LEVEL'] = 'TRACE' 6 | # os.environ['SPARSE_CACHE_ENABLE_TRACE'] = '1' 7 | os.environ['HF_HUB_OFFLINE'] = "1" 8 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 9 | 10 | from transformers.utils import logging 11 | import torch 12 | from transformers import AutoModelForCausalLM, AutoTokenizer 13 | 14 | import sparse_llm_cache 15 | import time 16 | 17 | from sparse_llm_cache.utils.runner_util import parse_args 18 | cache_configs = parse_args() 19 | for k, v in cache_configs.items(): print(k,v) 20 | 21 | sparse_llm_cache.utils.hack_transformers(**cache_configs, pin_memory=True, enable_model_timer=True) 22 | 23 | print("loading model...") 24 | load_time_start = time.time() 25 | logging.disable_progress_bar() 26 | model_id = cache_configs['model_id'] 27 | torch_dtype = 'auto' 28 | if 'Mixtral' in model_id or 'GPTQ' in model_id: 29 | torch_dtype = None 30 | print("dtype is", torch_dtype) 31 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 32 | tokenizer.pad_token = tokenizer.eos_token 33 | model = AutoModelForCausalLM.from_pretrained( 34 | model_id, 35 | # torch_dtype=torch.float16, 36 | torch_dtype=torch_dtype, 37 | # use_flash_attention_2=True, 38 | local_files_only=True, 39 | device_map=0, 40 | trust_remote_code=True, 41 | revision=cache_configs['model_revision'], 42 | ) 43 | if 'Mixtral' in model_id: 44 | import auto_gptq 45 | model = auto_gptq.exllama_set_max_input_length(model, 7200) 46 | print("loading model...done", time.time() - load_time_start) 47 | 48 | def gen_long(text, do_print=False, max_new_tokens=100): 49 | inputs = tokenizer(text, return_tensors="pt").input_ids.to(f"cuda") 50 | outputs = model.generate(inputs, max_new_tokens=max_new_tokens) 51 | output_str = tokenizer.batch_decode(outputs) 52 | if do_print: 53 | print(output_str, flush=True) 54 | return inputs[0].nelement() 55 | 56 | def gen_batch(text_list, do_print=False, max_new_tokens=100): 57 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 58 | input_len = inputs['input_ids'].shape[1] 59 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 60 | output_len = outputs.shape[1] - input_len 61 | outputs = outputs[:, input_len:] 62 | output_len = outputs.shape[1] 63 | output_str = tokenizer.batch_decode(outputs) 64 | if do_print: 65 | print(text_list, output_str, flush=True) 66 | return input_len, output_len 67 | 68 | 69 | # import json 70 | # # dataset_path = "/nvme/sxn/moe/datasets/vllm-benchmark/shareGPT/ShareGPT_V3_unfiltered_cleaned_split.json" 71 | # # dataset_path = "/nvme/sxn/moe/datasets/vllm-benchmark/shareGPT/small-dataset.json" 72 | # dataset_path = "/code/moe/datasets/vllm-benchmark/shareGPT/small-dataset.json" 73 | # with open(dataset_path) as f: 74 | # dataset = json.load(f) 75 | 76 | # # # Filter out the conversations with less than 2 turns. 
77 | # # dataset = [data for data in dataset if len(data["conversations"]) >= 2] 78 | # # # Only keep the first two turns of each conversation. 79 | # # dataset = [(data["conversations"][0]["value"], 80 | # # data["conversations"][1]["value"]) for data in dataset] 81 | 82 | # # Tokenize the prompts and completions. 83 | # prompts = [prompt for prompt, _ in dataset] 84 | # # completions = [completion for _, completion in dataset] 85 | # # completion_token_ids = tokenizer(completions).input_ids 86 | # # tokenized_dataset = [] 87 | # # for i in range(len(dataset)): 88 | # # output_len = len(completion_token_ids[i]) 89 | # # tokenized_dataset.append((prompts[i], output_len)) 90 | 91 | dataset_path = f'/code/moe/datasets/{cache_configs["dataset"]}/prompt_list.pt' 92 | print(dataset_path) 93 | prompts = torch.load(dataset_path) 94 | 95 | from torch.utils.data import Dataset 96 | class StringListDataset(Dataset): 97 | def __init__(self, string_list): 98 | self.string_list = string_list 99 | def __len__(self): 100 | return len(self.string_list) 101 | def __getitem__(self, idx): 102 | return self.string_list[idx] 103 | ds = StringListDataset(prompts) 104 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 105 | 106 | for seq_id,text_list in enumerate(dl): 107 | if seq_id > cache_configs['max_num_batch']: 108 | print("max_num_batch reached") 109 | break 110 | start_time = time.time() 111 | try: 112 | input_len, output_len = gen_batch(text_list, max_new_tokens=128, do_print=True) 113 | except Exception as e: 114 | print(f"error at seq {seq_id}") 115 | print(str(e)) 116 | print(input_len, output_len, time.time() - start_time, flush=True) -------------------------------------------------------------------------------- /eval/demo-ds03-batch/transformers-app.py: -------------------------------------------------------------------------------- 1 | ../demo-ds02-batch/transformers-app.py -------------------------------------------------------------------------------- /eval/demo-nn-lru/args.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import argparse 3 | 4 | class CustomBooleanAction(argparse.Action): 5 | def __init__(self, option_strings, dest, nargs=None, **kwargs): 6 | if nargs is not None: 7 | raise ValueError("nargs not allowed") 8 | super().__init__(option_strings, dest, **kwargs) 9 | def __call__(self, parser, namespace, values, option_string=None): 10 | values = str(values).lower() 11 | if values in ['true', '1', 'on']: 12 | setattr(namespace, self.dest, True) 13 | elif values in ['false', '0', 'off']: 14 | setattr(namespace, self.dest, False) 15 | else: 16 | raise ValueError("invalid boolean value {}".format(self.values)) 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--num_predict_expert_per_layer", type=int) 21 | parser.add_argument("--cache_rate", type=float) 22 | parser.add_argument("--cache_len", type=int, default=None) 23 | parser.add_argument("--max_prefetch_layer_distance", type=int, default=-1) 24 | parser.add_argument( "--per_layer_cache", action=CustomBooleanAction, default=True) 25 | parser.add_argument( "--enable_per_layer_cache", action="store_true", dest="per_layer_cache", default=True) 26 | parser.add_argument("--disable_per_layer_cache", action="store_false", dest="per_layer_cache", default=True) 27 | parser.add_argument("--cache_policy", type=str, choices=["lru", "fifo", "nn"], default="lru") 28 | parser.add_argument( "--reorder_experts", 
action=CustomBooleanAction, default=True) 29 | parser.add_argument( "--enable_reorder_experts", action="store_true", dest="reorder_experts", default=True) 30 | parser.add_argument("--disable_reorder_experts", action="store_false", dest="reorder_experts", default=True) 31 | parser.add_argument("--promote_hit_in_prefetch", action=CustomBooleanAction, default=True) 32 | parser.add_argument("--early_preempt", action=CustomBooleanAction, default=True) 33 | parser.add_argument( "--trace_event", action=CustomBooleanAction, default=False) 34 | parser.add_argument( "--enable_trace_event", action="store_true", dest="trace_event", default=False) 35 | parser.add_argument("--disable_trace_event", action="store_false", dest="trace_event", default=False) 36 | parser.add_argument( "--module_trace_event", action=CustomBooleanAction, default=False) 37 | parser.add_argument( "--enable_module_trace_event", action="store_true", dest="module_trace_event", default=False) 38 | parser.add_argument("--disable_module_trace_event", action="store_false", dest="module_trace_event", default=False) 39 | args = parser.parse_args() 40 | return vars(args) 41 | -------------------------------------------------------------------------------- /eval/demo-nn-lru/runner.py: -------------------------------------------------------------------------------- 1 | from eval_helper.config import RunConfigBase, OptionCmdLine, OptionEnv, OptionApp, ConfigList, ResultFloat 2 | 3 | my_app = RunConfigBase() 4 | my_app.app = OptionApp('python transformers-deepseek-moe.py', 'deepseek-moe', 'deepseek-moe') 5 | my_app.logdir = 'run-logs' 6 | my_app.config_dict = { 7 | 'num_predict_expert_per_layer' : OptionCmdLine('num_predict_expert_per_layer', readable_name='predict', logname='predict'), 8 | 'cache_rate' : OptionCmdLine('cache_rate'), 9 | 'cache_policy' : OptionCmdLine('cache_policy', readable_name='policy', logname='policy'), 10 | 'per_layer_cache' : OptionCmdLine('per_layer_cache'), 11 | 'reorder_experts' : OptionCmdLine('reorder_experts'), 12 | 'early_preempt' : OptionCmdLine('early_preempt'), 13 | 'max_prefetch_layer_distance' : OptionCmdLine('max_prefetch_layer_distance'), 14 | 'log_level' : OptionEnv('SPARSE_CACHE_LOG_LEVEL', readable_name=False, logname=False), 15 | } 16 | my_app.result_dict = { 17 | 'decode_stage_forward_time' : ResultFloat('decode_stage_forward_time'), 18 | 'prefill_stage_forward_time' : ResultFloat('prefill_stage_forward_time'), 19 | 'decode_stage_hit_rate' : ResultFloat('decode_stage_hit_rate'), 20 | 'prefill_stage_hit_rate' : ResultFloat('prefill_stage_hit_rate'), 21 | } 22 | my_app['per_layer_cache'] = True 23 | base_cfg_list = ConfigList.MakeList(my_app) 24 | 25 | full_list = ConfigList.Empty() 26 | 27 | ### options to control: prefetch, reorder, early_preempt 28 | 29 | full_list.concat(base_cfg_list.copy() 30 | .override('cache_policy', ['lru', 'nn']) 31 | .override('per_layer_cache', [True]) 32 | .override('reorder_experts', [False]) 33 | .override('num_predict_expert_per_layer', [0]) 34 | .override('early_preempt', [False]) 35 | .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32, 48, 64]]) 36 | ) 37 | full_list.concat(base_cfg_list.copy() 38 | .override('cache_policy', ['lru', 'nn']) 39 | .override('per_layer_cache', [True]) 40 | .override('reorder_experts', [True]) 41 | .override('num_predict_expert_per_layer', [prefetch_len for prefetch_len in [6]]) 42 | .override('early_preempt', [True]) 43 | .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 
32, 48, 64]]) 44 | ) 45 | 46 | if __name__ == '__main__': 47 | from eval_helper.runner_args import parse_args 48 | args = parse_args() 49 | if 'run' in args.commands: 50 | full_list.run(mock=args.mock, durable_log=args.durable_log, fail_only=args.fail_only) 51 | if 'parse' in args.commands: 52 | full_list.override('logdir', [args.logdir]) 53 | full_list.parse() 54 | full_list.to_pdframe([ 55 | 'cache_policy', 56 | 'per_layer_cache', 57 | 'reorder_experts', 58 | 'early_preempt', 59 | 'num_predict_expert_per_layer', 60 | 'cache_rate', 61 | 'decode_stage_hit_rate', 62 | 'prefill_stage_hit_rate', 63 | 'decode_stage_forward_time', 64 | 'prefill_stage_forward_time', 65 | ]).to_csv(args.parse_output, index=False) 66 | -------------------------------------------------------------------------------- /eval/demo-nn-lru/transformers-deepseek-moe.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # os.environ['CUDA_VISIBLE_DEVICES'] = '1' 4 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 5 | # os.environ['SPARSE_CACHE_LOG_LEVEL'] = 'TRACE' 6 | # os.environ['SPARSE_CACHE_ENABLE_TRACE'] = '1' 7 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 8 | 9 | from transformers.utils import logging 10 | import torch 11 | from transformers import AutoModelForCausalLM, AutoTokenizer 12 | 13 | import sparse_llm_cache 14 | import time 15 | 16 | from args import parse_args 17 | cache_configs = parse_args() 18 | for k, v in cache_configs.items(): print(k,v) 19 | 20 | print("loading model...") 21 | logging.disable_progress_bar() 22 | model_id = "deepseek-ai/deepseek-moe-16b-chat" 23 | tokenizer = AutoTokenizer.from_pretrained(model_id) 24 | model = AutoModelForCausalLM.from_pretrained( 25 | model_id, 26 | torch_dtype=torch.float16, 27 | # use_flash_attention_2=True, 28 | local_files_only=True, 29 | device_map='cpu', 30 | trust_remote_code=True 31 | ) 32 | print("loading model...done") 33 | 34 | 35 | sparse_llm_cache.utils.inject_model( 36 | model, 37 | **cache_configs, 38 | pin_memory = True, 39 | enable_model_timer=True, 40 | ) 41 | 42 | model.to('cuda') 43 | 44 | def gen_long(text, do_print=False, max_new_tokens=1000): 45 | inputs = tokenizer(text, return_tensors="pt").input_ids.to(f"cuda") 46 | outputs = model.generate(inputs, max_new_tokens=max_new_tokens) 47 | output_str = tokenizer.batch_decode(outputs) 48 | if do_print: 49 | print(output_str) 50 | return inputs[0].nelement() 51 | 52 | 53 | # %% 54 | import json 55 | # dataset_path = "/nvme/songxiaoniu/moe/datasets/vllm-benchmark/shareGPT/ShareGPT_V3_unfiltered_cleaned_split.json" 56 | dataset_path = "/nvme/songxiaoniu/moe/datasets/vllm-benchmark/shareGPT/small-dataset.json" 57 | with open(dataset_path) as f: 58 | dataset = json.load(f) 59 | 60 | # # Filter out the conversations with less than 2 turns. 61 | # dataset = [data for data in dataset if len(data["conversations"]) >= 2] 62 | # # Only keep the first two turns of each conversation. 63 | # dataset = [(data["conversations"][0]["value"], 64 | # data["conversations"][1]["value"]) for data in dataset] 65 | 66 | # Tokenize the prompts and completions. 
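# The list comprehensions below assume each entry of small-dataset.json already
# unpacks into a (prompt, completion) pair; the commented-out filtering above
# shows how such pairs are derived from the raw ShareGPT "conversations" field.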
67 | prompts = [prompt for prompt, _ in dataset] 68 | completions = [completion for _, completion in dataset] 69 | completion_token_ids = tokenizer(completions).input_ids 70 | tokenized_dataset = [] 71 | for i in range(len(dataset)): 72 | output_len = len(completion_token_ids[i]) 73 | tokenized_dataset.append((prompts[i], output_len)) 74 | 75 | 76 | for seq_id in range(len(tokenized_dataset)): 77 | start_time = time.time() 78 | try: 79 | text, output_len = tokenized_dataset[seq_id] 80 | prompt_len = gen_long(text, max_new_tokens=min(output_len, 500), do_print=False) 81 | # prompt_len = gen_long(text, max_new_tokens=output_len, do_print=False) 82 | except Exception as e: 83 | print(f"error at seq {seq_id}") 84 | print(str(e)) 85 | print(time.time() - start_time) 86 | -------------------------------------------------------------------------------- /eval/demo-simulate/runner.py: -------------------------------------------------------------------------------- 1 | from eval_helper.config import RunConfigBase, OptionCmdLine, OptionEnv, OptionApp, ConfigList, ResultFloat 2 | 3 | my_app = RunConfigBase() 4 | my_app.app = OptionApp('python transformers-deepseek-moe.py', 'deepseek-moe', 'deepseek-moe') 5 | my_app.logdir = 'run-logs' 6 | my_app.config_dict = { 7 | 'num_predict_expert_per_layer' : OptionCmdLine('num_predict_expert_per_layer', readable_name='predict', logname='predict'), 8 | 'cache_rate' : OptionCmdLine('cache_rate'), 9 | 'cache_policy' : OptionCmdLine('cache_policy', readable_name='policy', logname='policy'), 10 | 'per_layer_cache' : OptionCmdLine('per_layer_cache'), 11 | 'reorder_experts' : OptionCmdLine('reorder_experts'), 12 | 'early_preempt' : OptionCmdLine('early_preempt'), 13 | 'max_prefetch_layer_distance' : OptionCmdLine('max_prefetch_layer_distance'), 14 | 'predict_input_mode' : OptionCmdLine('predict_input_mode'), 15 | 'cache_trace_path' : OptionCmdLine('cache_trace_path', readable_name=False, logname=False), 16 | 'predictor_model_path' : OptionCmdLine('predictor_model_path', readable_name=False, logname=False), 17 | 'log_level' : OptionEnv('SPARSE_CACHE_LOG_LEVEL', readable_name=False, logname=False), 18 | } 19 | my_app.result_dict = { 20 | 'decode_stage_forward_time' : ResultFloat('decode_stage_forward_time'), 21 | 'prefill_stage_forward_time' : ResultFloat('prefill_stage_forward_time'), 22 | 'decode_stage_hit_rate' : ResultFloat('decode_stage_hit_rate'), 23 | 'prefill_stage_hit_rate' : ResultFloat('prefill_stage_hit_rate'), 24 | 'decode_stage_prefetch_hit_rate' : ResultFloat('decode_stage_prefetch_hit_rate'), 25 | 'prefill_stage_prefetch_hit_rate' : ResultFloat('prefill_stage_prefetch_hit_rate'), 26 | 'decode_stage_ready_rate' : ResultFloat('decode_stage_ready_rate'), 27 | 'prefill_stage_ready_rate' : ResultFloat('prefill_stage_ready_rate'), 28 | 'legacy_decode_stage_hit_rate' : ResultFloat('legacy_decode_stage_hit_rate'), 29 | 'legacy_prefill_stage_hit_rate' : ResultFloat('legacy_prefill_stage_hit_rate'), 30 | } 31 | my_app['per_layer_cache'] = True 32 | base_cfg_list = ConfigList.MakeList(my_app) 33 | 34 | full_list = ConfigList.Empty() 35 | 36 | ### options to control: prefetch, reorder, early_preempt 37 | 38 | full_list.concat(base_cfg_list.copy() 39 | .override('cache_policy', ['min', 'lru']) 40 | .override('per_layer_cache', [True, False]) 41 | .override('cache_trace_path', ['/nvme/songxiaoniu/moe/moe-traces/deepseek-moe-sharegpt-0412.json']) 42 | .override('reorder_experts', [False, True]) 43 | .override('num_predict_expert_per_layer', [0]) 44 | 
.override('early_preempt', [False]) 45 | .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32, 48, 64]]) 46 | ) 47 | 48 | # full_list.concat(base_cfg_list.copy() 49 | # .override('cache_policy', ['nn']) 50 | # .override('per_layer_cache', [True, False]) 51 | # .override('cache_trace_path', ['/nvme/songxiaoniu/moe/moe-traces/deepseek-moe-sharegpt-0412.json']) 52 | # .override('reorder_experts', [False, True]) 53 | # .override('num_predict_expert_per_layer', [0]) 54 | # .override('early_preempt', [False]) 55 | # .override('predict_input_mode', ['one_token']) 56 | # .override('predictor_model_path', ['/nvme/songxiaoniu/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat.pt']) 57 | # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32, 48, 64]]) 58 | # ) 59 | # full_list.concat(base_cfg_list.copy() 60 | # .override('cache_policy', ['nn']) 61 | # .override('per_layer_cache', [True, False]) 62 | # .override('cache_trace_path', ['/nvme/songxiaoniu/moe/moe-traces/deepseek-moe-sharegpt-0412.json']) 63 | # .override('reorder_experts', [False, True]) 64 | # .override('num_predict_expert_per_layer', [0]) 65 | # .override('early_preempt', [False]) 66 | # .override('predict_input_mode', ['decode_cumsum']) 67 | # .override('predictor_model_path', ['/nvme/songxiaoniu/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat-next-reuse.pt']) 68 | # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32, 48, 64]]) 69 | # ) 70 | full_list.concat(base_cfg_list.copy() 71 | .override('cache_policy', ['nn']) 72 | .override('per_layer_cache', [True, False]) 73 | .override('cache_trace_path', ['/nvme/songxiaoniu/moe/moe-traces/deepseek-moe-sharegpt-0412.json']) 74 | .override('reorder_experts', [False, True]) 75 | .override('num_predict_expert_per_layer', [0]) 76 | .override('early_preempt', [False]) 77 | .override('predict_input_mode', ['weighted_decode_cumsum']) 78 | .override('predictor_model_path', ['/nvme/songxiaoniu/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat-next-10-freq-weighted-sum.pt']) 79 | .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32, 48, 64]]) 80 | ) 81 | # full_list.concat(base_cfg_list.copy() 82 | # .override('cache_policy', ['lru', 'nn']) 83 | # .override('per_layer_cache', [True]) 84 | # .override('reorder_experts', [True]) 85 | # .override('num_predict_expert_per_layer', [prefetch_len for prefetch_len in [6]]) 86 | # .override('early_preempt', [True]) 87 | # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32, 48, 64]]) 88 | # ) 89 | 90 | if __name__ == '__main__': 91 | from eval_helper.runner_args import parse_args 92 | args = parse_args() 93 | if 'run' in args.commands: 94 | full_list.run(mock=args.mock, durable_log=args.durable_log, fail_only=args.fail_only, parallel_workers=args.parallel_workers) 95 | if 'parse' in args.commands: 96 | full_list.override('logdir', [args.logdir]) 97 | full_list.parse() 98 | full_list.to_pdframe([ 99 | 'cache_policy', 100 | 'per_layer_cache', 101 | 'reorder_experts', 102 | 'predict_input_mode', 103 | # 'early_preempt', 104 | # 'num_predict_expert_per_layer', 105 | 'cache_rate', 106 | 'decode_stage_hit_rate', 107 | 'prefill_stage_hit_rate', 108 | 'decode_stage_ready_rate', 109 | 'prefill_stage_ready_rate', 110 | 'legacy_decode_stage_hit_rate', 111 | 'legacy_prefill_stage_hit_rate', 112 | ]).to_csv(args.parse_output, index=False) 113 | 
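# Usage sketch (the exact CLI flags come from eval_helper.runner_args, which is
# not shown in this dump; the commands mirror the full-eval README):
#   python runner.py run     # execute every configuration in full_list
#   python runner.py parse   # collect run logs into a CSV at args.parse_output
# Each chained .override(key, [v1, v2, ...]) expands the config list with the
# given values, so the blocks above sweep cache policy, per-layer caching,
# expert reordering, predictor input mode, and cache rate in one pass.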
-------------------------------------------------------------------------------- /eval/demo-simulate/transformers-deepseek-moe.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import os 3 | 4 | # os.environ['CUDA_VISIBLE_DEVICES'] = '1' 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 6 | # os.environ['SPARSE_CACHE_LOG_LEVEL'] = 'TRACE' 7 | # os.environ['SPARSE_CACHE_LOG_LEVEL'] = 'DEBUG' 8 | # os.environ['SPARSE_CACHE_ENABLE_TRACE'] = '1' 9 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 10 | 11 | import torch 12 | 13 | import sparse_llm_cache 14 | import time 15 | 16 | from sparse_llm_cache.utils.runner_util import parse_args 17 | cache_configs = parse_args() 18 | for k, v in cache_configs.items(): print(k,v) 19 | 20 | print("loading model...") 21 | 22 | @dataclass 23 | class SimulateConfig: 24 | _name_or_path : str = None 25 | 26 | 27 | class SimulateExpert(torch.nn.Module): 28 | def __init__(self): 29 | super().__init__() 30 | # self.fc = torch.nn.Linear(1, 1) 31 | self.fc = torch.nn.Linear(0, 0, bias=False) 32 | 33 | def forward(self, inputs): 34 | return inputs 35 | 36 | class SimulateMoeLayer(torch.nn.Module): 37 | def __init__(self, num_experts): 38 | super().__init__() 39 | self.num_experts = num_experts 40 | self.experts = torch.nn.ModuleList([SimulateExpert() for _ in range(num_experts)]) 41 | def report_experts(self, experts): 42 | return experts 43 | def forward(self, inputs): 44 | expert_ids = inputs 45 | expert_ids = self.report_experts(expert_ids) 46 | for eid in expert_ids: 47 | expert = self.experts[eid] 48 | expert(eid) 49 | return inputs 50 | 51 | class SimulateModel(torch.nn.Module): 52 | def __init__(self, num_layers, per_layer_experts): 53 | super().__init__() 54 | self.config = SimulateConfig(_name_or_path = "deepseek-ai/deepseek-moe-16b-chat-simulate") 55 | self.num_layers = num_layers 56 | self.per_layer_experts = per_layer_experts 57 | self.layers = torch.nn.ModuleList([SimulateMoeLayer(per_layer_experts) for _ in range(num_layers)]) 58 | self.trace = None 59 | 60 | def forward(self, inputs): 61 | experts_per_layer = inputs 62 | for i, layer in enumerate(self.layers): 63 | layer(torch.asarray(experts_per_layer[i], dtype=torch.int64)) 64 | return inputs 65 | 66 | def generate(self, seq_id, max_new_tokens = None): 67 | prompt_len = self.trace[str(seq_id)]['prompt_len'] 68 | rply_len = len(self.trace[str(seq_id)]['0']) - prompt_len 69 | if max_new_tokens: 70 | rply_len = min(rply_len, max_new_tokens) 71 | 72 | experts_per_layer = [] 73 | 74 | for decoder_moe_layer_id in range(self.num_layers): 75 | dedup_expert_in_cur_layer = [] 76 | for prompt_token_idx in range(prompt_len): 77 | dedup_expert_in_cur_layer += self.trace[str(seq_id)][str(decoder_moe_layer_id)][str(prompt_token_idx)] 78 | dedup_expert_in_cur_layer = list(set(dedup_expert_in_cur_layer)) 79 | dedup_expert_in_cur_layer.sort() 80 | experts_per_layer.append(dedup_expert_in_cur_layer) 81 | 82 | self.forward(experts_per_layer) 83 | 84 | for rply_token_idx in range(prompt_len, prompt_len + rply_len): 85 | experts_per_layer = [] 86 | for decoder_moe_layer_id in range(self.num_layers): 87 | experts_per_layer.append(self.trace[str(seq_id)][str(decoder_moe_layer_id)][str(rply_token_idx)]) 88 | self.forward(experts_per_layer) 89 | 90 | model = SimulateModel(27, 64) 91 | 92 | fname = "/nvme/songxiaoniu/moe/moe-traces/deepseek-moe-sharegpt-0412.pickle" 93 | import pickle 94 | with open(fname, "rb") as f: 95 | trace = pickle.load(f) 96 | trace : dict 
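# Trace layout, inferred from how SimulateModel.generate indexes it above:
#   trace[str(seq_id)]['prompt_len']                  -> prompt length in tokens
#   trace[str(seq_id)][str(layer_id)][str(token_idx)] -> list of activated expert ids
#   len(trace[str(seq_id)]['0'])                      -> prompt_len + reply_len
# (layer 0's per-token dict doubles as the total token count per sequence)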
97 | model.trace = trace 98 | 99 | print("loading model...done") 100 | 101 | prefetch_mngr = sparse_llm_cache.utils.inject_model( 102 | model, 103 | **cache_configs, 104 | pin_memory = True, 105 | enable_model_timer=True, 106 | ) 107 | 108 | model.to('cuda') 109 | 110 | def gen_long(seq_id, max_new_tokens=None): 111 | prefetch_mngr.cache.set_cur_seq(seq_id) 112 | outputs = model.generate(seq_id, max_new_tokens=max_new_tokens) 113 | 114 | for seq_id in range(20): 115 | start_time = time.time() 116 | try: 117 | gen_long(seq_id, max_new_tokens=None) 118 | except Exception as e: 119 | print(f"error at seq {seq_id}") 120 | print(str(e)) 121 | print(e.with_traceback()) 122 | print(time.time() - start_time) 123 | -------------------------------------------------------------------------------- /examples/full-eval/plot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "model_id = 'deepseek-ai/deepseek-moe-16b-chat'\n", 14 | "df = pd.read_csv('output.csv')\n", 15 | "df = df.query(f'model_id == \"{model_id}\"')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# decode time\n", 25 | "x = df.query('num_predict_expert_per_layer == 0').query('reorder_experts == False').query('early_preempt == False')['cache_rate']\n", 26 | "plt.plot(x, df.query('num_predict_expert_per_layer == 0').query('reorder_experts == False').query('early_preempt == False')['decode_stage_forward_time']/1000, color=(1,0,0,1), label='base')\n", 27 | "plt.plot(x, df.query('num_predict_expert_per_layer != 0').query('reorder_experts == True').query('early_preempt == True')['decode_stage_forward_time']/1000, color=(0,0,1,1), label='ours')\n", 28 | "\n", 29 | "plt.title(model_id)\n", 30 | "plt.xlabel('cache rate')\n", 31 | "plt.ylabel('decode stage time per token (ms)')\n", 32 | "plt.ylim(bottom=0)\n", 33 | "plt.grid()\n", 34 | "plt.legend()\n", 35 | "plt.show()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# prefill time\n", 45 | "x = df.query('num_predict_expert_per_layer == 0').query('reorder_experts == False').query('early_preempt == False')['cache_rate']\n", 46 | "plt.plot(x, df.query('num_predict_expert_per_layer == 0').query('reorder_experts == False').query('early_preempt == False')['prefill_stage_forward_time']/1000, color=(1,0,0,1), label='base')\n", 47 | "plt.plot(x, df.query('num_predict_expert_per_layer != 0').query('reorder_experts == True').query('early_preempt == True')['prefill_stage_forward_time']/1000, color=(0,0,1,1), label='ours')\n", 48 | "\n", 49 | "plt.title(model_id)\n", 50 | "plt.xlabel('cache rate')\n", 51 | "plt.ylabel('prefill stage time per token (ms)')\n", 52 | "plt.ylim(bottom=0)\n", 53 | "plt.grid()\n", 54 | "plt.legend()\n", 55 | "plt.show()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# improvement over baseline(decode)\n", 65 | "baseline = df.query('num_predict_expert_per_layer == 0').query('reorder_experts == False').query('early_preempt == False')\n", 66 | "x = baseline['cache_rate']\n", 67 | "ours = df\n", 68 | "\n", 69 | 
"plt.plot(x, np.ones_like(x), color=(1,0,0,1), label='base-weak2')\n", 70 | "plt.plot(x, np.asarray(baseline['decode_stage_forward_time'])/(ours.query('num_predict_expert_per_layer != 0').query('reorder_experts == True').query('early_preempt == True')['decode_stage_forward_time']), color=(0,0,1,1), label='ours')\n", 71 | "\n", 72 | "plt.title(model_id)\n", 73 | "plt.xlabel('cache rate')\n", 74 | "plt.ylabel('improvement to base-weak2')\n", 75 | "_, top_ylim = plt.gca().get_ylim()\n", 76 | "top_ylim = np.round((top_ylim + 0.1) * 5) / 5\n", 77 | "plt.ylim(bottom=0,top=top_ylim)\n", 78 | "plt.grid()\n", 79 | "plt.legend()\n", 80 | "plt.show()" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "base", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.10.13" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 2 105 | } 106 | -------------------------------------------------------------------------------- /examples/full-eval/runner.py: -------------------------------------------------------------------------------- 1 | from eval_helper.config import RunConfigBase, OptionCmdLine, OptionEnv, OptionApp, ConfigList, ResultFloat 2 | import os 3 | 4 | my_app = RunConfigBase() 5 | my_app.app = OptionApp('python3 transformers-app.py', False, False) 6 | my_app.logdir = 'run-logs' 7 | my_app.config_dict = { 8 | 'model_id' : OptionCmdLine('model_id'), 9 | 'model_revision' : OptionCmdLine('model_revision'), 10 | 'dataset' : OptionCmdLine('dataset'), 11 | 'batch_size' : OptionCmdLine('batch_size'), 12 | 'num_predict_expert_per_layer' : OptionCmdLine('num_predict_expert_per_layer', readable_name='predict', logname='predict'), 13 | 'cache_rate' : OptionCmdLine('cache_rate'), 14 | 'cache_policy' : OptionCmdLine('cache_policy', readable_name='policy', logname='policy'), 15 | 'per_layer_cache' : OptionCmdLine('per_layer_cache', logname=False), 16 | 'reorder_experts' : OptionCmdLine('reorder_experts', logname='reorder'), 17 | 'early_preempt' : OptionCmdLine('early_preempt', logname='early'), 18 | 'predict_input_mode' : OptionCmdLine('predict_input_mode', logname=False), 19 | 'layer_predict_interval' : OptionCmdLine('layer_predict_interval', logname='p_int'), 20 | 'layer_predict_max_window' : OptionCmdLine('layer_predict_max_window', logname='p_win'), 21 | 'layer_predict_use_last_output' : OptionCmdLine('layer_predict_use_last_output', logname='p_last'), 22 | 'predictor_model_path' : OptionCmdLine('predictor_model_path', readable_name=False, logname=False), 23 | 'log_level' : OptionEnv('SPARSE_CACHE_LOG_LEVEL', readable_name=False, logname=False), 24 | } 25 | 26 | my_app.result_dict = { 27 | 'decode_stage_forward_time' : ResultFloat('decode_stage_forward_time'), 28 | 'prefill_stage_forward_time' : ResultFloat('prefill_stage_forward_time'), 29 | 'decode_stage_hit_rate' : ResultFloat('decode_stage_hit_rate'), 30 | 'prefill_stage_hit_rate' : ResultFloat('prefill_stage_hit_rate'), 31 | 'decode_stage_ready_rate' : ResultFloat('decode_stage_ready_rate'), 32 | 'prefill_stage_ready_rate' : ResultFloat('prefill_stage_ready_rate'), 33 | } 34 | my_app['per_layer_cache'] = True 35 | base_cfg_list = ConfigList.MakeList(my_app) 36 | 37 | full_list = ConfigList.Empty() 38 | 39 | ### options to control: 
prefetch, reorder, early_preempt 40 | 41 | template_cfg_list = (base_cfg_list.copy() 42 | .override('cache_policy', ['lru',]) 43 | .override('batch_size', [1]) 44 | .override('per_layer_cache', [True]) 45 | .override('predict_input_mode', ['moe_layer_logits']) 46 | .override('layer_predict_interval', [1]) 47 | .override('layer_predict_max_window', [3]) 48 | .override('layer_predict_use_last_output', [ 49 | True, 50 | # False, 51 | ]) 52 | .override('dataset', ['chatgpt-prompts-small']) 53 | ) 54 | 55 | full_list.concat(template_cfg_list.copy() 56 | .override('model_id', ['deepseek-ai/deepseek-moe-16b-chat',]) 57 | .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32]]) 58 | .override('predictor_model_path', ['/code/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat/moe-layer-logits']) 59 | .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 60 | [0, False, False], ## weak baseline 61 | [6, True, True], ## +p+opt 62 | ]) 63 | ) 64 | 65 | if __name__ == '__main__': 66 | from eval_helper.runner_args import parse_args 67 | args = parse_args() 68 | if 'run' in args.commands: 69 | full_list.run(mock=args.mock, durable_log=args.durable_log, fail_only=args.fail_only) 70 | if 'parse' in args.commands: 71 | full_list.override('logdir', [args.logdir]) 72 | full_list.parse() 73 | full_list.to_pdframe([ 74 | 'model_id', 75 | 'batch_size', 76 | 'reorder_experts', 77 | 'early_preempt', 78 | 'num_predict_expert_per_layer', 79 | 'cache_rate', 80 | 'decode_stage_hit_rate', 81 | 'prefill_stage_hit_rate', 82 | 'decode_stage_ready_rate', 83 | 'prefill_stage_ready_rate', 84 | 'decode_stage_forward_time', 85 | 'prefill_stage_forward_time', 86 | ]).to_csv(args.parse_output, index=False) 87 | -------------------------------------------------------------------------------- /examples/full-eval/transformers-app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | 6 | from transformers.utils import logging 7 | import torch 8 | torch.cuda.set_device(0) 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | import sparse_llm_cache 12 | import time 13 | 14 | from sparse_llm_cache.utils.runner_util import parse_args 15 | cache_configs = parse_args() 16 | for k, v in cache_configs.items(): print(k,v) 17 | 18 | sparse_llm_cache.utils.hack_transformers(**cache_configs, pin_memory=True, enable_model_timer=True) 19 | 20 | print("loading model...") 21 | load_time_start = time.time() 22 | logging.disable_progress_bar() 23 | model_id = cache_configs['model_id'] 24 | torch_dtype = 'auto' 25 | if 'Mixtral' in model_id or 'GPTQ' in model_id: 26 | torch_dtype = None 27 | print("dtype is", torch_dtype) 28 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 29 | tokenizer.pad_token = tokenizer.eos_token 30 | model = AutoModelForCausalLM.from_pretrained( 31 | model_id, 32 | # torch_dtype=torch.float16, 33 | torch_dtype=torch_dtype, 34 | local_files_only=True, 35 | device_map=0, 36 | trust_remote_code=True, 37 | revision=cache_configs['model_revision'], 38 | ) 39 | if 'Mixtral' in model_id: 40 | import auto_gptq 41 | model = auto_gptq.exllama_set_max_input_length(model, 7200) 42 | print("loading model...done", time.time() - load_time_start) 43 | 44 | def gen_batch(text_list, do_print=False, max_new_tokens=100): 45 | inputs = tokenizer(text_list, return_tensors="pt", 
padding=True).to(f"cuda") # input_ids, attention_mask 46 | input_len = inputs['input_ids'].shape[1] 47 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 48 | output_len = outputs.shape[1] - input_len 49 | outputs = outputs[:, input_len:] 50 | output_len = outputs.shape[1] 51 | output_str = tokenizer.batch_decode(outputs) 52 | if do_print: 53 | print(text_list, output_str, flush=True) 54 | return input_len, output_len 55 | 56 | dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 57 | print(dataset_path) 58 | prompts = torch.load(dataset_path) 59 | 60 | from torch.utils.data import Dataset 61 | class StringListDataset(Dataset): 62 | def __init__(self, string_list): 63 | self.string_list = string_list 64 | def __len__(self): 65 | return len(self.string_list) 66 | def __getitem__(self, idx): 67 | return self.string_list[idx] 68 | ds = StringListDataset(prompts) 69 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 70 | 71 | for seq_id,text_list in enumerate(dl): 72 | if seq_id > cache_configs['max_num_batch']: 73 | print("max_num_batch reached") 74 | break 75 | start_time = time.time() 76 | try: 77 | input_len, output_len = gen_batch(text_list, max_new_tokens=128, do_print=True) 78 | except Exception as e: 79 | print(f"error at seq {seq_id}") 80 | print(str(e)) 81 | print(input_len, output_len, time.time() - start_time, flush=True) -------------------------------------------------------------------------------- /examples/profiler/transformers-app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | 6 | from transformers.utils import logging 7 | import torch 8 | torch.cuda.set_device(0) 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | # import sparse_llm_cache 12 | import time 13 | 14 | # from sparse_llm_cache.utils.runner_util import parse_args 15 | # cache_configs = parse_args() 16 | # for k, v in cache_configs.items(): print(k,v) 17 | 18 | # sparse_llm_cache.utils.hack_transformers(**cache_configs, pin_memory=True, enable_model_timer=True) 19 | 20 | print("loading model...") 21 | load_time_start = time.time() 22 | logging.disable_progress_bar() 23 | # model_id = cache_configs['model_id'] 24 | 25 | model_id = "deepseek-ai/deepseek-moe-16b-chat" 26 | torch_dtype = 'auto' 27 | if 'Mixtral' in model_id or 'GPTQ' in model_id: 28 | torch_dtype = None 29 | print("dtype is", torch_dtype) 30 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 31 | tokenizer.pad_token = tokenizer.eos_token 32 | model = AutoModelForCausalLM.from_pretrained( 33 | model_id, 34 | # torch_dtype=torch.float16, 35 | torch_dtype=torch_dtype, 36 | local_files_only=True, 37 | device_map=0, 38 | trust_remote_code=True, 39 | # revision=cache_configs['model_revision'], 40 | ) 41 | if 'Mixtral' in model_id: 42 | import auto_gptq 43 | model = auto_gptq.exllama_set_max_input_length(model, 7200) 44 | print("loading model...done", time.time() - load_time_start) 45 | 46 | def gen_batch(text_list, do_print=False, max_new_tokens=100): 47 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 48 | input_len = inputs['input_ids'].shape[1] 49 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 50 | output_len = outputs.shape[1] - input_len 51 | outputs = outputs[:, input_len:] 52 | 
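    # output_len is recomputed below after slicing off the prompt prefix; it is
    # numerically identical to the value computed above, so only the decoded
    # text (not the reported length) is affected by the slice.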
output_len = outputs.shape[1] 53 | output_str = tokenizer.batch_decode(outputs) 54 | if do_print: 55 | print(text_list, output_str, flush=True) 56 | return input_len, output_len 57 | 58 | # dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 59 | # print(dataset_path) 60 | # prompts = torch.load(dataset_path) 61 | 62 | # from torch.utils.data import Dataset 63 | # class StringListDataset(Dataset): 64 | # def __init__(self, string_list): 65 | # self.string_list = string_list 66 | # def __len__(self): 67 | # return len(self.string_list) 68 | # def __getitem__(self, idx): 69 | # return self.string_list[idx] 70 | # ds = StringListDataset(prompts) 71 | # dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 72 | 73 | 74 | 75 | from torch.profiler import profile, record_function, ProfilerActivity 76 | # profiler = profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True, with_stack=True) 77 | # profiler = profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) 78 | # profiler, profile_fname = profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_modules=True), "nllb-transformer-trace-module.json" 79 | # profiler, profile_fname = profile(activities=[ProfilerActivity.CUDA], with_stack=True), "nllb-transformer-trace-cuda-only-stack.json" 80 | 81 | _global_profiler : profile = None 82 | # profile_fname = "nllb-transformer-trace-stack-short.json" 83 | 84 | def create_profile(record_shapes=True,profile_memory=True,with_stack=True, with_cpu=True, with_cuda=True): 85 | global _global_profiler 86 | activities = [] 87 | if with_cpu: 88 | activities.append(ProfilerActivity.CPU) 89 | if with_cuda: 90 | activities.append(ProfilerActivity.CUDA) 91 | _global_profiler = profile( 92 | activities=activities, 93 | with_stack=with_stack, 94 | record_shapes=record_shapes, 95 | profile_memory=profile_memory, 96 | ) 97 | 98 | def start_profile() : 99 | global _global_profiler 100 | _global_profiler.__enter__() 101 | 102 | def stop_profile(): 103 | global _global_profiler 104 | _global_profiler.__exit__(None, None, None) 105 | 106 | def export_trace(fname): 107 | _global_profiler.export_chrome_trace(fname) 108 | 109 | 110 | 111 | prompts = ["Introduce yourself"] 112 | 113 | input_len, output_len = gen_batch(prompts, max_new_tokens=10, do_print=True) 114 | input_len, output_len = gen_batch(prompts, max_new_tokens=10, do_print=True) 115 | input_len, output_len = gen_batch(prompts, max_new_tokens=10, do_print=True) 116 | 117 | create_profile(record_shapes=False, profile_memory=False, with_stack=True, with_cpu=True, with_cuda=True) 118 | start_profile() 119 | input_len, output_len = gen_batch(prompts, max_new_tokens=10, do_print=True) 120 | stop_profile() 121 | export_trace('trace-cuda.json') -------------------------------------------------------------------------------- /examples/small-demo-gptq/transformers-accelerate.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | os.environ['PYTORCH_CUDA_ALLOC_CONF'] = "expandable_segments:True" 6 | 7 | import json 8 | import time 9 | import torch 10 | from transformers.utils import logging 11 | from transformers.generation.utils import TimeProfiler, recursive_attach 12 | import torch 13 | torch.cuda.set_device(0) 14 | from transformers import 
AutoTokenizer 15 | from auto_gptq import AutoGPTQForCausalLM 16 | 17 | import sparse_llm_cache 18 | sparse_llm_cache.cpp_worker.auto_eat_cuda_memory() 19 | from sparse_llm_cache.utils import repo_folder_name 20 | from accelerate import dispatch_model, infer_auto_device_map 21 | from accelerate.utils import get_max_memory 22 | 23 | def prepare_args(): 24 | from sparse_llm_cache.utils.runner_util import parse_args, prepare_argparser 25 | parser = prepare_argparser() 26 | parser.add_argument('--save_dir', type=str, help='By default, it will be inferred from model_id at /') 27 | parser.add_argument('--save_dir_base', type=str, default='/code/gptq-models-4bits', help='Base directory to save the quantized model') 28 | parser.add_argument('--max_gpu_memory', type=float, default=0.95, help='Max gpu memory to use for the model') 29 | parser.add_argument("--backend", type=str, default='EXLLAMA', choices=['TRITONV2', 'EXLLAMA']) 30 | # parser.add_argument('--dtype', type=str, default='auto', help='Compute dtype. Default is auto infer from model config.json') 31 | # parser.add_argument('--backend', type=str, default='aten', choices=['none', 'torch', 'torchao', 'marlin', 'bitblas', 'aten'], help='Backend for inference. Default is aten. Currently aten is the fastest backend.') 32 | cache_configs = parse_args(parser=parser) 33 | return cache_configs 34 | 35 | def load_model(cache_configs): 36 | model_id = cache_configs['model_id'] 37 | save_dir = cache_configs['save_dir'] 38 | if save_dir is None: 39 | save_dir = os.path.join(cache_configs['save_dir_base'], repo_folder_name(model_id)) 40 | 41 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 42 | tokenizer.pad_token = tokenizer.eos_token 43 | 44 | use_tritonv2 = False 45 | disable_exllama = False 46 | disable_exllamav2 = False 47 | 48 | if cache_configs['backend'] == 'TRITONV2': 49 | use_tritonv2 = True 50 | elif cache_configs['backend'] == 'EXLLAMA': 51 | disable_exllama = False 52 | disable_exllamav2 = True 53 | 54 | model = AutoGPTQForCausalLM.from_quantized( 55 | save_dir, 56 | torch_dtype=torch.float16, 57 | local_files_only=True, 58 | device='cpu', 59 | trust_remote_code=True, 60 | revision=cache_configs['model_revision'], 61 | use_tritonv2=use_tritonv2, 62 | disable_exllama = disable_exllama, 63 | disable_exllamav2 = disable_exllamav2, 64 | ) 65 | 66 | device_map = infer_auto_device_map(model, max_memory={0: "6GiB", 'cpu': "120GiB"}) 67 | max_memory = get_max_memory() 68 | max_memory[0] = int(max_memory[0] * cache_configs['max_gpu_memory']) 69 | device_map = infer_auto_device_map(model, max_memory=max_memory) 70 | print(device_map) 71 | model = dispatch_model(model, device_map=device_map, main_device=0, offload_buffers=True) 72 | 73 | time_profiler = TimeProfiler() 74 | recursive_attach(model, time_profiler, '_time_profiler') 75 | 76 | model.eval() 77 | 78 | return model, tokenizer, time_profiler 79 | 80 | def gen_batch(model, tokenizer, text_list, do_print=False, max_new_tokens=100): 81 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 82 | input_len = inputs['input_ids'].shape[1] 83 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 84 | output_len = outputs.shape[1] - input_len 85 | outputs = outputs[:, input_len:] 86 | output_len = outputs.shape[1] 87 | output_str = tokenizer.batch_decode(outputs) 88 | if do_print: 89 | print(text_list, output_str, flush=True) 90 | return input_len, output_len 91 | 92 | def load_prompt_list(cache_configs): 93 
| dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 94 | print(dataset_path) 95 | prompts = torch.load(dataset_path) 96 | 97 | from torch.utils.data import Dataset 98 | class StringListDataset(Dataset): 99 | def __init__(self, string_list): 100 | self.string_list = string_list 101 | def __len__(self): 102 | return len(self.string_list) 103 | def __getitem__(self, idx): 104 | return self.string_list[idx] 105 | ds = StringListDataset(prompts) 106 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 107 | return dl 108 | 109 | def main(cache_configs): 110 | load_time_start = time.time() 111 | model, tokenizer, time_profiler = load_model(cache_configs) 112 | load_model_time = time.time() - load_time_start 113 | print("Loading model...done", load_model_time) 114 | 115 | dl = load_prompt_list(cache_configs) 116 | 117 | eval_time_start = time.time() 118 | for seq_id,text_list in enumerate(dl): 119 | if seq_id >= cache_configs['max_num_batch']: 120 | print("max_num_batch reached") 121 | break 122 | print(f'Seq {seq_id}/{cache_configs["max_num_batch"]}, decoding...', flush=True) 123 | input_len, output_len = gen_batch(model, tokenizer, text_list, max_new_tokens=128, do_print=True) 124 | eval_time = time.time() - eval_time_start 125 | 126 | time_profiler.log() 127 | sparse_llm_cache.cpp_worker.log_gpu_mem_info() 128 | print("load_model_time:", load_model_time) 129 | print("eval_time:", eval_time) 130 | 131 | if __name__ == "__main__": 132 | cache_configs = prepare_args() 133 | main(cache_configs) -------------------------------------------------------------------------------- /examples/small-demo-gptq/transformers-app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | 6 | import json 7 | import time 8 | import torch 9 | from transformers.utils import logging 10 | logging.disable_progress_bar() 11 | from transformers.generation.utils import TimeProfiler, recursive_attach 12 | torch.cuda.set_device(0) 13 | from transformers import AutoTokenizer 14 | from auto_gptq import AutoGPTQForCausalLM 15 | 16 | import sparse_llm_cache 17 | from sparse_llm_cache.utils import repo_folder_name 18 | 19 | import argparse 20 | from accelerate import dispatch_model, infer_auto_device_map 21 | 22 | def prepare_args(): 23 | from sparse_llm_cache.utils.runner_util import parse_args, prepare_argparser 24 | parser = prepare_argparser() 25 | parser.add_argument('--save_dir', type=str, help='By default, it will be inferred from model_id at /') 26 | parser.add_argument('--save_dir_base', type=str, default='/code/gptq-models-4bits', help='Base directory to save the quantized model') 27 | parser.add_argument("--backend", type=str, default='EXLLAMA', choices=['TRITONV2', 'EXLLAMA']) 28 | # parser.add_argument('--dtype', type=str, default='auto', help='Compute dtype. Default is auto infer from model config.json') 29 | # parser.add_argument('--backend', type=str, default='aten', choices=['none', 'torch', 'torchao', 'marlin', 'bitblas', 'aten'], help='Backend for inference. Default is aten. 
Currently aten is the fastest backend.') 30 | cache_configs = parse_args(parser=parser) 31 | return cache_configs 32 | 33 | def load_model(cache_configs): 34 | model_id = cache_configs['model_id'] 35 | save_dir = cache_configs['save_dir'] 36 | if save_dir is None: 37 | save_dir = os.path.join(cache_configs['save_dir_base'], repo_folder_name(model_id)) 38 | 39 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 40 | tokenizer.pad_token = tokenizer.eos_token 41 | 42 | use_tritonv2 = False 43 | disable_exllama = False 44 | disable_exllamav2 = False 45 | 46 | if cache_configs['backend'] == 'TRITONV2': 47 | use_tritonv2 = True 48 | elif cache_configs['backend'] == 'EXLLAMA': 49 | disable_exllama = False 50 | disable_exllamav2 = True 51 | 52 | model = AutoGPTQForCausalLM.from_quantized( 53 | save_dir, 54 | torch_dtype=torch.float16, 55 | local_files_only=True, 56 | device='cpu', 57 | trust_remote_code=True, 58 | revision=cache_configs['model_revision'], 59 | use_tritonv2=use_tritonv2, 60 | disable_exllama = disable_exllama, 61 | disable_exllamav2 = disable_exllamav2, 62 | ) 63 | 64 | sparse_llm_cache.utils.inject_model(model.model, **cache_configs) 65 | model.to('cuda') 66 | 67 | time_profiler = TimeProfiler() 68 | recursive_attach(model, time_profiler, '_time_profiler') 69 | 70 | model.eval() 71 | 72 | return model, tokenizer, time_profiler 73 | 74 | def gen_batch(model, tokenizer, text_list, do_print=False, max_new_tokens=100): 75 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 76 | input_len = inputs['input_ids'].shape[1] 77 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 78 | output_len = outputs.shape[1] - input_len 79 | outputs = outputs[:, input_len:] 80 | output_len = outputs.shape[1] 81 | output_str = tokenizer.batch_decode(outputs) 82 | if do_print: 83 | print(text_list, output_str, flush=True) 84 | return input_len, output_len 85 | 86 | def load_prompt_list(cache_configs): 87 | dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 88 | print(dataset_path) 89 | prompts = torch.load(dataset_path) 90 | 91 | from torch.utils.data import Dataset 92 | class StringListDataset(Dataset): 93 | def __init__(self, string_list): 94 | self.string_list = string_list 95 | def __len__(self): 96 | return len(self.string_list) 97 | def __getitem__(self, idx): 98 | return self.string_list[idx] 99 | ds = StringListDataset(prompts) 100 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 101 | return dl 102 | 103 | def main(cache_configs): 104 | load_time_start = time.time() 105 | model, tokenizer, time_profiler = load_model(cache_configs) 106 | load_model_time = time.time() - load_time_start 107 | print("Loading model...done", load_model_time) 108 | 109 | dl = load_prompt_list(cache_configs) 110 | 111 | eval_time_start = time.time() 112 | for seq_id,text_list in enumerate(dl): 113 | if seq_id >= cache_configs['max_num_batch']: 114 | print("max_num_batch reached") 115 | break 116 | print(f'Seq {seq_id}/{cache_configs["max_num_batch"]}, decoding...', flush=True) 117 | input_len, output_len = gen_batch(model, tokenizer, text_list, max_new_tokens=128, do_print=True) 118 | eval_time = time.time() - eval_time_start 119 | 120 | time_profiler.log() 121 | sparse_llm_cache.cpp_worker.log_gpu_mem_info() 122 | print("load_model_time:", load_model_time) 123 | print("eval_time:", eval_time) 124 | 125 | if __name__ == "__main__": 
126 | cache_configs = prepare_args() 127 | main(cache_configs) 128 | -------------------------------------------------------------------------------- /examples/small-demo-gptq/transformers-um.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | 6 | import json 7 | import time 8 | import torch 9 | from transformers.utils import logging 10 | from transformers.generation.utils import TimeProfiler, recursive_attach 11 | import torch 12 | torch.cuda.set_device(0) 13 | from transformers import AutoTokenizer 14 | from auto_gptq import AutoGPTQForCausalLM 15 | 16 | import sparse_llm_cache 17 | sparse_llm_cache.cpp_worker.auto_eat_cuda_memory() 18 | from sparse_llm_cache.utils import repo_folder_name 19 | 20 | def prepare_args(): 21 | from sparse_llm_cache.utils.runner_util import parse_args, prepare_argparser 22 | parser = prepare_argparser() 23 | parser.add_argument('--save_dir', type=str, help='By default, it will be inferred from model_id at /') 24 | parser.add_argument('--save_dir_base', type=str, default='/code/gptq-models-4bits', help='Base directory to save the quantized model') 25 | parser.add_argument('--max_gpu_memory', type=float, default=0.8, help='Max gpu memory to use for the model') 26 | parser.add_argument("--backend", type=str, default='EXLLAMA', choices=['TRITONV2', 'EXLLAMA']) 27 | # parser.add_argument('--dtype', type=str, default='auto', help='Compute dtype. Default is auto infer from model config.json') 28 | cache_configs = parse_args(parser=parser) 29 | return cache_configs 30 | 31 | def load_model(cache_configs): 32 | model_id = cache_configs['model_id'] 33 | save_dir = cache_configs['save_dir'] 34 | if save_dir is None: 35 | save_dir = os.path.join(cache_configs['save_dir_base'], repo_folder_name(model_id)) 36 | 37 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 38 | tokenizer.pad_token = tokenizer.eos_token 39 | 40 | use_tritonv2 = False 41 | disable_exllama = False 42 | disable_exllamav2 = False 43 | 44 | if cache_configs['backend'] == 'TRITONV2': 45 | use_tritonv2 = True 46 | elif cache_configs['backend'] == 'EXLLAMA': 47 | disable_exllama = False 48 | disable_exllamav2 = True 49 | 50 | model = AutoGPTQForCausalLM.from_quantized( 51 | save_dir, 52 | torch_dtype=torch.float16, 53 | local_files_only=True, 54 | device='cpu', 55 | trust_remote_code=True, 56 | revision=cache_configs['model_revision'], 57 | use_tritonv2=use_tritonv2, 58 | disable_exllama = disable_exllama, 59 | disable_exllamav2 = disable_exllamav2, 60 | ) 61 | 62 | sparse_llm_cache.utils.inject_model_um(model.model, model_id) 63 | model.to('cuda') 64 | 65 | time_profiler = TimeProfiler() 66 | recursive_attach(model, time_profiler, '_time_profiler') 67 | 68 | model.eval() 69 | 70 | return model, tokenizer, time_profiler 71 | 72 | def gen_batch(model, tokenizer, text_list, do_print=False, max_new_tokens=100): 73 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 74 | input_len = inputs['input_ids'].shape[1] 75 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 76 | output_len = outputs.shape[1] - input_len 77 | outputs = outputs[:, input_len:] 78 | output_len = outputs.shape[1] 79 | output_str = tokenizer.batch_decode(outputs) 80 | if do_print: 81 | print(text_list, output_str, flush=True) 82 | return input_len, output_len 83 | 84 | def load_prompt_list(cache_configs): 85 | 
dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 86 | print(dataset_path) 87 | prompts = torch.load(dataset_path) 88 | 89 | from torch.utils.data import Dataset 90 | class StringListDataset(Dataset): 91 | def __init__(self, string_list): 92 | self.string_list = string_list 93 | def __len__(self): 94 | return len(self.string_list) 95 | def __getitem__(self, idx): 96 | return self.string_list[idx] 97 | ds = StringListDataset(prompts) 98 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 99 | return dl 100 | 101 | def main(cache_configs): 102 | load_time_start = time.time() 103 | model, tokenizer, time_profiler = load_model(cache_configs) 104 | load_model_time = time.time() - load_time_start 105 | print("Loading model...done", load_model_time) 106 | 107 | dl = load_prompt_list(cache_configs) 108 | 109 | eval_time_start = time.time() 110 | for seq_id,text_list in enumerate(dl): 111 | if seq_id >= cache_configs['max_num_batch']: 112 | print("max_num_batch reached") 113 | break 114 | print(f'Seq {seq_id}/{cache_configs["max_num_batch"]}, decoding...', flush=True) 115 | input_len, output_len = gen_batch(model, tokenizer, text_list, max_new_tokens=128, do_print=True) 116 | eval_time = time.time() - eval_time_start 117 | 118 | time_profiler.log() 119 | sparse_llm_cache.cpp_worker.log_gpu_mem_info() 120 | print("load_model_time:", load_model_time) 121 | print("eval_time:", eval_time) 122 | 123 | if __name__ == "__main__": 124 | cache_configs = prepare_args() 125 | main(cache_configs) -------------------------------------------------------------------------------- /examples/small-demo-hqq/transformers-accelerate.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | os.environ['PYTORCH_CUDA_ALLOC_CONF'] = "expandable_segments:True" 6 | 7 | import json 8 | import time 9 | import torch 10 | from transformers.utils import logging 11 | logging.disable_progress_bar() 12 | from transformers.generation.utils import TimeProfiler, recursive_attach 13 | torch.cuda.set_device(0) 14 | from transformers import AutoModelForCausalLM, AutoTokenizer 15 | 16 | import sparse_llm_cache 17 | sparse_llm_cache.cpp_worker.auto_eat_cuda_memory() 18 | from sparse_llm_cache.utils import repo_folder_name 19 | 20 | from hqq.models.hf.base import AutoHQQHFModel 21 | from hqq.core.quantize import HQQLinear, HQQBackend 22 | from hqq.utils.patching import prepare_for_inference 23 | import argparse 24 | from accelerate import dispatch_model, infer_auto_device_map 25 | from accelerate.utils import get_max_memory 26 | 27 | def prepare_args(): 28 | from sparse_llm_cache.utils.runner_util import parse_args, prepare_argparser 29 | parser = prepare_argparser() 30 | parser.add_argument('--save_dir', type=str, help='By default, it will be inferred from model_id at /') 31 | parser.add_argument('--save_dir_base', type=str, default='/code/hqq-quant-model', help='Base directory to save the quantized model') 32 | parser.add_argument('--max_gpu_memory', type=float, default=0.8, help='Max gpu memory to use for the model') 33 | # parser.add_argument('--dtype', type=str, default='auto', help='Compute dtype. Default is auto infer from model config.json') 34 | # parser.add_argument('--backend', type=str, default='aten', choices=['none', 'torch', 'torchao', 'marlin', 'bitblas', 'aten'], help='Backend for inference. 
Default is aten. Currently aten is the fastest backend.') 35 | cache_configs = parse_args(parser=parser) 36 | return cache_configs 37 | 38 | def load_model(cache_configs): 39 | model_id = cache_configs['model_id'] 40 | save_dir = cache_configs['save_dir'] 41 | if save_dir is None: 42 | save_dir = os.path.join(cache_configs['save_dir_base'], repo_folder_name(model_id)) 43 | # if cache_configs['backend'] == 'aten': 44 | # HQQLinear.set_backend(HQQBackend.ATEN) 45 | HQQLinear.set_backend(HQQBackend.ATEN) 46 | 47 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 48 | tokenizer.pad_token = tokenizer.eos_token 49 | 50 | with open(os.path.join(save_dir, 'config.json'), 'r') as f: 51 | dtype = json.load(f)['torch_dtype'] 52 | dtype = getattr(torch, dtype) 53 | 54 | print(f"Loading model {model_id} with dtype {dtype} and backend aten") 55 | model = AutoHQQHFModel.from_quantized(save_dir, trust_remote_code=True, device='cpu', compute_dtype=dtype) 56 | 57 | # device_map = infer_auto_device_map(model, max_memory={0: "6GiB", 'cpu': "120GiB"}) 58 | max_memory = get_max_memory() 59 | max_memory[0] = int(max_memory[0] * cache_configs['max_gpu_memory']) 60 | device_map = infer_auto_device_map(model, max_memory=max_memory) 61 | print(device_map) 62 | model = dispatch_model(model, device_map=device_map, main_device=0, offload_buffers=True) 63 | 64 | time_profiler = TimeProfiler() 65 | recursive_attach(model, time_profiler, '_time_profiler') 66 | 67 | model.eval() 68 | 69 | return model, tokenizer, time_profiler 70 | 71 | def gen_batch(model, tokenizer, text_list, do_print=False, max_new_tokens=100): 72 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 73 | input_len = inputs['input_ids'].shape[1] 74 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 75 | output_len = outputs.shape[1] - input_len 76 | outputs = outputs[:, input_len:] 77 | output_len = outputs.shape[1] 78 | output_str = tokenizer.batch_decode(outputs) 79 | if do_print: 80 | print(text_list, output_str, flush=True) 81 | return input_len, output_len 82 | 83 | def load_prompt_list(cache_configs): 84 | dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 85 | print(dataset_path) 86 | prompts = torch.load(dataset_path) 87 | 88 | from torch.utils.data import Dataset 89 | class StringListDataset(Dataset): 90 | def __init__(self, string_list): 91 | self.string_list = string_list 92 | def __len__(self): 93 | return len(self.string_list) 94 | def __getitem__(self, idx): 95 | return self.string_list[idx] 96 | ds = StringListDataset(prompts) 97 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 98 | return dl 99 | 100 | def main(cache_configs): 101 | load_time_start = time.time() 102 | model, tokenizer, time_profiler = load_model(cache_configs) 103 | load_model_time = time.time() - load_time_start 104 | print("Loading model...done", load_model_time) 105 | 106 | dl = load_prompt_list(cache_configs) 107 | 108 | eval_time_start = time.time() 109 | for seq_id,text_list in enumerate(dl): 110 | if seq_id >= cache_configs['max_num_batch']: 111 | print("max_num_batch reached") 112 | break 113 | print(f'Seq {seq_id}/{cache_configs["max_num_batch"]}, decoding...', flush=True) 114 | input_len, output_len = gen_batch(model, tokenizer, text_list, max_new_tokens=128, do_print=True) 115 | eval_time = time.time() - eval_time_start 116 | 117 | time_profiler.log() 118 | 
sparse_llm_cache.cpp_worker.log_gpu_mem_info() 119 | print("load_model_time:", load_model_time) 120 | print("eval_time:", eval_time) 121 | 122 | if __name__ == "__main__": 123 | cache_configs = prepare_args() 124 | main(cache_configs) 125 | -------------------------------------------------------------------------------- /examples/small-demo-hqq/transformers-app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | 6 | import json 7 | import time 8 | import torch 9 | from transformers.utils import logging 10 | logging.disable_progress_bar() 11 | from transformers.generation.utils import TimeProfiler, recursive_attach 12 | torch.cuda.set_device(0) 13 | from transformers import AutoModelForCausalLM, AutoTokenizer 14 | 15 | import sparse_llm_cache 16 | from sparse_llm_cache.utils import repo_folder_name 17 | 18 | from hqq.models.hf.base import AutoHQQHFModel 19 | from hqq.core.quantize import HQQLinear, HQQBackend 20 | from hqq.utils.patching import prepare_for_inference 21 | import argparse 22 | from accelerate import dispatch_model, infer_auto_device_map 23 | 24 | def prepare_args(): 25 | from sparse_llm_cache.utils.runner_util import parse_args, prepare_argparser 26 | parser = prepare_argparser() 27 | parser.add_argument('--save_dir', type=str, help='By default, it will be inferred from model_id at /') 28 | parser.add_argument('--save_dir_base', type=str, default='/code/hqq-quant-model', help='Base directory to save the quantized model') 29 | # parser.add_argument('--dtype', type=str, default='auto', help='Compute dtype. Default is auto infer from model config.json') 30 | # parser.add_argument('--backend', type=str, default='aten', choices=['none', 'torch', 'torchao', 'marlin', 'bitblas', 'aten'], help='Backend for inference. Default is aten. 
Currently aten is the fastest backend.') 31 | cache_configs = parse_args(parser=parser) 32 | return cache_configs 33 | 34 | def load_model(cache_configs): 35 | model_id = cache_configs['model_id'] 36 | save_dir = cache_configs['save_dir'] 37 | if save_dir is None: 38 | save_dir = os.path.join(cache_configs['save_dir_base'], repo_folder_name(model_id)) 39 | # if cache_configs['backend'] == 'aten': 40 | # HQQLinear.set_backend(HQQBackend.ATEN) 41 | HQQLinear.set_backend(HQQBackend.ATEN) 42 | 43 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 44 | tokenizer.pad_token = tokenizer.eos_token 45 | 46 | with open(os.path.join(save_dir, 'config.json'), 'r') as f: 47 | dtype = json.load(f)['torch_dtype'] 48 | dtype = getattr(torch, dtype) 49 | 50 | print(f"Loading model {model_id} with dtype {dtype} and backend aten") 51 | model = AutoHQQHFModel.from_quantized(save_dir, trust_remote_code=True, device='cpu', compute_dtype=dtype) 52 | 53 | sparse_llm_cache.utils.inject_model(model, **cache_configs) 54 | model.to('cuda') 55 | 56 | time_profiler = TimeProfiler() 57 | recursive_attach(model, time_profiler, '_time_profiler') 58 | 59 | model.eval() 60 | 61 | return model, tokenizer, time_profiler 62 | 63 | def gen_batch(model, tokenizer, text_list, do_print=False, max_new_tokens=100): 64 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 65 | input_len = inputs['input_ids'].shape[1] 66 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 67 | output_len = outputs.shape[1] - input_len 68 | outputs = outputs[:, input_len:] 69 | output_len = outputs.shape[1] 70 | output_str = tokenizer.batch_decode(outputs) 71 | if do_print: 72 | print(text_list, output_str, flush=True) 73 | return input_len, output_len 74 | 75 | def load_prompt_list(cache_configs): 76 | dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 77 | print(dataset_path) 78 | prompts = torch.load(dataset_path) 79 | 80 | from torch.utils.data import Dataset 81 | class StringListDataset(Dataset): 82 | def __init__(self, string_list): 83 | self.string_list = string_list 84 | def __len__(self): 85 | return len(self.string_list) 86 | def __getitem__(self, idx): 87 | return self.string_list[idx] 88 | ds = StringListDataset(prompts) 89 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 90 | return dl 91 | 92 | def main(cache_configs): 93 | load_time_start = time.time() 94 | model, tokenizer, time_profiler = load_model(cache_configs) 95 | load_model_time = time.time() - load_time_start 96 | print("Loading model...done", load_model_time) 97 | 98 | dl = load_prompt_list(cache_configs) 99 | 100 | eval_time_start = time.time() 101 | for seq_id,text_list in enumerate(dl): 102 | if seq_id >= cache_configs['max_num_batch']: 103 | print("max_num_batch reached") 104 | break 105 | print(f'Seq {seq_id}/{cache_configs["max_num_batch"]}, decoding...', flush=True) 106 | input_len, output_len = gen_batch(model, tokenizer, text_list, max_new_tokens=128, do_print=True) 107 | eval_time = time.time() - eval_time_start 108 | 109 | time_profiler.log() 110 | sparse_llm_cache.cpp_worker.log_gpu_mem_info() 111 | print("load_model_time:", load_model_time) 112 | print("eval_time:", eval_time) 113 | 114 | if __name__ == "__main__": 115 | cache_configs = prepare_args() 116 | main(cache_configs) 117 | -------------------------------------------------------------------------------- 
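
The three `small-demo-hqq` scripts differ only in how the CPU-resident HQQ checkpoint is placed for inference: `transformers-app.py` above injects this project's expert cache via `sparse_llm_cache.utils.inject_model`, `transformers-um.py` below maps the experts through unified memory via `inject_model_um`, and `transformers-accelerate.py` offloads whole layers with `accelerate.dispatch_model`. The sketch below is illustrative only; it condenses the loading logic already present in these scripts into one helper (argument parsing, `compute_dtype` selection, and the profiling hooks are omitted for brevity):

```python
# Condensed sketch of the shared small-demo-hqq loading path; it restates the
# code in transformers-{app,um,accelerate}.py rather than defining a new API.
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
import sparse_llm_cache

def build_model(save_dir, model_id, cache_configs, variant="app"):
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    # Every variant starts from the quantized checkpoint resident on the CPU.
    model = AutoHQQHFModel.from_quantized(save_dir, trust_remote_code=True, device='cpu')
    if variant == "app":      # expert cache (this project)
        sparse_llm_cache.utils.inject_model(model, **cache_configs)
        model.to('cuda')
    elif variant == "um":     # CUDA unified-memory baseline
        sparse_llm_cache.utils.inject_model_um(model, model_id)
        model.to('cuda')
    else:                     # accelerate offloading baseline
        from accelerate import dispatch_model, infer_auto_device_map
        from accelerate.utils import get_max_memory
        max_memory = get_max_memory()
        max_memory[0] = int(max_memory[0] * cache_configs['max_gpu_memory'])
        device_map = infer_auto_device_map(model, max_memory=max_memory)
        model = dispatch_model(model, device_map=device_map, main_device=0, offload_buffers=True)
    return model.eval(), tokenizer
```

All three variants then drive generation with the same `gen_batch` loop and `StringListDataset` data loader shown in the scripts.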
/examples/small-demo-hqq/transformers-um.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | 6 | import json 7 | import time 8 | import torch 9 | from transformers.utils import logging 10 | logging.disable_progress_bar() 11 | from transformers.generation.utils import TimeProfiler, recursive_attach 12 | torch.cuda.set_device(0) 13 | from transformers import AutoModelForCausalLM, AutoTokenizer 14 | 15 | import sparse_llm_cache 16 | sparse_llm_cache.cpp_worker.auto_eat_cuda_memory() 17 | from sparse_llm_cache.utils import repo_folder_name 18 | 19 | from hqq.models.hf.base import AutoHQQHFModel 20 | from hqq.core.quantize import HQQLinear, HQQBackend 21 | from hqq.utils.patching import prepare_for_inference 22 | import argparse 23 | from accelerate import dispatch_model, infer_auto_device_map 24 | 25 | def prepare_args(): 26 | from sparse_llm_cache.utils.runner_util import parse_args, prepare_argparser 27 | parser = prepare_argparser() 28 | parser.add_argument('--save_dir', type=str, help='By default, it will be inferred from model_id at /') 29 | parser.add_argument('--save_dir_base', type=str, default='/code/hqq-quant-model', help='Base directory to save the quantized model') 30 | # parser.add_argument('--dtype', type=str, default='auto', help='Compute dtype. Default is auto infer from model config.json') 31 | # parser.add_argument('--backend', type=str, default='aten', choices=['none', 'torch', 'torchao', 'marlin', 'bitblas', 'aten'], help='Backend for inference. Default is aten. Currently aten is the fastest backend.') 32 | cache_configs = parse_args(parser=parser) 33 | return cache_configs 34 | 35 | def load_model(cache_configs): 36 | model_id = cache_configs['model_id'] 37 | save_dir = cache_configs['save_dir'] 38 | if save_dir is None: 39 | save_dir = os.path.join(cache_configs['save_dir_base'], repo_folder_name(model_id)) 40 | # if cache_configs['backend'] == 'aten': 41 | # HQQLinear.set_backend(HQQBackend.ATEN) 42 | HQQLinear.set_backend(HQQBackend.ATEN) 43 | 44 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 45 | tokenizer.pad_token = tokenizer.eos_token 46 | 47 | with open(os.path.join(save_dir, 'config.json'), 'r') as f: 48 | dtype = json.load(f)['torch_dtype'] 49 | dtype = getattr(torch, dtype) 50 | 51 | print(f"Loading model {model_id} with dtype {dtype} and backend aten") 52 | model = AutoHQQHFModel.from_quantized(save_dir, trust_remote_code=True, device='cpu', compute_dtype=dtype) 53 | 54 | sparse_llm_cache.utils.inject_model_um(model, model_id) 55 | model.to('cuda') 56 | 57 | time_profiler = TimeProfiler() 58 | recursive_attach(model, time_profiler, '_time_profiler') 59 | 60 | model.eval() 61 | 62 | return model, tokenizer, time_profiler 63 | 64 | def gen_batch(model, tokenizer, text_list, do_print=False, max_new_tokens=100): 65 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 66 | input_len = inputs['input_ids'].shape[1] 67 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 68 | output_len = outputs.shape[1] - input_len 69 | outputs = outputs[:, input_len:] 70 | output_len = outputs.shape[1] 71 | output_str = tokenizer.batch_decode(outputs) 72 | if do_print: 73 | print(text_list, output_str, flush=True) 74 | return input_len, output_len 75 | 76 | def load_prompt_list(cache_configs): 77 | dataset_path = 
f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 78 | print(dataset_path) 79 | prompts = torch.load(dataset_path) 80 | 81 | from torch.utils.data import Dataset 82 | class StringListDataset(Dataset): 83 | def __init__(self, string_list): 84 | self.string_list = string_list 85 | def __len__(self): 86 | return len(self.string_list) 87 | def __getitem__(self, idx): 88 | return self.string_list[idx] 89 | ds = StringListDataset(prompts) 90 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 91 | return dl 92 | 93 | def main(cache_configs): 94 | load_time_start = time.time() 95 | model, tokenizer, time_profiler = load_model(cache_configs) 96 | load_model_time = time.time() - load_time_start 97 | print("Loading model...done", load_model_time) 98 | 99 | dl = load_prompt_list(cache_configs) 100 | 101 | eval_time_start = time.time() 102 | for seq_id,text_list in enumerate(dl): 103 | if seq_id >= cache_configs['max_num_batch']: 104 | print("max_num_batch reached") 105 | break 106 | print(f'Seq {seq_id}/{cache_configs["max_num_batch"]}, decoding...', flush=True) 107 | input_len, output_len = gen_batch(model, tokenizer, text_list, max_new_tokens=128, do_print=True) 108 | eval_time = time.time() - eval_time_start 109 | 110 | time_profiler.log() 111 | sparse_llm_cache.cpp_worker.log_gpu_mem_info() 112 | print("load_model_time:", load_model_time) 113 | print("eval_time:", eval_time) 114 | 115 | if __name__ == "__main__": 116 | cache_configs = prepare_args() 117 | main(cache_configs) 118 | -------------------------------------------------------------------------------- /examples/small-demo/Makefile: -------------------------------------------------------------------------------- 1 | run-base: 2 | @python3 transformers-app.py --model_id deepseek-ai/deepseek-moe-16b-chat --dataset chatgpt-prompts-small --batch_size 1 --num_predict_expert_per_layer 0 --cache_rate 0.375 --cache_policy lru --per_layer_cache True --reorder_experts False --early_preempt False --predict_input_mode moe_layer_logits --layer_predict_interval 1 --layer_predict_max_window 3 --layer_predict_use_last_output True --predictor_model_path /code/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat/moe-layer-logits 3 | run-ours: 4 | @python3 transformers-app.py --model_id deepseek-ai/deepseek-moe-16b-chat --dataset chatgpt-prompts-small --batch_size 1 --num_predict_expert_per_layer 6 --cache_rate 0.375 --cache_policy lru --per_layer_cache True --reorder_experts True --early_preempt True --predict_input_mode moe_layer_logits --layer_predict_interval 1 --layer_predict_max_window 3 --layer_predict_use_last_output True --predictor_model_path /code/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat/moe-layer-logits -------------------------------------------------------------------------------- /examples/small-demo/transformers-accelerate.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | os.environ['PYTORCH_CUDA_ALLOC_CONF'] = "expandable_segments:True" 6 | 7 | from transformers.utils import logging 8 | from transformers.generation.utils import TimeProfiler, recursive_attach 9 | import torch 10 | torch.cuda.set_device(0) 11 | from transformers import AutoModelForCausalLM, AutoTokenizer 12 | 13 | import sparse_llm_cache 14 | sparse_llm_cache.cpp_worker.auto_eat_cuda_memory() 15 | import time 16 | 17 | from 
sparse_llm_cache.utils.runner_util import parse_args, prepare_argparser 18 | parser = prepare_argparser() 19 | parser.add_argument('--max_gpu_memory', type=float, default=None, help='Max gpu memory to use for the model') 20 | cache_configs = parse_args(parser=parser) 21 | for k, v in cache_configs.items(): print(k,v) 22 | 23 | from accelerate.utils import get_max_memory 24 | max_memory = None 25 | if cache_configs['max_gpu_memory'] is not None: 26 | max_memory = get_max_memory() 27 | max_memory[0] = int(max_memory[0] * cache_configs['max_gpu_memory']) 28 | 29 | print("loading model...") 30 | load_time_start = time.time() 31 | logging.disable_progress_bar() 32 | model_id = cache_configs['model_id'] 33 | torch_dtype = 'auto' 34 | if 'GPTQ' in model_id: 35 | torch_dtype = None 36 | print("dtype is", torch_dtype) 37 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 38 | tokenizer.pad_token = tokenizer.eos_token 39 | model = AutoModelForCausalLM.from_pretrained( 40 | model_id, 41 | # torch_dtype=torch.float16, 42 | torch_dtype=torch_dtype, 43 | local_files_only=True, 44 | device_map='auto', 45 | max_memory=max_memory, 46 | trust_remote_code=True, 47 | revision=cache_configs['model_revision'], 48 | ) 49 | load_model_time = time.time() - load_time_start 50 | print("loading model...done", load_model_time) 51 | 52 | time_profiler = TimeProfiler() 53 | recursive_attach(model, time_profiler, '_time_profiler') 54 | 55 | def gen_batch(text_list, do_print=False, max_new_tokens=100): 56 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 57 | input_len = inputs['input_ids'].shape[1] 58 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 59 | output_len = outputs.shape[1] - input_len 60 | outputs = outputs[:, input_len:] 61 | output_len = outputs.shape[1] 62 | output_str = tokenizer.batch_decode(outputs) 63 | if do_print: 64 | print(text_list, output_str, flush=True) 65 | return input_len, output_len 66 | 67 | dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 68 | print(dataset_path) 69 | prompts = torch.load(dataset_path) 70 | 71 | from torch.utils.data import Dataset 72 | class StringListDataset(Dataset): 73 | def __init__(self, string_list): 74 | self.string_list = string_list 75 | def __len__(self): 76 | return len(self.string_list) 77 | def __getitem__(self, idx): 78 | return self.string_list[idx] 79 | ds = StringListDataset(prompts) 80 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 81 | 82 | eval_time_start = time.time() 83 | for seq_id,text_list in enumerate(dl): 84 | if seq_id >= cache_configs['max_num_batch']: 85 | print("max_num_batch reached") 86 | break 87 | print(f'Seq {seq_id}/{cache_configs["max_num_batch"]}, decoding...', flush=True) 88 | input_len, output_len = gen_batch(text_list, max_new_tokens=128, do_print=True) 89 | eval_time = time.time() - eval_time_start 90 | 91 | time_profiler.log() 92 | sparse_llm_cache.cpp_worker.log_gpu_mem_info() 93 | print("load_model_time:", load_model_time) 94 | print("eval_time:", eval_time) 95 | -------------------------------------------------------------------------------- /examples/small-demo/transformers-app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | 6 | from transformers.utils import logging 7 | from 
transformers.generation.utils import TimeProfiler, recursive_attach 8 | import torch 9 | torch.cuda.set_device(0) 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | import sparse_llm_cache 13 | import time 14 | 15 | from sparse_llm_cache.utils.runner_util import parse_args 16 | cache_configs = parse_args() 17 | for k, v in cache_configs.items(): print(k,v) 18 | 19 | sparse_llm_cache.utils.hack_transformers(**cache_configs, pin_memory=True, enable_model_timer=True) 20 | 21 | print("loading model...") 22 | load_time_start = time.time() 23 | logging.disable_progress_bar() 24 | model_id = cache_configs['model_id'] 25 | torch_dtype = 'auto' 26 | if 'GPTQ' in model_id: 27 | torch_dtype = None 28 | print("dtype is", torch_dtype) 29 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 30 | tokenizer.pad_token = tokenizer.eos_token 31 | model = AutoModelForCausalLM.from_pretrained( 32 | model_id, 33 | # torch_dtype=torch.float16, 34 | torch_dtype=torch_dtype, 35 | local_files_only=True, 36 | device_map=0, 37 | trust_remote_code=True, 38 | revision=cache_configs['model_revision'], 39 | ) 40 | load_model_time = time.time() - load_time_start 41 | print("loading model...done", load_model_time) 42 | 43 | time_profiler = TimeProfiler() 44 | recursive_attach(model, time_profiler, '_time_profiler') 45 | 46 | def gen_batch(text_list, do_print=False, max_new_tokens=100): 47 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 48 | input_len = inputs['input_ids'].shape[1] 49 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 50 | output_len = outputs.shape[1] - input_len 51 | outputs = outputs[:, input_len:] 52 | output_len = outputs.shape[1] 53 | output_str = tokenizer.batch_decode(outputs) 54 | if do_print: 55 | print(text_list, output_str, flush=True) 56 | return input_len, output_len 57 | 58 | dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 59 | print(dataset_path) 60 | prompts = torch.load(dataset_path) 61 | 62 | from torch.utils.data import Dataset 63 | class StringListDataset(Dataset): 64 | def __init__(self, string_list): 65 | self.string_list = string_list 66 | def __len__(self): 67 | return len(self.string_list) 68 | def __getitem__(self, idx): 69 | return self.string_list[idx] 70 | ds = StringListDataset(prompts) 71 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 72 | 73 | eval_time_start = time.time() 74 | for seq_id,text_list in enumerate(dl): 75 | if seq_id >= cache_configs['max_num_batch']: 76 | print("max_num_batch reached") 77 | break 78 | print(f'Seq {seq_id}/{cache_configs["max_num_batch"]}, decoding...', flush=True) 79 | input_len, output_len = gen_batch(text_list, max_new_tokens=128, do_print=True) 80 | eval_time = time.time() - eval_time_start 81 | 82 | time_profiler.log() 83 | sparse_llm_cache.cpp_worker.log_gpu_mem_info() 84 | print("load_model_time:", load_model_time) 85 | print("eval_time:", eval_time) 86 | -------------------------------------------------------------------------------- /examples/small-demo/transformers-um.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['HF_HUB_OFFLINE'] = "1" 4 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 5 | 6 | from transformers.utils import logging 7 | from transformers.generation.utils import TimeProfiler, recursive_attach 8 | import torch 9 | 
torch.cuda.set_device(0) 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | import sparse_llm_cache 13 | sparse_llm_cache.cpp_worker.auto_eat_cuda_memory() 14 | import time 15 | 16 | from sparse_llm_cache.utils.runner_util import parse_args 17 | cache_configs = parse_args() 18 | for k, v in cache_configs.items(): print(k,v) 19 | 20 | sparse_llm_cache.utils.hack_transformers_um() 21 | 22 | print("loading model...") 23 | load_time_start = time.time() 24 | logging.disable_progress_bar() 25 | model_id = cache_configs['model_id'] 26 | torch_dtype = 'auto' 27 | if 'GPTQ' in model_id: 28 | torch_dtype = None 29 | print("dtype is", torch_dtype) 30 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 31 | tokenizer.pad_token = tokenizer.eos_token 32 | model = AutoModelForCausalLM.from_pretrained( 33 | model_id, 34 | # torch_dtype=torch.float16, 35 | torch_dtype=torch_dtype, 36 | local_files_only=True, 37 | device_map=0, 38 | trust_remote_code=True, 39 | revision=cache_configs['model_revision'], 40 | ) 41 | load_model_time = time.time() - load_time_start 42 | print("loading model...done", load_model_time) 43 | 44 | time_profiler = TimeProfiler() 45 | recursive_attach(model, time_profiler, '_time_profiler') 46 | 47 | def gen_batch(text_list, do_print=False, max_new_tokens=100): 48 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 49 | input_len = inputs['input_ids'].shape[1] 50 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 51 | output_len = outputs.shape[1] - input_len 52 | outputs = outputs[:, input_len:] 53 | output_len = outputs.shape[1] 54 | output_str = tokenizer.batch_decode(outputs) 55 | if do_print: 56 | print(text_list, output_str, flush=True) 57 | return input_len, output_len 58 | 59 | dataset_path = f'/code/sparse-llm-cache-scripts/dataset/{cache_configs["dataset"]}/prompt_list.pt' 60 | print(dataset_path) 61 | prompts = torch.load(dataset_path) 62 | 63 | from torch.utils.data import Dataset 64 | class StringListDataset(Dataset): 65 | def __init__(self, string_list): 66 | self.string_list = string_list 67 | def __len__(self): 68 | return len(self.string_list) 69 | def __getitem__(self, idx): 70 | return self.string_list[idx] 71 | ds = StringListDataset(prompts) 72 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 73 | 74 | eval_time_start = time.time() 75 | for seq_id,text_list in enumerate(dl): 76 | if seq_id >= cache_configs['max_num_batch']: 77 | print("max_num_batch reached") 78 | break 79 | print(f'Seq {seq_id}/{cache_configs["max_num_batch"]}, decoding...', flush=True) 80 | input_len, output_len = gen_batch(text_list, max_new_tokens=128, do_print=True) 81 | eval_time = time.time() - eval_time_start 82 | 83 | time_profiler.log() 84 | sparse_llm_cache.cpp_worker.log_gpu_mem_info() 85 | print("load_model_time:", load_model_time) 86 | print("eval_time:", eval_time) 87 | -------------------------------------------------------------------------------- /install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | ## Docker (llama.cpp) 4 | 5 | First build docker image and create container 6 | 7 | ```shell 8 | cd docker 9 | docker build -t moe-cache-llama.cpp -f Dockerfile.llama.cpp . 
10 | docker run --runtime nvidia --gpus all --shm-size=200g --ulimit memlock=-1 --ulimit core=0 --privileged=true --ipc=host --name moe-cache-llama.cpp-demo -it moe-cache-llama.cpp bash 11 | ``` 12 | 13 | It it recommended to map host directories with large volume or code repos into container: 14 | 15 | ```bash 16 | docker run --runtime nvidia --gpus all --shm-size=200g --ulimit memlock=-1 --ulimit core=0 --privileged=true -v :/root/.cache/huggingface -v :/code --ipc=host --name moe-cache-llama.cpp-demo -it moe-cache-llama.cpp bash 17 | ``` 18 | 19 | Then clone and build related repos 20 | 21 | ```bash 22 | mkdir -p /code 23 | cd /code && git clone git@ipads.se.sjtu.edu.cn:sparsellm/transformers.git && cd transformers && pip install -e . --no-build-isolation 24 | cd /code && git clone git@ipads.se.sjtu.edu.cn:sparsellm/eval-helper.git && cd eval-helper && pip install -e . --no-build-isolation 25 | cd /code && git clone git@ipads.se.sjtu.edu.cn:sparsellm/expert-selection-tracer.git && cd expert-selection-tracer && pip install -e . --no-build-isolation 26 | cd /code && git clone git@ipads.se.sjtu.edu.cn:sparsellm/hqq.git && cd hqq && pip install -e . --no-build-isolation 27 | cd /code && git clone git@ipads.se.sjtu.edu.cn:sparsellm/sparse-llm-cache-scripts.git 28 | cd /code && git clone git@ipads.se.sjtu.edu.cn:sparsellm/llama.cpp.git 29 | cd /code && git clone git@ipads.se.sjtu.edu.cn:sparsellm/sparse-llm-cache.git 30 | # build hqq-extension 31 | cd /code/hqq/hqq/kernels 32 | python3 setup_cuda.py install 33 | # build sparse-llm-cache (python ver.) 34 | cd /code/sparse-llm-cache 35 | pip install -e . --no-build-isolation 36 | # build sparse-llm-cache (cpp ver.) 37 | cd /code/sparse-llm-cache 38 | cmake -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_CUDA_ARCHITECTURES="native" 39 | cmake --build build --config Release --parallel 40 40 | # build llama.cpp 41 | cd /code/llama.cpp 42 | cmake -B build -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_CUDA_ARCHITECTURES="native" 43 | cmake --build build --config Release --parallel 40 44 | ``` -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "sparse-llm-cache" 3 | version = "0.0.1" 4 | 5 | [build-system] 6 | requires = ["setuptools", "wheel"] 7 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='sparse_llm_cache', 6 | ext_modules=[ 7 | CUDAExtension('sparse_llm_cache.cpp_worker', 8 | [ 9 | 'src/cpp_worker/adapter-llama.cpp', 10 | 'src/cpp_worker/adapter.cpp', 11 | 'src/cpp_worker/logging.cc', 12 | 'src/cpp_worker/model_loader.cpp', 13 | 'src/cpp_worker/prefetcher.cpp', 14 | 'src/cpp_worker/predictor.cpp', 15 | 'src/cpp_worker/utils.cpp', 16 | 'src/cpp_worker/profiler.cpp', 17 | 'src/cpp_worker/cache.cpp', 18 | 'src/cpp_worker/worker.cpp', 19 | 'src/cpp_worker/cuda_helper_func.cu', 20 | ], 21 | extra_compile_args={'cxx': ['-g', '-fopenmp', '-Wno-sign-compare', '-Wno-attributes'], 'nvcc': ['-g']}, 22 | libraries = ['cuda'] 23 | ), 24 | ], 25 | include_dirs=['./3rdparty/json/single_include'], 26 | cmdclass={ 27 | 'build_ext': 
BuildExtension 28 | } 29 | ) 30 | -------------------------------------------------------------------------------- /src/cpp_worker/adapter-llama.cpp: -------------------------------------------------------------------------------- 1 | #include "adapter-llama.hpp" 2 | #include "logging.hpp" 3 | #include 4 | 5 | namespace { 6 | 7 | void *sparse_llm_cache_eated_cuda_memory = nullptr; 8 | uint64_t sparse_llm_cache_eated_cuda_memory_size = 0; 9 | 10 | } 11 | 12 | extern "C" { 13 | void get_gpu_mem_info(uint64_t *free_byte, uint64_t *total_byte) { 14 | CUDA_CALL(cudaMemGetInfo(free_byte, total_byte)); 15 | } 16 | 17 | void log_gpu_mem_info() { 18 | uint64_t free_byte, total_byte; 19 | get_gpu_mem_info(&free_byte, &total_byte); 20 | std::cerr << "gpu_memory_usage_MiB:" << (total_byte - free_byte) / (1024 * 1024) << std::endl; 21 | std::cerr << "eval_eaten_cuda_memory_MiB:" << sparse_llm_cache_eated_cuda_memory_size / (1024 * 1024) << std::endl; 22 | } 23 | 24 | void eat_cuda_memory(uint64_t nbytes) { 25 | if (sparse_llm_cache_eated_cuda_memory != nullptr) { 26 | CHECK(false) << "eat_cuda_memory is not allowed to be called twice"; 27 | } 28 | LOG(ERROR) << "eating cuda memory:" << nbytes / (1024 * 1024) << " MiB"; 29 | CUDA_CALL(cudaMalloc(&sparse_llm_cache_eated_cuda_memory, nbytes)); 30 | sparse_llm_cache_eated_cuda_memory_size = nbytes; 31 | } 32 | 33 | void auto_eat_cuda_memory() { 34 | uint64_t nbytes = 0; 35 | nbytes = std::stoull(GetEnv("SPARSE_EVAL_EAT_CUDA_MEMORY", "0")); 36 | if (nbytes > 0) { 37 | eat_cuda_memory(nbytes); 38 | return; 39 | } 40 | nbytes = std::stoull(GetEnv("SPARSE_EVAL_EAT_CUDA_MEMORY_MiB", "0")); 41 | if (nbytes > 0) { 42 | eat_cuda_memory(nbytes * 1024 * 1024); 43 | return; 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/cpp_worker/adapter-llama.hpp: -------------------------------------------------------------------------------- 1 | #include "model_loader.hpp" 2 | #include "prefetcher.hpp" 3 | #include "profiler.hpp" 4 | 5 | extern "C" { 6 | void get_gpu_mem_info(uint64_t* free, uint64_t* total); 7 | void log_gpu_mem_info(); 8 | void eat_cuda_memory(uint64_t nbytes); 9 | void auto_eat_cuda_memory(); 10 | } -------------------------------------------------------------------------------- /src/cpp_worker/adapter.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "model_loader.hpp" 3 | #include "prefetcher.hpp" 4 | #include "profiler.hpp" 5 | #include "adapter-llama.hpp" 6 | 7 | std::string dump_trace_event_collector_singleton() { 8 | return TraceEventCollector::singleton().dump_json_to_string(); 9 | } 10 | 11 | torch::Tensor to_um(torch::Tensor t); 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | py::class_>(m, "ModuleMeta") 15 | .def(py::init()) 16 | .def_readwrite("model_arch_string", &ModuleMeta::model_arch_string) 17 | .def_readwrite("num_layer", &ModuleMeta::num_layer) 18 | .def_readwrite("num_expert", &ModuleMeta::num_expert) 19 | .def_readwrite("num_per_expert_param", &ModuleMeta::num_per_expert_param) 20 | .def_readwrite("num_predict_expert_per_layer", &ModuleMeta::num_predict_expert_per_layer) 21 | .def_readwrite("num_expert_per_token", &ModuleMeta::num_expert_per_token) 22 | .def_readwrite("max_prefetch_layer_distance", &ModuleMeta::max_prefetch_layer_distance) 23 | .def_readwrite("per_layer_cache", &ModuleMeta::per_layer_cache) 24 | .def_readwrite("cache_policy", &ModuleMeta::cache_policy) 25 | 
.def_readwrite("reorder_experts", &ModuleMeta::reorder_experts) 26 | .def_readwrite("promote_hit_in_prefetch", &ModuleMeta::promote_hit_in_prefetch) 27 | .def_readwrite("early_preempt", &ModuleMeta::early_preempt) 28 | .def_readwrite("chunk_prefetch", &ModuleMeta::chunk_prefetch) 29 | .def_readwrite("predict_input_mode", &ModuleMeta::predict_input_mode) 30 | .def_readwrite("predictor_type", &ModuleMeta::predictor_type) 31 | .def_readwrite("layer_predict_interval", &ModuleMeta::layer_predict_interval) 32 | .def_readwrite("layer_predict_max_window", &ModuleMeta::layer_predict_max_window) 33 | .def_readwrite("layer_predict_replace_first_input_with_last_output", &ModuleMeta::layer_predict_replace_first_input_with_last_output) 34 | .def_readwrite("limit_layer_0_window", &ModuleMeta::limit_layer_0_window) 35 | .def_readwrite("limit_layer_0_num_predict", &ModuleMeta::limit_layer_0_num_predict) 36 | .def_readwrite("cache_only", &ModuleMeta::cache_only) 37 | .def("init_from_map", &ModuleMeta::init_from_map) 38 | .def("handle_uninited_configs", &ModuleMeta::handle_uninited_configs) 39 | .def("init_param_list", &ModuleMeta::init_param_list) 40 | ; 41 | 42 | py::class_>(m, "PredictorBase") 43 | // .def(py::init>()) 44 | // .def("load_model", &PredictorBase::load_model) 45 | .def("create", &PredictorBase::create) 46 | .def("load_model", &PredictorBase::load_model) 47 | ; 48 | py::class_, PredictorBase>(m, "LegacyPredictor") 49 | .def(py::init>()) 50 | .def("load_model", &LegacyPredictor::load_model) 51 | ; 52 | 53 | py::class_>(m, "ModelLoader") 54 | .def(py::init>()) 55 | .def("pin_memory", &ModelLoader::pin_memory) 56 | .def("add_one_expert_param", static_cast(&ModelLoader::add_one_expert_param)) 57 | .def("build_logical_expert_param", static_cast(&ModelLoader::build_logical_expert_param)) 58 | .def("ref_one_expert_param", static_cast(&ModelLoader::ref_one_expert_param)) 59 | .def("ref_one_expert_param", static_cast(&ModelLoader::ref_one_expert_param)) 60 | ; 61 | 62 | py::class_>(m, "CacheMngr") 63 | .def("set_cur_seq", &CacheMngr::set_cur_seq) 64 | .def_readwrite("cache_oracle", &CacheMngr::cache_oracle) 65 | ; 66 | 67 | py::class_>(m, "CacheOracle") 68 | .def("load_from_file", &CacheOracle::load_from_file) 69 | .def("load_from_tensor", &CacheOracle::load_from_tensor) 70 | ; 71 | 72 | py::class_>(m, "PrefetchMngr") 73 | .def(py::init, std::shared_ptr, std::shared_ptr>()) 74 | .def("launch_thread", &PrefetchMngr::launch_thread) 75 | .def("init_gpu_mem_buffer", &PrefetchMngr::init_gpu_mem_buffer) 76 | .def("reload_env", &PrefetchMngr::reload_env) 77 | .def("report_one_expert", &PrefetchMngr::report_one_expert) 78 | .def("one_expert_done", &PrefetchMngr::one_expert_done) 79 | .def("report_one_layer", static_cast(&PrefetchMngr::report_one_layer)) 80 | .def("one_moe_layer_done", &PrefetchMngr::one_moe_layer_done) 81 | .def("report_moe_attn_logits", &PrefetchMngr::report_moe_attn_logits) 82 | .def("report_moe_layer_logits", &PrefetchMngr::report_moe_layer_logits) 83 | .def("build_timer", &PrefetchMngr::build_timer) 84 | .def("temp_move_expert_to_gpu", &PrefetchMngr::temp_move_expert_to_gpu) 85 | .def("temp_move_expert_back_to_host", &PrefetchMngr::temp_move_expert_back_to_host) 86 | .def_readwrite("metas", &PrefetchMngr::metas) 87 | .def_readwrite("model_loader", &PrefetchMngr::model_loader) 88 | .def_readwrite("predictor", &PrefetchMngr::predictor) 89 | .def_readwrite("cache_stats", &PrefetchMngr::cache_stats) 90 | .def_readwrite("profiler", &PrefetchMngr::profiler) 91 | .def_readwrite("cache", 
&PrefetchMngr::cache) 92 | .def_readwrite("copy_stream", &PrefetchMngr::copy_stream) 93 | .def_readwrite("compute_stream", &PrefetchMngr::compute_stream) 94 | .def(py::pickle( 95 | [](std::shared_ptr a) { // dump 96 | // Store the address of the PrefetchMngr object as a Python bytes object 97 | return py::make_tuple( 98 | reinterpret_cast(a.get()) 99 | ); 100 | }, 101 | [](py::tuple t) { // load 102 | auto ptr = reinterpret_cast(t[0].cast()); 103 | return ptr->shared_from_this(); 104 | } 105 | )) 106 | ; 107 | 108 | py::class_>(m, "TraceEventGuard") 109 | .def(py::init<>()) 110 | .def("init", &TraceEventGuard::init, "docstring", py::arg(), py::arg(), py::arg("phase")='X') 111 | .def("release", &TraceEventGuard::release) 112 | ; 113 | 114 | py::class_>(m, "CacheStatistics") 115 | .def(py::init<>()) 116 | .def("to_tensor", &CacheStatistics::to_tensor) 117 | .def("dump_average", &CacheStatistics::dump_average) 118 | .def("dump_average_per_layer", &CacheStatistics::dump_average_per_layer) 119 | ; 120 | 121 | py::class_(m, "TimerGuard") 122 | .def("init", &TimerGuard::init) 123 | .def("release", &TimerGuard::release) 124 | ; 125 | 126 | m.def("dump_trace_event_collector_singleton", &dump_trace_event_collector_singleton); 127 | m.def("to_um", &to_um); 128 | m.def("log_gpu_mem_info", &log_gpu_mem_info); 129 | m.def("eat_cuda_memory", &eat_cuda_memory); 130 | m.def("auto_eat_cuda_memory", &auto_eat_cuda_memory); 131 | 132 | py::enum_(m, "ThreadType") 133 | .value("kPythonMain", ThreadType::kPythonMain) 134 | .value("kHook", ThreadType::kHook) 135 | .value("kFetchScheduler", ThreadType::kFetchScheduler) 136 | .value("kGPU", ThreadType::kGPU) 137 | .export_values(); 138 | 139 | py::enum_(m, "PredictInputMode") 140 | .value("kNoPredict", PredictInputMode::kNoPredict) 141 | .value("kOneToken", PredictInputMode::kOneToken) 142 | .value("kDecodeCumsum", PredictInputMode::kDecodeCumsum) 143 | .value("kLastUseDistance", PredictInputMode::kLastUseDistance) 144 | .value("kWeighedDecodeCumsum", PredictInputMode::kWeighedDecodeCumsum) 145 | .value("kFirstMoeAttnInputLogits", PredictInputMode::kFirstMoeAttnInputLogits) 146 | .value("kMoeAttnInputLogits", PredictInputMode::kMoeAttnInputLogits) 147 | .value("kMoeLayerLogits", PredictInputMode::kMoeLayerLogits) 148 | .export_values(); 149 | 150 | py::enum_(m, "PredictorType") 151 | .value("kLegacyPredictor", PredictorType::kLegacyPredictor) 152 | .value("kSepPredictor", PredictorType::kSepPredictor) 153 | .export_values(); 154 | 155 | py::enum_(m, "TimeType") 156 | .value("kModelForward", TimeProfiler::TimeType::kModelForward) 157 | .export_values(); 158 | }; -------------------------------------------------------------------------------- /src/cpp_worker/cuda_helper_func.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "utils.hpp" 4 | #include 5 | 6 | __global__ void sleep_kernel_nanosleep(uint us) { 7 | __nanosleep(us * 1000); 8 | } 9 | 10 | __global__ void sleep_kernel_chrono(uint us) { 11 | auto start = cuda::std::chrono::high_resolution_clock::now(); 12 | while (cuda::std::chrono::high_resolution_clock::now() - start < cuda::std::chrono::microseconds(us)) { 13 | __syncthreads(); 14 | } 15 | } 16 | 17 | void cuda_sleep(uint us, int64_t stream) { 18 | sleep_kernel_chrono<<<1, 1, 0, (cudaStream_t)stream>>>(us); 19 | } 20 | -------------------------------------------------------------------------------- /src/cpp_worker/logging.cc: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "logging.hpp" 25 | 26 | // namespace coll_cache_lib { 27 | // namespace common { 28 | 29 | LogLevel LogMessage::min_log_level = MinLogLevelFromEnv(); 30 | 31 | LogMessage::LogMessage(const char* fname, int line, LogLevel severity) 32 | : fname_(fname), line_(line), severity_(severity) { 33 | should_output_ = (severity_ >= min_log_level); 34 | } 35 | 36 | void LogMessage::GenerateLogMessage(bool log_time) { 37 | bool use_cout = 38 | static_cast(severity_) <= static_cast(LogLevel::INFO); 39 | std::ostream& os = use_cout ? std::cout : std::cerr; 40 | std::stringstream ss; 41 | if (log_time) { 42 | auto now = std::chrono::system_clock::now(); 43 | auto as_time_t = std::chrono::system_clock::to_time_t(now); 44 | 45 | auto duration = now.time_since_epoch(); 46 | auto seconds = std::chrono::duration_cast(duration); 47 | auto micros_remainder = 48 | std::chrono::duration_cast(duration - 49 | seconds); 50 | 51 | const size_t time_buffer_size = 30; 52 | char time_buffer[time_buffer_size]; 53 | strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S", 54 | localtime(&as_time_t)); 55 | ss << "[" << time_buffer << "." 
<< std::setw(6) << micros_remainder.count() 56 | << ": " << LOG_LEVELS[static_cast(severity_)] << " " << fname_ 57 | << ":" << line_ << "] " << osstream.str() << std::endl; 58 | } else { 59 | ss << "[" << LOG_LEVELS[static_cast(severity_)] << " " << fname_ << ":" 60 | << line_ << "] " << osstream.str() << std::endl; 61 | } 62 | os << ss.str(); 63 | os.flush(); 64 | } 65 | 66 | LogMessage::~LogMessage() { 67 | // static LogLevel min_log_level = MinLogLevelFromEnv(); 68 | static bool log_time = LogTimeFromEnv(); 69 | if (should_output_) { 70 | GenerateLogMessage(log_time); 71 | } 72 | } 73 | 74 | LogMessageFatal::LogMessageFatal(const char* file, int line) 75 | : LogMessage(file, line, LogLevel::FATAL) {} 76 | 77 | LogMessageFatal::~LogMessageFatal() { 78 | static bool log_time = LogTimeFromEnv(); 79 | GenerateLogMessage(log_time); 80 | abort(); 81 | } 82 | 83 | LogLevel ParseLogLevelStr(const char* env_var_val) { 84 | std::string min_log_level(env_var_val); 85 | std::transform(min_log_level.begin(), min_log_level.end(), 86 | min_log_level.begin(), ::tolower); 87 | if (min_log_level == "trace") { 88 | return LogLevel::TRACE; 89 | } else if (min_log_level == "debug") { 90 | return LogLevel::DEBUG; 91 | } else if (min_log_level == "info") { 92 | return LogLevel::INFO; 93 | } else if (min_log_level == "warning") { 94 | return LogLevel::WARNING; 95 | } else if (min_log_level == "error") { 96 | return LogLevel::ERROR; 97 | } else if (min_log_level == "fatal") { 98 | return LogLevel::FATAL; 99 | } else { 100 | return LogLevel::WARNING; 101 | } 102 | } 103 | 104 | LogLevel MinLogLevelFromEnv() { 105 | const char* env_var_val = getenv("SPARSE_CACHE_LOG_LEVEL"); 106 | if (env_var_val == nullptr) { 107 | // default to WARNING 108 | return LogLevel::WARNING; 109 | } 110 | return ParseLogLevelStr(env_var_val); 111 | } 112 | 113 | bool LogTimeFromEnv() { 114 | const char* env_var_val = getenv("SPARSE_CACHE_LOG_HIDE_TIME"); 115 | if (env_var_val != nullptr && std::strtol(env_var_val, nullptr, 10) > 0) { 116 | return false; 117 | } else { 118 | return true; 119 | } 120 | } 121 | 122 | // } // namespace common 123 | // } // namespace coll_cache_lib 124 | -------------------------------------------------------------------------------- /src/cpp_worker/logging.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | #pragma once 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | // namespace coll_cache_lib { 25 | // namespace common { 26 | 27 | enum class LogLevel { TRACE, DEBUG, INFO, WARNING, ERROR, FATAL }; 28 | 29 | #define LOG_LEVELS "TDIWEF" 30 | 31 | #ifdef CHECK 32 | #undef CHECK 33 | #undef LOG 34 | #endif 35 | // Always-on checking 36 | #define CHECK(x) \ 37 | if (!(x)) \ 38 | LogMessageFatal(__FILE__, __LINE__) << "Check failed: " #x << ' ' 39 | 40 | // #define CHECK_LT(x, y) CHECK((x) < (y)) 41 | // #define CHECK_GT(x, y) CHECK((x) > (y)) 42 | // #define CHECK_LE(x, y) CHECK((x) <= (y)) 43 | // #define CHECK_GE(x, y) CHECK((x) >= (y)) 44 | // #define CHECK_EQ(x, y) CHECK((x) == (y)) 45 | // #define CHECK_NE(x, y) CHECK((x) != (y)) 46 | /* 47 | // #define CHECK_NOTNULL(x) \ 48 | // ((x) == NULL ? LogMessageFatal(__FILE__, __LINE__) \ 49 | // << "Check notnull: " #x << ' ', \ 50 | // (x) : (x)) // NOLINT(*) 51 | */ 52 | 53 | /*! 54 | * \brief Protected CUDA call. 55 | * \param func Expression to call. 56 | * 57 | * It checks for CUDA errors after invocation of the expression. 58 | */ 59 | #define CUDA_CALL(func) \ 60 | { \ 61 | cudaError_t e = (func); \ 62 | CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ 63 | << "[" << getpid() << "]CUDA: " << cudaGetErrorString(e); \ 64 | } 65 | #define CU_CALL(func) \ 66 | { \ 67 | CUresult e = (func); \ 68 | if (e != CUDA_SUCCESS) { \ 69 | const char ** desc = nullptr; \ 70 | cuGetErrorString(e, desc); \ 71 | CHECK(false) << "[" << getpid() << "]CU: " << desc; \ 72 | } \ 73 | } 74 | 75 | /*! 76 | * \brief Protected CUSPARSE call. 77 | */ 78 | #define CUSPARSE_CALL(func) \ 79 | { \ 80 | cusparseStatus_t e = (func); \ 81 | CHECK(e == CUSPARSE_STATUS_SUCCESS) \ 82 | << "CUSPARSE: " << cusparseGetErrorString(e); \ 83 | } 84 | 85 | /* 86 | * \brief Protected NCCL call. 87 | */ 88 | #define NCCLCHECK(cmd) \ 89 | { \ 90 | ncclResult_t r = (cmd); \ 91 | CHECK(r == ncclSuccess) << "NCCL error: " << ncclGetErrorString(r); \ 92 | } 93 | 94 | 95 | LogLevel MinLogLevelFromEnv(); 96 | bool LogTimeFromEnv(); 97 | 98 | // class LogMessage : public std::basic_ostringstream { 99 | class LogMessage { 100 | static LogLevel min_log_level; 101 | public: 102 | static void reload_env() { min_log_level = MinLogLevelFromEnv(); } 103 | LogMessage(const char* fname, int line, LogLevel severity); 104 | ~LogMessage(); 105 | 106 | template 107 | LogMessage & operator<<(T v) { 108 | if (should_output_) { 109 | osstream << v; 110 | } 111 | return *this; 112 | } 113 | 114 | bool should_output_; 115 | 116 | protected: 117 | void GenerateLogMessage(bool log_time); 118 | 119 | private: 120 | const char* fname_; 121 | int line_; 122 | LogLevel severity_; 123 | std::basic_ostringstream osstream; 124 | }; 125 | 126 | // LogMessageFatal ensures the process will exit in failure after 127 | // logging this message. 
128 | class LogMessageFatal : public LogMessage { 129 | public: 130 | LogMessageFatal(const char* file, int line); 131 | ~LogMessageFatal(); 132 | }; 133 | 134 | #define _LOG_TRACE \ 135 | LogMessage(__FILE__, __LINE__, LogLevel::TRACE) 136 | #define _LOG_DEBUG \ 137 | LogMessage(__FILE__, __LINE__, LogLevel::DEBUG) 138 | #define _LOG_INFO LogMessage(__FILE__, __LINE__, LogLevel::INFO) 139 | #define _LOG_WARNING \ 140 | LogMessage(__FILE__, __LINE__, LogLevel::WARNING) 141 | #define _LOG_ERROR \ 142 | LogMessage(__FILE__, __LINE__, LogLevel::ERROR) 143 | #define _LOG_FATAL LogMessageFatal(__FILE__, __LINE__) 144 | 145 | #define _LOG(severity) _LOG_##severity 146 | 147 | #define _LOG_RANK(severity, rank) _LOG_##severity << "[" << rank << "]: " 148 | 149 | #define GET_LOG(_1, _2, NAME, ...) NAME 150 | #define LOG(...) GET_LOG(__VA_ARGS__, _LOG_RANK, _LOG)(__VA_ARGS__) 151 | 152 | #define LOG_BLOCK(severity, logger, ...) { \ 153 | auto logger = LOG(severity); \ 154 | if (logger.should_output_) { __VA_ARGS__; } \ 155 | } 156 | 157 | #define DEBUG_PREFIX "\033[1;33mDEBUG: \033[0m" 158 | #define WARNING_PREFIX "\033[38;5;215mWARNING: \033[0m" 159 | 160 | 161 | // } // namespace common 162 | // } // namespace coll_cache_lib 163 | -------------------------------------------------------------------------------- /src/cpp_worker/predictor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "profiler.hpp" 8 | #include "utils.hpp" 9 | 10 | struct PredictOutput { 11 | torch::Tensor prob; 12 | torch::Tensor experts; 13 | int input_layer_id = 0; 14 | int start_output_layer_id = 0; 15 | PredictOutput(torch::Tensor prob, int input_layer_id, int start_output_layer_id) : prob(prob), input_layer_id(input_layer_id), start_output_layer_id(start_output_layer_id) {} 16 | 17 | void slice_layer(int start, int stop) { 18 | prob = prob.slice(0, start, stop); 19 | start_output_layer_id = start_output_layer_id + start; 20 | } 21 | void slice_expert(int start, int stop) { 22 | prob = prob.slice(1, start, stop); 23 | } 24 | void rank_experts(int n_top_e) { 25 | n_top_e = std::min(n_top_e, num_output_expert()); 26 | auto sorted = prob.sort(-1, true); 27 | experts = std::get<1>(sorted).slice(1, 0, n_top_e); 28 | } 29 | 30 | int num_output_layer() const { return prob.size(0); } 31 | int num_output_expert() const { return prob.size(1); } 32 | int num_top_experts() const { return experts.size(1); } 33 | int inner_l_to_outer_l(int inner_l) const { return start_output_layer_id + inner_l; } 34 | int64_t * top_experts(int inner_l) const { return experts[inner_l].data_ptr(); } 35 | 36 | static PredictOutput empty(long num_output_layer, int input_layer_id, int start_output_layer_id) { 37 | return PredictOutput(torch::empty({num_output_layer, 0}, torch::kFloat32), input_layer_id, start_output_layer_id); 38 | } 39 | }; 40 | 41 | class PredictorBase { 42 | protected: 43 | std::shared_ptr metas; 44 | PredictorBase(std::shared_ptr metas) : metas(metas) {} 45 | public: 46 | std::shared_ptr profiler; 47 | cudaStream_t compute_stream; 48 | virtual PredictOutput predict(int input_layer_id) = 0; 49 | virtual void load_model() { this->load_model_from(metas->predictor_model_path); }; 50 | virtual void load_model_from(std::string model_path) = 0; 51 | virtual void add_one_layer(int layer_id, torch::Tensor experts) {}; 52 | virtual void add_one_layer(int layer_id, int64_t *experts, size_t num_expert) {}; 53 | virtual 
void record_moe_attn_logits(int layer_id, torch::Tensor attn_logits) {}; 54 | virtual void record_moe_layer_logits(int layer_id, torch::Tensor layer_logits) {}; 55 | virtual void end_of_one_token_prediction() {}; 56 | virtual void start_of_new_sequence() {}; 57 | virtual void slice_predict_output_layer(PredictOutput &output) = 0; 58 | virtual bool layer_predict_enabled(int layer_id) = 0; 59 | virtual ~PredictorBase() = default; 60 | 61 | virtual int query_predict_jobs(int layer_id) { return 1; } 62 | virtual PredictOutput predict_one_job(int layer_id, int job_idx) { return this->predict(layer_id); } 63 | 64 | static std::shared_ptr create(std::shared_ptr metas); 65 | }; 66 | 67 | class LegacyPredictor : public PredictorBase { 68 | private: 69 | struct PredictModel { 70 | torch::jit::script::Module model; 71 | // torch::ScalarType dtype = torch::ScalarType::Undefined; 72 | int orig_output_start_layer, orig_output_stop_layer; 73 | int slice_start, slice_stop; 74 | int orig_num_output_layer() const { return orig_output_stop_layer - orig_output_start_layer; } 75 | int output_layer_start() const { return orig_output_start_layer + slice_start; } 76 | int output_layer_stop() const { return orig_output_start_layer + slice_stop; } 77 | int output_layer(int l_in_slice) const { return orig_output_start_layer + slice_start + l_in_slice; } 78 | int num_output_layer() const { return slice_stop - slice_start; } 79 | }; 80 | // torch::jit::script::Module predict_model; 81 | // single sequence for now 82 | torch::Tensor expert_access_buffer; 83 | torch::Tensor last_use_distance_buffer; 84 | torch::Tensor weighted_access_freq_sum_buffer; 85 | torch::Tensor first_moe_attn_input_logits_buffer; 86 | std::unordered_map moe_attn_input_logits_buffer_list; 87 | std::unordered_map moe_layer_logits_buffer_list; 88 | std::unordered_map logits_record_event; 89 | std::unordered_map predict_models; 90 | 91 | private: 92 | void init_expert_access_buffer() { 93 | auto options = torch::TensorOptions().dtype(torch::kFloat32); 94 | switch (metas->predict_input_mode) { 95 | case kNoPredict: { break; } 96 | case kOneToken: { 97 | expert_access_buffer = torch::zeros({metas->num_layer, metas->num_expert}, options); 98 | break; 99 | } 100 | case kDecodeCumsum: { 101 | expert_access_buffer = torch::zeros({metas->num_layer, metas->num_expert}, options); 102 | break; 103 | } 104 | case kLastUseDistance: { 105 | last_use_distance_buffer = torch::zeros({metas->num_layer, metas->num_expert}, options); 106 | break; 107 | } 108 | case kWeighedDecodeCumsum: { 109 | weighted_access_freq_sum_buffer = torch::zeros({metas->num_layer, metas->num_expert}, options); 110 | break; 111 | } 112 | case kFirstMoeAttnInputLogits : { break; } 113 | case kMoeAttnInputLogits : { break; } 114 | case kMoeLayerLogits : { break; } 115 | default : { CHECK(false) << "Unknown predict input mode"; } 116 | } 117 | } 118 | 119 | void load_one_model(std::string model_path, int idx = 0); 120 | friend class PredictWorker; 121 | 122 | public: 123 | LegacyPredictor(std::shared_ptr metas); 124 | void load_model_from(std::string model_path) override; 125 | 126 | PredictOutput predict(int input_layer_id) override; 127 | 128 | void add_one_layer(int layer_id, torch::Tensor experts) override; 129 | void add_one_layer(int layer_id, int64_t *experts, size_t num_expert) override; 130 | void record_moe_attn_logits(int layer_id, torch::Tensor attn_logits) override; 131 | void record_moe_layer_logits(int layer_id, torch::Tensor layer_logits) override; 132 | void 
end_of_one_token_prediction() override; 133 | void start_of_new_sequence() override; 134 | bool layer_predict_enabled(int layer_id) override; 135 | void slice_predict_output_layer(PredictOutput &output) override; 136 | 137 | // void clear_access_buffer() { 138 | // expert_access_buffer.fill_(0); 139 | // last_use_distance_buffer.fill_(0); 140 | // weighted_access_freq_sum_buffer.fill_(0); 141 | // } 142 | }; 143 | 144 | class SepPredictor : public PredictorBase { 145 | private: 146 | struct PredictSepModel { 147 | std::unordered_map models; 148 | // torch::ScalarType dtype = torch::ScalarType::Undefined; 149 | // int input_layer_id; 150 | std::vector enabled_output_layers; 151 | int num_output_layer() const { return enabled_output_layers.size(); } 152 | }; 153 | std::unordered_map moe_layer_logits_buffer_list; 154 | std::unordered_map logits_record_event; 155 | std::unordered_map predict_models; 156 | 157 | private: 158 | friend class PredictWorker; 159 | 160 | public: 161 | SepPredictor(std::shared_ptr metas); 162 | void load_model_from(std::string model_path) override; 163 | 164 | PredictOutput predict(int input_layer_id) override; 165 | 166 | void record_moe_layer_logits(int layer_id, torch::Tensor layer_logits) override; 167 | bool layer_predict_enabled(int layer_id) override { return predict_models[layer_id].enabled_output_layers.size() > 0; } 168 | void slice_predict_output_layer(PredictOutput &output) override; 169 | 170 | int query_predict_jobs(int layer_id) override { return predict_models[layer_id].enabled_output_layers.size(); } 171 | PredictOutput predict_one_job(int layer_id, int job_idx) override; 172 | }; 173 | -------------------------------------------------------------------------------- /src/cpp_worker/prefetcher.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "worker.hpp" 14 | #include "utils.hpp" 15 | #include "model_loader.hpp" 16 | #include "predictor.hpp" 17 | #include "cache.hpp" 18 | #include "profiler.hpp" 19 | 20 | class ExpertHandler; 21 | 22 | class FetchScheduleTaskBase { 23 | public: 24 | enum TaskType { 25 | kIdle, 26 | kPreempt, 27 | kFetchDone, 28 | kPrefetchLayer, 29 | kPreemptOneExpert, 30 | }; 31 | TaskType task_type; 32 | FetchScheduleTaskBase(TaskType task_type) : task_type(task_type) {} 33 | virtual ~FetchScheduleTaskBase() {} 34 | }; 35 | class IdleTask : public FetchScheduleTaskBase { 36 | public: 37 | IdleTask() : FetchScheduleTaskBase(kIdle) {} 38 | }; 39 | class PreemptTask : public FetchScheduleTaskBase { 40 | public: 41 | PreemptTask() : FetchScheduleTaskBase(kPreempt) {} 42 | int layer_idx; 43 | int64_t* expert_idxs; 44 | size_t num_expert; 45 | }; 46 | class PreemptOneExpertTask : public FetchScheduleTaskBase { 47 | public: 48 | PreemptOneExpertTask() : FetchScheduleTaskBase(kPreemptOneExpert) {} 49 | int layer_id; 50 | int64_t expert_id; 51 | }; 52 | class PrefetchLayerTask : public FetchScheduleTaskBase { 53 | public: 54 | PrefetchLayerTask() : FetchScheduleTaskBase(kPrefetchLayer) {} 55 | int layer_idx; 56 | int64_t* expert_idxs; 57 | size_t num_expert; 58 | }; 59 | class FetchDoneTask : public FetchScheduleTaskBase { 60 | public: 61 | FetchDoneTask() : FetchScheduleTaskBase(kFetchDone) {} 62 | }; 63 | 64 | class FetchScheduleWorker : public WorkerThread { 65 | using TaskQueue = Queue; 66 | 67 | ModuleMeta* metas; 68 | ModelLoader* 
model_loader; 69 | CacheMngr* cache; 70 | 71 | void cache_hit(ExpertHandler* e, bool is_precise) { 72 | profiler->add(is_precise ? TimeProfiler::kHitCnt : TimeProfiler::kPrefetchHitCnt, 1); 73 | cache->hit(e, is_precise); 74 | } 75 | CacheMngr::CacheLineOccupancyWaiter cache_miss(ExpertHandler* e, bool is_precise) { 76 | profiler->add(is_precise ? TimeProfiler::kMissCnt : TimeProfiler::kPrefetchMissCnt, 1); 77 | return cache->miss(e, is_precise); 78 | } 79 | 80 | CacheStatistics* cache_stats; 81 | TimeProfiler* profiler; 82 | 83 | FetchWorker* fetch_thread; 84 | PredictWorker* predict_thread; 85 | 86 | IdleTask idle_task; 87 | CopyTask current_task; // we allow only one ongoing copy task 88 | friend class FetchWorker; 89 | FetchDoneTask copy_done_task; 90 | 91 | #ifdef DEAD_CODE 92 | /** protected by queue_lock */ 93 | AtomicQueueLock task_queue_lock; 94 | #endif 95 | std::vector per_layer_job_queues; // the fetching thread takes out the first task from queue, then execute it. 96 | /** no lock requried */ 97 | TaskQueue precise_job_queue; 98 | 99 | #ifdef DEAD_CODE 100 | inline void lock_task_queue() { task_queue_lock.lock(); } 101 | inline void unlock_task_queue() { task_queue_lock.unlock(); } 102 | #endif 103 | 104 | bool send_one_job(CopyTask *task); 105 | void do_one_task_impl(IdleTask *task); 106 | void do_one_task_impl(PreemptTask *task); 107 | void do_one_task_impl(PreemptOneExpertTask *task); 108 | void do_one_task_impl(FetchDoneTask *task); 109 | void do_one_task_impl(PrefetchLayerTask *task); 110 | 111 | void pop_next_task(CopyTask &task, bool &found); 112 | 113 | void add_single_tasks_for_one_expert(int layer_idx, int expert_idx, TaskQueue* queue, int start_mem_buf_idx, int stop_mem_buf_idx, bool is_precise); 114 | void add_separate_tasks_for_one_expert(int layer_idx, int expert_idx, TaskQueue *queue, int start_mem_buf_idx, int stop_mem_buf_idx, bool is_precise); 115 | 116 | void reorder_experts(int layer_idx, int64_t *expert_idxs, size_t num_expert); 117 | #ifdef DEAD_CODE 118 | void preempt_one_layer_(int layer_idx, int64_t *expert_idxs, size_t num_expert); 119 | #endif 120 | void preempt_one_layer_without_reorder_(int layer_idx, int64_t *expert_idxs, size_t num_expert); 121 | void preempt_one_expert(int layer_idx, int64_t expert_idx); 122 | 123 | public: 124 | #ifdef DEAD_CODE 125 | void add_one_layer_task(int layer_idx, int64_t *expert_idxs, size_t num_expert); 126 | void add_one_layer_task(int layer_idx, torch::Tensor experts); 127 | #endif 128 | void init(ModuleMeta *metas, ModelLoader *model_loader, CacheMngr *cache, FetchWorker *fetch_thread, PredictWorker *predict_thread, CacheStatistics *cache_stats, TimeProfiler* profiler); 129 | 130 | protected: 131 | void do_one_task_impl(FetchScheduleTaskBase *task); 132 | }; 133 | 134 | class PrefetchMngr : public std::enable_shared_from_this { 135 | friend class FetchScheduleWorker; 136 | friend class FetchWorker; 137 | 138 | 139 | std::shared_ptr fetch_schedule_thread; 140 | std::shared_ptr fetch_thread; 141 | std::shared_ptr predict_thread; 142 | std::shared_ptr expert_unlocker_thread; 143 | 144 | /** 145 | * for already in cache, directly lock it 146 | * for fetching, lock it after fetching is done 147 | */ 148 | void preempt_and_launch_one_layer(int layer_idx, int64_t* experts, int64_t num_expert); 149 | void record_then_predict_and_prefetch(int layer_id, int64_t* experts, int64_t num_expert); 150 | 151 | void wait_expert(int layer_id, int expert_id); 152 | void mark_expert_using(int layer_id, int expert_id); 153 | 154 | 
public: 155 | std::shared_ptr metas; 156 | std::shared_ptr model_loader; 157 | std::shared_ptr predictor; 158 | std::shared_ptr cache_stats; 159 | std::shared_ptr profiler; 160 | std::shared_ptr cache; 161 | 162 | std::shared_ptr precision_profiler; 163 | 164 | int64_t compute_stream = 0, copy_stream = 0; 165 | // cudaStream_t compute_stream = nullptr, copy_stream = nullptr; 166 | // at::cuda::CUDAStream compute_stream, copy_stream; 167 | 168 | PrefetchMngr(std::shared_ptr metas, 169 | std::shared_ptr model_loader, 170 | std::shared_ptr predictor, 171 | int64_t compute_stream = 0, 172 | bool create_compute_stream = true, 173 | TimeProfiler* profiler = nullptr); 174 | ~PrefetchMngr(); 175 | void init_gpu_mem_buffer(); 176 | 177 | void report_one_layer(int layer_id, torch::Tensor experts); 178 | void report_one_layer(int layer_id, int64_t* experts, int64_t num_expert); 179 | void one_moe_layer_done(int layer_id); 180 | 181 | void report_one_expert(int layer_id, int expert_id); 182 | void one_expert_done(int layer_id, int expert_id); 183 | 184 | void report_moe_attn_logits(int layer_id, torch::Tensor attn_logits); 185 | 186 | void report_moe_layer_logits(int layer_id, torch::Tensor layer_logits); 187 | 188 | void launch_thread(); 189 | TimerGuard build_timer() { return TimerGuard(this->profiler.get()); } 190 | void reload_env(); 191 | 192 | void set_compute_stream(int64_t stream); 193 | 194 | void temp_move_expert_to_gpu(int layer_id, int expert_id); 195 | void temp_move_expert_back_to_host(int layer_id, int expert_id); 196 | }; 197 | -------------------------------------------------------------------------------- /src/cpp_worker/profiler.cpp: -------------------------------------------------------------------------------- 1 | #include "profiler.hpp" 2 | #include "logging.hpp" 3 | 4 | bool TraceEventCollector::globally_enabled = false; 5 | TraceEventCollector::TraceEventCollector() { 6 | reload_env(); 7 | event_list.resize(kThreadTypeNum); 8 | } 9 | void TraceEventCollector::add_meta_event() { 10 | { TRACE_EVENT_GURAD_WITH_ARGS(kPythonMain, "thread_name", 'M', arg_var, { arg_var["name"] = "kPythonMain"; }); } 11 | { TRACE_EVENT_GURAD_WITH_ARGS(kHook, "thread_name", 'M', arg_var, { arg_var["name"] = "kHook"; }); } 12 | { TRACE_EVENT_GURAD_WITH_ARGS(kFetchScheduler, "thread_name", 'M', arg_var, { arg_var["name"] = "kFetchScheduler"; }); } 13 | { TRACE_EVENT_GURAD_WITH_ARGS(kFetcher, "thread_name", 'M', arg_var, { arg_var["name"] = "kFetcher"; }); } 14 | { TRACE_EVENT_GURAD_WITH_ARGS(kUnlocker, "thread_name", 'M', arg_var, { arg_var["name"] = "kUnlocker"; }); } 15 | { TRACE_EVENT_GURAD_WITH_ARGS(kCache, "thread_name", 'M', arg_var, { arg_var["name"] = "kCache"; }); } 16 | { TRACE_EVENT_GURAD_WITH_ARGS(kPredictor, "thread_name", 'M', arg_var, { arg_var["name"] = "kPredictor"; }); } 17 | { TRACE_EVENT_GURAD_WITH_ARGS(kGPU, "thread_name", 'M', arg_var, { arg_var["name"] = "kGPU"; }); } 18 | } 19 | void PrecisionProfiler::report() { 20 | std::map> per_layer_rates; 21 | for (int i = 0; i < activated_experts.size(); i++) { 22 | auto &a = activated_experts[i], &p = predicted_experts[i]; 23 | if (a.layer_id != p.layer_id) { 24 | std::cerr << "layer id mismatch " << a.layer_id << " " << p.layer_id << "\n"; 25 | break; 26 | } 27 | // std::cerr << "layer " << a.layer_id << " " 28 | // << "intersect " << a.intersect(p) << " / " << a.experts.size() 29 | // << " / " << p.experts.size() << ", rate " 30 | // << (float)a.intersect(p) / p.experts.size() << "\n"; 31 | if (per_layer_rates.count(a.layer_id) == 0) { 
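      // first sample for this layer: lazily create its list of per-token prediction-precision rates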
32 | per_layer_rates[a.layer_id] = std::vector(); 33 | } 34 | if (a.experts.size() != decode_expert_per_token) { 35 | continue; 36 | } 37 | if (p.experts.size() == 0) { 38 | continue; 39 | } 40 | per_layer_rates[a.layer_id].push_back((float)a.intersect(p) / p.experts.size()); 41 | } 42 | for (auto &[layer_id, rates] : per_layer_rates) { 43 | double sum = 0; 44 | for (auto r : rates) { 45 | sum += r; 46 | } 47 | std::cerr << "layer " << layer_id << " average rate " << sum / rates.size() << "\n"; 48 | } 49 | } 50 | void PrecisionProfiler::record_activated_experts(int layer_id, const int64_t *experts, size_t num_experts) { 51 | activated_experts.push_back(LayerInfo(layer_id, experts, num_experts)); 52 | } 53 | -------------------------------------------------------------------------------- /src/cpp_worker/worker.cpp: -------------------------------------------------------------------------------- 1 | #include "worker.hpp" 2 | #include "logging.hpp" 3 | #include "profiler.hpp" 4 | #include "prefetcher.hpp" 5 | 6 | void PredictWorker::do_one_task_impl(PredictJob job) { 7 | TRACE_EVENT_GURAD(kPredictor, "predict thread " + std::to_string(job.input_layer_id)); 8 | 9 | int num_predict_jobs = predictor->query_predict_jobs(job.input_layer_id); 10 | 11 | // auto pred_result = predictor->predict(job.input_layer_id); 12 | for (int job_idx = 0; job_idx < num_predict_jobs; job_idx++) { 13 | auto pred_result = predictor->predict_one_job(job.input_layer_id, job_idx); 14 | 15 | predictor->slice_predict_output_layer(pred_result); 16 | 17 | auto num_predicted_layers = pred_result.num_output_layer(); 18 | CHECK(pred_result.num_output_expert() == metas->num_expert || pred_result.num_output_expert() == 0); 19 | if (metas->cache_policy == "nn" && pred_result.num_output_expert() > 0) { 20 | cache->update_priority(pred_result.prob, pred_result.start_output_layer_id); 21 | } 22 | pred_result.rank_experts(metas->num_predict_expert_per_layer); 23 | 24 | LOG_BLOCK(DEBUG, logger, { 25 | logger << "predicted shape " << pred_result.experts.sizes() << "\n"; 26 | }); 27 | LOG_BLOCK(DEBUG, logger, { 28 | for (int l_in_slice = 0; l_in_slice < num_predicted_layers; l_in_slice++) { 29 | int layer_idx = pred_result.start_output_layer_id + l_in_slice; 30 | logger << "predicted expert" << layer_idx << ":" << tensor_to_str(pred_result.experts[l_in_slice]) << "\n"; 31 | } 32 | }); 33 | 34 | { 35 | TRACE_EVENT_GURAD(kPredictor, "add_multi_layer_task [" + std::to_string(pred_result.inner_l_to_outer_l(0)) + "," + std::to_string(pred_result.inner_l_to_outer_l(num_predicted_layers)) + ")"); 36 | size_t per_layer_num_expert = pred_result.num_top_experts(); 37 | for (int inner_l = 0; inner_l < num_predicted_layers; inner_l++) { 38 | int layer_idx = pred_result.inner_l_to_outer_l(inner_l); 39 | precision_profiler->record_predicted_experts(layer_idx, pred_result.top_experts(inner_l), per_layer_num_expert); 40 | } 41 | for (int inner_l = 0; inner_l < num_predicted_layers; inner_l++) { 42 | int layer_idx = pred_result.inner_l_to_outer_l(inner_l); 43 | { 44 | LOG(INFO) << "predict worker: add layer task " << layer_idx << ", wait for budget"; 45 | TRACE_EVENT_GURAD(kPredictor, "wait for budget " + std::to_string(layer_idx)); 46 | while (true) { 47 | int budge_remaining = prefetch_layer_budget.try_pop(true); 48 | if (budge_remaining != -1) { 49 | LOG(INFO) << "predict worker: add layer task now " << layer_idx << ", wait for budget done, remaining " << budge_remaining; 50 | break; 51 | } 52 | if (should_exit()) { return; } 53 | } 54 | } 55 | 
LOG(INFO) << "predict worker: add layer task now " << layer_idx; 56 | PrefetchLayerTask task; 57 | task.layer_idx = layer_idx; 58 | task.expert_idxs = pred_result.top_experts(inner_l); 59 | if (layer_idx == 0 && metas->limit_layer_0_num_predict != -1) { 60 | task.num_expert = std::min(per_layer_num_expert, metas->limit_layer_0_num_predict); 61 | } else { 62 | task.num_expert = per_layer_num_expert; 63 | } 64 | auto wait_handler = fetch_schedule_thread->add_one_task(&task); 65 | fetch_schedule_thread->wait_progress(wait_handler); 66 | // fetch_schedule_thread->add_one_layer_task(layer_idx, predicted_expert[layer_idx].data_ptr(), per_layer_num_expert); 67 | prefetch_layer_progress.push(layer_idx); 68 | } 69 | } 70 | if (pred_result.inner_l_to_outer_l(num_predicted_layers) == metas->num_layer) { 71 | predictor->end_of_one_token_prediction(); 72 | } 73 | } 74 | // if (metas->predict_input_mode == kOneToken) { 75 | // predictor->clear_access_buffer(); 76 | // } 77 | } 78 | void ExpertUnlockWorker::do_one_task_impl(ExpertHandler *task) { 79 | TRACE_EVENT_GURAD(kUnlocker, "unlock:" + task->toString()); 80 | task->expert_status.wait(kUsing); 81 | CUDA_CALL(cudaEventSynchronize(task->event)); 82 | task->expert_status.transfer(kUsing, kReady); 83 | } 84 | void FetchWorker::do_one_task_impl(CopyTask *task) { 85 | TRACE_EVENT_GURAD(kFetcher, "fetch:" + task->toString()); 86 | LOG(TRACE) << "fetcher: copying " << task->toString(); 87 | { 88 | task->lambda_wait(); 89 | } 90 | for (int mem_buf_idx = task->start_mem_buf_idx; mem_buf_idx < task->stop_mem_buf_idx; mem_buf_idx++) { 91 | // LOG(ERROR) << "fetcher: copy from " << task->expert->host_data.ptr(mem_buf_idx) << " to " << task->expert->gpu_data->ptr(mem_buf_idx); 92 | CUDA_CALL(cudaMemcpyAsync( 93 | task->expert->gpu_data->ptr(mem_buf_idx), 94 | task->expert->host_data->ptr(mem_buf_idx), 95 | task->expert->host_data->nbytes(mem_buf_idx), 96 | cudaMemcpyHostToDevice, this->stream)); 97 | } 98 | if (task->start_mem_buf_idx == 0) { 99 | task->expert->reference_to_model_param->unmap(); 100 | task->expert->reference_to_model_param->map_to(task->expert->gpu_data, mem_mngr_ctx); 101 | } 102 | 103 | CUDA_CALL(cudaStreamSynchronize(this->stream)); 104 | fetch_schedule_thread->add_one_task(&fetch_schedule_thread->copy_done_task); 105 | } 106 | void PredictWorker::add_prefetch_layer_budget() { 107 | LOG(DEBUG) << "predict worker: add prefetch layer budget"; 108 | prefetch_layer_budget.push(0); 109 | } 110 | void PredictWorker::on_one_iter_done() { 111 | LOG(DEBUG) << "predict workers, one iter done"; 112 | switch (metas->predict_input_mode) { 113 | // case kNoPredict: { break; } 114 | case kNoPredict: { add_one_task(PredictJob()); break; } 115 | case kOneToken: { add_one_task(PredictJob()); break; } 116 | case kDecodeCumsum: { add_one_task(PredictJob()); break; } 117 | case kLastUseDistance: { add_one_task(PredictJob()); break; } 118 | case kWeighedDecodeCumsum: { add_one_task(PredictJob()); break; } 119 | case kFirstMoeAttnInputLogits: { break; } 120 | case kMoeAttnInputLogits: { break; } 121 | case kMoeLayerLogits: { break; } 122 | default: { CHECK(false) << "Unknown predict input mode"; } 123 | } 124 | } 125 | void PredictWorker::on_moe_attn_input_logits_recorded(int layer_id) { 126 | LOG(DEBUG) << "predict workers, on_moe_attn_input_logits_recorded " << layer_id; 127 | switch (metas->predict_input_mode) { 128 | case kNoPredict: { break; } 129 | case kOneToken: { break;} 130 | case kDecodeCumsum: { break;} 131 | case kLastUseDistance: { break;} 132 | case 
kWeighedDecodeCumsum: { break;} 133 | case kFirstMoeAttnInputLogits: { 134 | if (layer_id == 0) { add_one_task(PredictJob()); } 135 | break; 136 | } 137 | case kMoeAttnInputLogits: { 138 | if (predictor->layer_predict_enabled(layer_id)) { 139 | // if (layer_id % metas->layer_predict_interval == 0) { 140 | add_one_task(PredictJob(layer_id)); 141 | } 142 | break; 143 | } 144 | case kMoeLayerLogits: { break; } 145 | default: { CHECK(false) << "Unknown predict input mode"; } 146 | } 147 | } 148 | void PredictWorker::on_moe_layer_logits_recorded(int layer_id) { 149 | LOG(DEBUG) << "predict workers, on_moe_layer_logits_recorded " << layer_id; 150 | switch (metas->predict_input_mode) { 151 | case kNoPredict: { break; } 152 | case kOneToken: { break;} 153 | case kDecodeCumsum: { break;} 154 | case kLastUseDistance: { break;} 155 | case kWeighedDecodeCumsum: { break;} 156 | case kFirstMoeAttnInputLogits: { break;} 157 | case kMoeAttnInputLogits: { break;} 158 | case kMoeLayerLogits: { 159 | if (predictor->layer_predict_enabled(layer_id)) { 160 | add_one_task(PredictJob(layer_id)); 161 | } 162 | break; 163 | } 164 | default: { CHECK(false) << "Unknown predict input mode in on_moe_layer_logits_recorded"; } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/cpp_worker/worker.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "cache.hpp" 8 | #include "utils.hpp" 9 | #include "model_loader.hpp" 10 | #include "predictor.hpp" 11 | #include "profiler.hpp" 12 | #include 13 | 14 | template 15 | class WorkerThreadBase { 16 | protected: 17 | Queue queue; 18 | std::thread worker_thread; 19 | std::atomic exit_mark_atomic{false}; 20 | using progress_handler_t = int64_t; 21 | progress_handler_t queued = 0; 22 | std::atomic progress{0}; 23 | protected: 24 | bool should_exit() { 25 | return std::atomic_load_explicit(&exit_mark_atomic, std::memory_order_relaxed); 26 | } 27 | virtual void do_one_task_impl(TASK_T task) {} 28 | inline void do_one_task(TASK_T task) { 29 | do_one_task_impl(task); 30 | progress.fetch_add(1); 31 | } 32 | public: 33 | WorkerThreadBase() : progress(0) {} 34 | virtual void exit() = 0; 35 | void launch() { 36 | worker_thread = std::thread([this](){ 37 | thread_func(); 38 | }); 39 | } 40 | protected: 41 | virtual void thread_func() = 0; 42 | public: 43 | void set_cpu_affinity(std::vector cpu_ids) { 44 | if (worker_thread.native_handle()) { 45 | cpu_set_t cpuset; 46 | CPU_ZERO(&cpuset); 47 | for (auto cpu_id : cpu_ids) { 48 | CPU_SET(cpu_id, &cpuset); 49 | } 50 | int result = pthread_setaffinity_np(worker_thread.native_handle(), sizeof(cpu_set_t), &cpuset); 51 | if (result != 0) { 52 | std::cerr << "Error setting thread affinity: " << std::strerror(result) << std::endl; 53 | } 54 | } 55 | } 56 | virtual progress_handler_t add_one_task(TASK_T task) = 0; 57 | void wait_progress(progress_handler_t handle) { 58 | // todo: handle overflow 59 | while(progress.load() <= handle) {}; 60 | // while(progress.load() <= handle) { std::this_thread::yield(); } 61 | } 62 | virtual ~WorkerThreadBase() {} 63 | }; 64 | 65 | template 66 | class WorkerThreadMutex : public WorkerThreadBase { 67 | std::mutex queue_mutex; 68 | std::condition_variable cv; 69 | public: 70 | WorkerThreadMutex() : WorkerThreadBase() {} 71 | void exit() override { 72 | std::atomic_store_explicit(&this->exit_mark_atomic, true, std::memory_order_relaxed); 73 | 
cv.notify_one(); 74 | if (this->worker_thread.joinable()) { this->worker_thread.join(); } 75 | } 76 | protected: 77 | void thread_func() override { 78 | while (!this->should_exit()) { 79 | std::unique_lock lock(queue_mutex); 80 | cv.wait(lock, [this]{ return !this->queue.empty() || this->should_exit(); }); 81 | 82 | if (this->should_exit()) { 83 | break; 84 | } 85 | 86 | auto current_task = this->queue.front(); 87 | this->queue.pop(); 88 | lock.unlock(); 89 | 90 | this->do_one_task(current_task); 91 | } 92 | } 93 | public: 94 | using progress_handler_t = typename WorkerThreadBase::progress_handler_t; 95 | progress_handler_t add_one_task(TASK_T task) override { 96 | std::lock_guard lock(queue_mutex); 97 | auto ret = this->queued++; 98 | this->queue.push(task); 99 | cv.notify_one(); 100 | return ret; 101 | } 102 | }; 103 | 104 | 105 | 106 | template 107 | class WorkerThreadSpin : public WorkerThreadBase { 108 | AtomicQueueLock queue_lock; 109 | public: 110 | WorkerThreadSpin() : WorkerThreadBase() {} 111 | void exit() override { 112 | std::atomic_store_explicit(&this->exit_mark_atomic, true, std::memory_order_relaxed); 113 | if (this->worker_thread.joinable()) { this->worker_thread.join(); } 114 | } 115 | protected: 116 | void thread_func() override { 117 | while (!this->should_exit()) { 118 | queue_lock.lock(); 119 | if (this->queue.empty()) { 120 | queue_lock.unlock(); 121 | // usleep(10); 122 | } else { 123 | auto current_task = this->queue.front(); 124 | this->queue.pop(); 125 | queue_lock.unlock(); 126 | this->do_one_task(current_task); 127 | } 128 | } 129 | } 130 | public: 131 | using progress_handler_t = typename WorkerThreadBase::progress_handler_t; 132 | progress_handler_t add_one_task(TASK_T task) override { 133 | queue_lock.lock(); 134 | auto ret = this->queued++; 135 | this->queue.push(task); 136 | queue_lock.unlock(); 137 | return ret; 138 | } 139 | }; 140 | 141 | template 142 | using WorkerThread = WorkerThreadSpin; 143 | 144 | class BaseTask { 145 | public: 146 | }; 147 | 148 | class PrefetchMngr; 149 | /** 150 | * Param Fetcher 151 | */ 152 | class CopyTask : public BaseTask { 153 | public: 154 | int start_mem_buf_idx, stop_mem_buf_idx; 155 | bool is_precise = false; 156 | ExpertHandler *expert = nullptr; 157 | CacheMngr::CacheLineOccupancyWaiter lambda_wait = [](){}; 158 | std::string toString() const { 159 | std::stringstream ss; 160 | if (expert) { 161 | ss << expert->toString() << ".[" << start_mem_buf_idx << "," << stop_mem_buf_idx << "), precise " << (is_precise?"true":"false"); 162 | } else { 163 | ss << "null"; 164 | } 165 | return ss.str(); 166 | } 167 | }; 168 | 169 | class FetchScheduleWorker; 170 | 171 | class FetchWorker : public WorkerThread { 172 | ModuleMeta* metas; 173 | FetchScheduleWorker* fetch_schedule_thread; 174 | MemMngrCtx* mem_mngr_ctx; 175 | cudaStream_t stream; 176 | friend class PrefetchMngr; 177 | public: 178 | void init(ModuleMeta* metas, FetchScheduleWorker* fetch_schedule_thread, MemMngrCtx* mem_mngr_ctx, cudaStream_t stream) { 179 | this->metas = metas; 180 | this->fetch_schedule_thread = fetch_schedule_thread; 181 | this->stream = stream; 182 | this->mem_mngr_ctx = mem_mngr_ctx; 183 | } 184 | protected: 185 | void do_one_task_impl(CopyTask *task) override; 186 | }; 187 | 188 | /** 189 | * Expert Unlocker, which tracks expert forward progress in python 190 | */ 191 | class ExpertUnlockWorker : public WorkerThread { 192 | protected: 193 | void do_one_task_impl(ExpertHandler *task) override; 194 | }; 195 | 196 | /** 197 | * Predict Worker 198 | 
*/ 199 | struct PredictJob { 200 | int input_layer_id = 0; 201 | PredictJob() {} 202 | PredictJob(int input_layer_id) : input_layer_id(input_layer_id) {} 203 | }; 204 | class AtomicQueue { 205 | public: 206 | std::queue queue_; 207 | AtomicQueueLock lock_; 208 | void init(int init_val = 0) { 209 | for (int i = 0; i < init_val; i++) { 210 | queue_.push(i); 211 | } 212 | } 213 | void lock() { 214 | lock_.lock(); 215 | } 216 | void unlock() { 217 | lock_.unlock(); 218 | } 219 | void push(int task) { 220 | lock(); 221 | queue_.push(task); 222 | unlock(); 223 | } 224 | int pop(bool return_size = false) { 225 | while (true) { 226 | lock(); 227 | if (queue_.empty()) { 228 | unlock(); 229 | } else { 230 | int task = queue_.front(); 231 | queue_.pop(); 232 | int size = queue_.size(); 233 | unlock(); 234 | if (return_size) { 235 | return size; 236 | } 237 | return task; 238 | } 239 | } 240 | } 241 | int try_pop(bool return_size = false) { 242 | lock(); 243 | if (queue_.empty()) { 244 | unlock(); 245 | return -1; 246 | } 247 | int task = queue_.front(); 248 | queue_.pop(); 249 | int size = queue_.size(); 250 | unlock(); 251 | if (return_size) { 252 | return size; 253 | } 254 | return task; 255 | } 256 | }; 257 | 258 | class SemQueue { 259 | public: 260 | sem_t sem_; 261 | SemQueue() {} 262 | void init(int init_val = 0) { 263 | sem_init(&sem_, 0, init_val); 264 | } 265 | ~SemQueue() { 266 | sem_destroy(&sem_); 267 | } 268 | 269 | void push(int task) { 270 | sem_post(&sem_); 271 | } 272 | int pop(bool return_size = false) { 273 | sem_wait(&sem_); 274 | return 0; 275 | } 276 | int try_pop(bool return_size = false) { 277 | if (sem_trywait(&sem_) == 0) { 278 | // success 279 | return 0; 280 | } 281 | // failed 282 | return -1; 283 | } 284 | }; 285 | 286 | class PredictWorker : public WorkerThread { 287 | FetchScheduleWorker* fetch_schedule_thread; 288 | PredictorBase * predictor; 289 | CacheMngr * cache; 290 | ModuleMeta * metas; 291 | 292 | PrecisionProfiler * precision_profiler; 293 | 294 | SemQueue prefetch_layer_budget; 295 | SemQueue prefetch_layer_progress; 296 | // AtomicQueue prefetch_layer_budget; 297 | // AtomicQueue prefetch_layer_progress; 298 | 299 | friend class PrefetchMngr; 300 | public: 301 | PredictWorker() : WorkerThread() {} 302 | void init(FetchScheduleWorker* fetch_schedule_thread, PredictorBase* predictor, CacheMngr* cache, ModuleMeta* metas) { 303 | this->fetch_schedule_thread = fetch_schedule_thread; 304 | this->predictor = predictor; 305 | this->cache = cache; 306 | this->metas = metas; 307 | 308 | prefetch_layer_budget.init(metas->max_prefetch_layer_distance); 309 | prefetch_layer_progress.init(0); 310 | } 311 | void add_prefetch_layer_budget(); 312 | int consume_prefetch_layer_progress() { 313 | return prefetch_layer_progress.pop(); 314 | } 315 | void on_one_iter_done(); 316 | void on_moe_attn_input_logits_recorded(int layer_id); 317 | void on_moe_layer_logits_recorded(int layer_id); 318 | 319 | protected: 320 | void do_one_task_impl(PredictJob job) override; 321 | }; 322 | -------------------------------------------------------------------------------- /src/sparse_llm_cache/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | from . 
import cpp_worker -------------------------------------------------------------------------------- /src/sparse_llm_cache/expert_cache_inject_accelerate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from functools import wraps 3 | 4 | from accelerate.hooks import AlignDevicesHook, SequentialHook, ModelHook 5 | from accelerate.utils import set_module_tensor_to_device 6 | from torch.nn import Module 7 | from expert_selection_tracer.call_tracer import report_called 8 | from collections import OrderedDict 9 | 10 | def _handle_acc_hook_offload(hook): 11 | if hasattr(hook, "_old_offload"): 12 | return 13 | hook._old_offload = hook.offload 14 | hook.offload = False 15 | 16 | class CacheOffloadHook(ModelHook): 17 | def __init__(self, weights_map): 18 | self.weights_map = weights_map 19 | def pre_forward(self, module, *args, **kwargs): 20 | ExpertCacheMngr.inst().get(self.weights_map.prefix) 21 | return args, kwargs 22 | 23 | def _replace_one_hook_offload(hook: AlignDevicesHook, module): 24 | _handle_acc_hook_offload(hook) 25 | if hook._old_offload: 26 | ExpertCacheMngr.inst().module_map[hook.weights_map.prefix] = module 27 | ExpertCacheMngr.inst().hook_map[hook.weights_map.prefix] = hook 28 | module._hf_hook = SequentialHook(module._hf_hook, CacheOffloadHook(hook.weights_map)) 29 | 30 | # def recursively_replace_pre_forward(module): 31 | # did_replace = False 32 | # if hasattr(module, "_hf_hook"): 33 | # if isinstance(module._hf_hook, AlignDevicesHook): 34 | # did_replace = _replace_one_hook_offload(module._hf_hook, module) or did_replace 35 | # elif isinstance(module._hf_hook, SequentialHook): 36 | # for hook in module._hf_hook.hooks: 37 | # if isinstance(hook, AlignDevicesHook): 38 | # did_replace = _replace_one_hook_offload(hook, module) or did_replace 39 | # for n,m in module.named_modules(): 40 | # if n == "": continue 41 | # did_replace = recursively_replace_pre_forward(m) or did_replace 42 | # if did_replace: 43 | # print(n) 44 | # return did_replace 45 | 46 | def replace_pre_forward(model): 47 | # fixed: named_module is already recursive 48 | for n,module in model.named_modules(): 49 | if n == "": continue 50 | if hasattr(module, "_hf_hook"): 51 | if isinstance(module._hf_hook, AlignDevicesHook): 52 | _replace_one_hook_offload(module._hf_hook, module) 53 | elif isinstance(module._hf_hook, SequentialHook): 54 | for hook in module._hf_hook.hooks: 55 | if isinstance(hook, AlignDevicesHook): 56 | _replace_one_hook_offload(hook, module) 57 | 58 | 59 | class LRUPolicy: 60 | def __init__(self): 61 | # from old to new 62 | self.last_use_order = OrderedDict() 63 | pass 64 | def _choose_to_evict(self): 65 | return next(iter(self.last_use_order)) 66 | def _evict(self, key): 67 | self.last_use_order.pop(key) 68 | def _access(self, key): 69 | if key in self.last_use_order: 70 | self.last_use_order.move_to_end(key) 71 | else: 72 | self.last_use_order[key] = None 73 | def clear(self): 74 | self.last_use_order.clear() 75 | 76 | ''' 77 | Currently supports single device 78 | ''' 79 | 80 | class ExpertCacheMngr: 81 | _inst = None 82 | 83 | @staticmethod 84 | def set(o) : 85 | ExpertCacheMngr._inst = o 86 | 87 | @staticmethod 88 | def inst(): 89 | return ExpertCacheMngr._inst 90 | 91 | def __init__(self, cache_len = 24, exec_device = 0, policy_cls = LRUPolicy): 92 | self.module_map = {} 93 | self.hook_map = {} 94 | self.cached_map : dict[str, any] = {} 95 | self.cache_len = cache_len 96 | self.exec_device = exec_device 97 | self.policy = policy_cls() 
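    # Illustrative setup (assumed workflow inferred from the hooks above, not a verified recipe):
    #   mngr = ExpertCacheMngr(cache_len=24, exec_device=0)
    #   ExpertCacheMngr.set(mngr)
    #   replace_pre_forward(model)  # wraps AlignDevicesHook-managed modules with CacheOffloadHook
    # Each hooked module then calls ExpertCacheMngr.inst().get(prefix) in pre_forward,
    # loading missing weights onto exec_device and evicting via the configured policy (LRU by default).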
98 | def get(self, key : str) -> None: 99 | if key in self.cached_map: 100 | self._handle_hit(key) 101 | else: 102 | self._handle_miss(key) 103 | self._access(key) 104 | 105 | 106 | def _locate_key_to_evict(self) -> str: 107 | return self.policy._choose_to_evict() 108 | 109 | def _evict(self, key, _) -> None: 110 | module = self.module_map[key] 111 | self.cached_map.pop(key) 112 | self.policy._evict(key) 113 | for name, _ in module.named_parameters(): 114 | set_module_tensor_to_device(module, name, "meta") 115 | 116 | def clear_cache(self, simulate_evict = False) -> None: 117 | if simulate_evict: 118 | while len(self.cached_map) > 0: 119 | self._evict(self._locate_key_to_evict()) 120 | else: 121 | self.cached_map.clear() 122 | self.policy.clear() 123 | 124 | def _access(self, key: str) -> None: 125 | self.policy._access(key) 126 | 127 | def _handle_hit(self, key: str)-> None: 128 | pass 129 | 130 | def _handle_miss(self, key : str) -> None: 131 | module = self.module_map[key] 132 | weights_map = self.hook_map[key].weights_map 133 | if len(self.cached_map) >= self.cache_len: 134 | key_to_evict = self._locate_key_to_evict() 135 | self._evict(key_to_evict, module) 136 | self.cached_map[key] = weights_map 137 | for name, _ in module.named_parameters(): 138 | set_module_tensor_to_device(module, name, self.exec_device, value = weights_map[name]) -------------------------------------------------------------------------------- /src/sparse_llm_cache/oracle_cache_policy.py: -------------------------------------------------------------------------------- 1 | from bisect import bisect 2 | from dataclasses import dataclass 3 | import json 4 | from math import inf 5 | import os 6 | import tqdm 7 | 8 | from sortedcontainers import SortedList 9 | 10 | @dataclass 11 | class Config: 12 | n_encoder_layer : int 13 | n_decoder_layer : int 14 | first_expert : int 15 | n_fc : int = 2 16 | expert_step : int = 4 17 | num_expert : int = 128 18 | def num_encoder_moe_layer(self): 19 | return self.n_encoder_layer // self.expert_step 20 | def num_decoder_moe_layer(self): 21 | return self.n_decoder_layer // self.expert_step 22 | def is_encoder_moe_layer(self, moe_layer_id): 23 | return moe_layer_id < self.num_encoder_moe_layer() 24 | def moe_layer_id_to_full_layer_id(self, moe_layer_id): 25 | if not self.is_encoder_moe_layer(moe_layer_id): 26 | moe_layer_id -= self.num_encoder_moe_layer() 27 | return moe_layer_id * self.expert_step + self.first_expert 28 | def iterate_decoder_moe_layer(self): 29 | return range( 30 | self.num_encoder_moe_layer(), 31 | self.num_encoder_moe_layer() + self.num_decoder_moe_layer() 32 | ) 33 | 34 | nllb_config = Config(n_decoder_layer=24, n_encoder_layer=24, n_fc=2, expert_step=4,first_expert=3,num_expert=128) 35 | 36 | def meta_to_module_key(config: Config, moe_layer_id, e_id, fc): 37 | if config.is_encoder_moe_layer(moe_layer_id) : 38 | e_or_d_str = "encoder" 39 | else: 40 | e_or_d_str = "decoder" 41 | layer_id = config.moe_layer_id_to_full_layer_id(moe_layer_id) 42 | return f'model.{e_or_d_str}.layers.{layer_id}.ffn.experts.expert_{e_id}.fc{fc+1}.' 
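# Example (illustrative): with nllb_config, meta_to_module_key(nllb_config, 0, 5, 0)
# returns 'model.encoder.layers.3.ffn.experts.expert_5.fc1.'
# (moe layer 0 is an encoder layer; full layer id = 0 * expert_step + first_expert = 3).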
43 | 44 | def record_one_seq_time_key(seq_id, key, time): 45 | print(seq_id, time, key) 46 | 47 | def traverse_one_seq_time(j, seq_id, config: Config, recorder): 48 | seq_time = 0 49 | 50 | for encoder_moe_layer_id in range(config.num_encoder_moe_layer()): 51 | dedup_expert_in_cur_layer = [ 52 | e for _, expert_list in j[seq_id][str(encoder_moe_layer_id)].items() 53 | for e in expert_list 54 | ] 55 | dedup_expert_in_cur_layer = list(set(dedup_expert_in_cur_layer)) 56 | dedup_expert_in_cur_layer.sort() 57 | for eid in dedup_expert_in_cur_layer: 58 | for fc in range(config.n_fc): 59 | # time = (encoder_moe_layer_id, prompt_token_idx, eid, fc) 60 | key = meta_to_module_key(config, encoder_moe_layer_id, eid, fc) 61 | recorder(seq_id, key, seq_time) 62 | seq_time += 1 63 | 64 | for rply_token_idx in range(get_rply_len(j, seq_id, config)): 65 | for decoder_moe_layer_id in config.iterate_decoder_moe_layer(): 66 | for eid in j[seq_id][str(decoder_moe_layer_id)][str(rply_token_idx)]: 67 | for fc in range(config.n_fc): 68 | # time = (rply_token_idx, decoder_moe_layer_id, eid, fc) 69 | key = meta_to_module_key(config, decoder_moe_layer_id, eid, fc) 70 | recorder(seq_id, key, seq_time) 71 | seq_time += 1 72 | 73 | 74 | 75 | def get_rply_len(j, seq_id, config : Config): 76 | return len(j[seq_id][str(config.num_encoder_moe_layer())]) 77 | 78 | class OraclePolicy: 79 | @staticmethod 80 | def tqdm_wrapper(o): 81 | if 'DISABLE_MOE_CACHE_TQDM' in os.environ: 82 | return o 83 | else: 84 | return tqdm.tqdm(o) 85 | def __init__(self) -> None: 86 | 87 | self.next_use_time_map = {} 88 | self.cur_time = 0 89 | self.cur_seq_id = None 90 | 91 | # list of tuple(next_use_time, key) 92 | self.next_use_time_queue = SortedList() 93 | 94 | def set_cur_seq_id(self, seq_id): 95 | self.cur_seq_id = seq_id 96 | self.cur_time = -1 97 | self.next_use_time_queue.clear() 98 | for key in self.next_use_time_map: 99 | self.next_use_time_map[key] = self._find_next_use_time(key, -1) 100 | self.next_use_time_queue.add((self.next_use_time_map[key], key)) 101 | 102 | def load_expert_trace(self, fname): 103 | with open(fname) as f: 104 | self.original_expert_history = json.load(f) 105 | if "nllb" in fname: 106 | self.config = nllb_config 107 | else: 108 | raise RuntimeError("Unimplemented") 109 | self.module_use_time = {} 110 | def recorder(seq_id, key, time): 111 | if seq_id not in self.module_use_time: 112 | self.module_use_time[seq_id] = {} 113 | if key not in self.module_use_time[seq_id]: 114 | self.module_use_time[seq_id][key] = [] 115 | self.module_use_time[seq_id][key].append(time) 116 | for seq_id in self.tqdm_wrapper(self.original_expert_history): 117 | traverse_one_seq_time(self.original_expert_history, seq_id, self.config, recorder) 118 | 119 | def _choose_to_evict(self): 120 | item = self.next_use_time_queue[-1] 121 | return item[1] 122 | 123 | def _evict(self, key): 124 | assert(self._choose_to_evict() == key) 125 | self.next_use_time_queue.pop(-1) 126 | self.next_use_time_map.pop(key) 127 | 128 | def _find_next_use_time(self, key, cur_time): 129 | if key not in self.module_use_time[self.cur_seq_id]: 130 | return inf 131 | history_time = self.module_use_time[self.cur_seq_id][key] 132 | idx = bisect(history_time, cur_time) 133 | assert(idx == len(history_time) or history_time[idx] > cur_time) 134 | assert(idx == 0 or history_time[idx - 1] <= cur_time) 135 | if idx == len(history_time): 136 | return inf 137 | else: 138 | return history_time[idx] 139 | 140 | def _access(self, key): 141 | if key in self.next_use_time_map: 142 | 
# is in cache, hit 143 | # must be the nearest one 144 | assert(self.next_use_time_queue[0][0] == self.cur_time + 1) 145 | assert(self.next_use_time_queue[0][1] == key) 146 | 147 | self.cur_time = self.next_use_time_map[key] 148 | 149 | next_use_time = self._find_next_use_time(key, self.cur_time) 150 | 151 | self.next_use_time_map[key] = next_use_time 152 | self.next_use_time_queue.pop(0) 153 | self.next_use_time_queue.add((next_use_time, key)) 154 | else: 155 | assert(self.cur_time + 1 == self._find_next_use_time(key, self.cur_time)) 156 | self.cur_time = self._find_next_use_time(key, self.cur_time) 157 | next_use_time = self._find_next_use_time(key, self.cur_time) 158 | self.next_use_time_map[key] = next_use_time 159 | self.next_use_time_queue.add((next_use_time, key)) 160 | 161 | def clear(self): 162 | self.next_use_time_map.clear() 163 | self.next_use_time_queue.clear() -------------------------------------------------------------------------------- /src/sparse_llm_cache/prefetch/oracle_prefetch_policy.py: -------------------------------------------------------------------------------- 1 | from bisect import bisect 2 | from dataclasses import dataclass 3 | import json 4 | from math import inf 5 | import os 6 | import tqdm 7 | 8 | from sortedcontainers import SortedList 9 | 10 | @dataclass 11 | class Config: 12 | n_encoder_layer : int 13 | n_decoder_layer : int 14 | first_expert : int 15 | n_fc : int = 2 16 | expert_step : int = 4 17 | num_expert : int = 128 18 | def num_encoder_moe_layer(self): 19 | return self.n_encoder_layer // self.expert_step 20 | def num_decoder_moe_layer(self): 21 | return self.n_decoder_layer // self.expert_step 22 | def is_encoder_moe_layer(self, moe_layer_id): 23 | return moe_layer_id < self.num_encoder_moe_layer() 24 | def moe_layer_id_to_full_layer_id(self, moe_layer_id): 25 | if not self.is_encoder_moe_layer(moe_layer_id): 26 | moe_layer_id -= self.num_encoder_moe_layer() 27 | return moe_layer_id * self.expert_step + self.first_expert 28 | def iterate_decoder_moe_layer(self): 29 | return range( 30 | self.num_encoder_moe_layer(), 31 | self.num_encoder_moe_layer() + self.num_decoder_moe_layer() 32 | ) 33 | 34 | nllb_config = Config(n_decoder_layer=24, n_encoder_layer=24, n_fc=2, expert_step=4,first_expert=3,num_expert=128) 35 | 36 | def meta_to_module_key(config: Config, moe_layer_id, e_id, fc): 37 | if config.is_encoder_moe_layer(moe_layer_id) : 38 | e_or_d_str = "encoder" 39 | else: 40 | e_or_d_str = "decoder" 41 | layer_id = config.moe_layer_id_to_full_layer_id(moe_layer_id) 42 | return f'model.{e_or_d_str}.layers.{layer_id}.ffn.experts.expert_{e_id}.fc{fc+1}.' 
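# Example (illustrative): meta_to_module_key(nllb_config, 6, 2, 1) targets the first decoder
# MoE layer (moe layer 6 -> decoder full layer 3) and returns
# 'model.decoder.layers.3.ffn.experts.expert_2.fc2.'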
43 | 44 | def record_one_seq_time_key(seq_id, key, time): 45 | print(seq_id, time, key) 46 | 47 | def traverse_one_seq_time(j, seq_id, config: Config, recorder): 48 | seq_time = 0 49 | 50 | for encoder_moe_layer_id in range(config.num_encoder_moe_layer()): 51 | dedup_expert_in_cur_layer = [ 52 | e for _, expert_list in j[seq_id][str(encoder_moe_layer_id)].items() 53 | for e in expert_list 54 | ] 55 | dedup_expert_in_cur_layer = list(set(dedup_expert_in_cur_layer)) 56 | dedup_expert_in_cur_layer.sort() 57 | for eid in dedup_expert_in_cur_layer: 58 | for fc in range(config.n_fc): 59 | # time = (encoder_moe_layer_id, prompt_token_idx, eid, fc) 60 | key = meta_to_module_key(config, encoder_moe_layer_id, eid, fc) 61 | recorder(seq_id, key, seq_time) 62 | seq_time += 1 63 | 64 | for rply_token_idx in range(get_rply_len(j, seq_id, config)): 65 | for decoder_moe_layer_id in config.iterate_decoder_moe_layer(): 66 | for eid in j[seq_id][str(decoder_moe_layer_id)][str(rply_token_idx)]: 67 | for fc in range(config.n_fc): 68 | # time = (rply_token_idx, decoder_moe_layer_id, eid, fc) 69 | key = meta_to_module_key(config, decoder_moe_layer_id, eid, fc) 70 | recorder(seq_id, key, seq_time) 71 | seq_time += 1 72 | 73 | 74 | def get_prompt_len(j, seq_id): 75 | prompt_len = len(j[seq_id]['0']) 76 | while len(j[seq_id]['0'][prompt_len - 1]) == 0: 77 | prompt_len -= 1 78 | return prompt_len 79 | 80 | def get_rply_len(j, seq_id, config : Config): 81 | return len(j[seq_id][str(config.num_encoder_moe_layer())]) 82 | 83 | class OraclePolicy: 84 | @staticmethod 85 | def tqdm_wrapper(o): 86 | if 'DISABLE_MOE_CACHE_TQDM' in os.environ: 87 | return o 88 | else: 89 | return tqdm.tqdm(o) 90 | def __init__(self) -> None: 91 | 92 | self.next_use_time_map = {} 93 | self.cur_time = 0 94 | self.cur_seq_id = None 95 | 96 | # list of tuple(next_use_time, key) 97 | self.next_use_time_queue = SortedList() 98 | 99 | def set_cur_seq_id(self, seq_id): 100 | self.cur_seq_id = seq_id 101 | self.cur_time = -1 102 | self.next_use_time_queue.clear() 103 | for key in self.next_use_time_map: 104 | self.next_use_time_map[key] = self._find_next_use_time(key, -1) 105 | self.next_use_time_queue.add((self.next_use_time_map[key], key)) 106 | 107 | def load_expert_trace(self, fname): 108 | with open(fname) as f: 109 | self.original_expert_history = json.load(f) 110 | if "nllb" in fname: 111 | self.config = nllb_config 112 | else: 113 | raise RuntimeError("Unimplemented") 114 | self.module_use_time = {} 115 | self.time_to_module = {} 116 | def recorder(seq_id, key, time): 117 | if seq_id not in self.module_use_time: 118 | self.module_use_time[seq_id] = {} 119 | self.time_to_module[seq_id] = [] 120 | if key not in self.module_use_time[seq_id]: 121 | self.module_use_time[seq_id][key] = [] 122 | self.module_use_time[seq_id][key].append(time) 123 | assert(len(self.time_to_module[seq_id]) == time) 124 | self.time_to_module[seq_id].append(key) 125 | 126 | for seq_id in self.tqdm_wrapper(self.original_expert_history): 127 | traverse_one_seq_time(self.original_expert_history, seq_id, self.config, recorder) 128 | 129 | def _choose_to_evict(self): 130 | item = self.next_use_time_queue[-1] 131 | return item[1] 132 | 133 | def _evict(self, key): 134 | assert(self._choose_to_evict() == key) 135 | self.next_use_time_queue.pop(-1) 136 | self.next_use_time_map.pop(key) 137 | 138 | def _find_next_use_time(self, key, cur_time): 139 | if key not in self.module_use_time[self.cur_seq_id]: 140 | return inf 141 | history_time = 
self.module_use_time[self.cur_seq_id][key] 142 | idx = bisect(history_time, cur_time) 143 | assert(idx == len(history_time) or history_time[idx] > cur_time) 144 | assert(idx == 0 or history_time[idx - 1] <= cur_time) 145 | if idx == len(history_time): 146 | return inf 147 | else: 148 | return history_time[idx] 149 | 150 | def _access(self, key): 151 | if key in self.next_use_time_map: 152 | # is in cache, hit 153 | # must be the nearest one 154 | assert(self.next_use_time_queue[0][0] == self.cur_time + 1) 155 | assert(self.next_use_time_queue[0][1] == key) 156 | 157 | self.cur_time = self.next_use_time_map[key] 158 | 159 | next_use_time = self._find_next_use_time(key, self.cur_time) 160 | 161 | self.next_use_time_map[key] = next_use_time 162 | self.next_use_time_queue.pop(0) 163 | self.next_use_time_queue.add((next_use_time, key)) 164 | else: 165 | assert(self.cur_time + 1 == self._find_next_use_time(key, self.cur_time)) 166 | self.cur_time = self._find_next_use_time(key, self.cur_time) 167 | next_use_time = self._find_next_use_time(key, self.cur_time) 168 | self.next_use_time_map[key] = next_use_time 169 | self.next_use_time_queue.add((next_use_time, key)) 170 | 171 | def _predict_next(self): 172 | if len(self.time_to_module[self.cur_seq_id]) > self.cur_time + 1: 173 | return self.time_to_module[self.cur_seq_id][self.cur_time + 1] 174 | else: 175 | return None 176 | 177 | def clear(self): 178 | self.next_use_time_map.clear() 179 | self.next_use_time_queue.clear() -------------------------------------------------------------------------------- /src/sparse_llm_cache/profile/profiler.py: -------------------------------------------------------------------------------- 1 | from torch.profiler import profile, record_function, ProfilerActivity 2 | # profiler = profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True, with_stack=True) 3 | # profiler = profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) 4 | # profiler, profile_fname = profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_modules=True), "nllb-transformer-trace-module.json" 5 | # profiler, profile_fname = profile(activities=[ProfilerActivity.CUDA], with_stack=True), "nllb-transformer-trace-cuda-only-stack.json" 6 | 7 | _global_profiler : profile = None 8 | # profile_fname = "nllb-transformer-trace-stack-short.json" 9 | 10 | def create_profile(record_shapes=True,profile_memory=True,with_stack=True, with_cpu=True, with_cuda=True): 11 | global _global_profiler 12 | activities = [] 13 | if with_cpu: 14 | activities.append(ProfilerActivity.CPU) 15 | if with_cuda: 16 | activities.append(ProfilerActivity.CUDA) 17 | _global_profiler = profile( 18 | activities=activities, 19 | with_stack=with_stack, 20 | record_shapes=record_shapes, 21 | profile_memory=profile_memory, 22 | ) 23 | 24 | def start_profile() : 25 | global _global_profiler 26 | _global_profiler.__enter__() 27 | 28 | def stop_profile(): 29 | global _global_profiler 30 | _global_profiler.__exit__(None, None, None) 31 | 32 | def export_trace(fname): 33 | _global_profiler.export_chrome_trace(fname) -------------------------------------------------------------------------------- /src/sparse_llm_cache/utils/common_metas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, asdict 2 | import re 3 | 4 | from .filter import RegexFilter 5 | 6 | @dataclass 7 | class ModelMetas: 8 | num_moe_layer: int 9 
| num_expert_per_layer: int 10 | num_expert_per_token: int 11 | expert_meta_parser: callable 12 | expert_name_filter: callable 13 | moe_attn_name_filter: callable 14 | moe_mlp_name_filter: callable 15 | moe_layer_name_filter: callable 16 | 17 | @staticmethod 18 | def build_deepseek_moe(return_dict = True): 19 | def parse_moe_layer_id(name): 20 | match = re.match(r'.*layers\.(\d+).*', name) 21 | if not match: 22 | return None 23 | return int(match.group(1)) - 1 # first layer is not moe layer 24 | def parse_expert_id(name): 25 | match = re.match(r'.*layers\.\d+\.mlp\.experts\.(\d+).*$', name) 26 | if not match: 27 | return None 28 | return int(match.group(1)) 29 | 30 | def parse_expert_meta_from_name(name): 31 | return (parse_moe_layer_id(name), parse_expert_id(name)) 32 | ret = ModelMetas( 33 | num_moe_layer = 27, 34 | num_expert_per_layer = 64, 35 | num_expert_per_token = 6, 36 | expert_meta_parser = parse_expert_meta_from_name, 37 | expert_name_filter = RegexFilter(r'.*layers\.(\d+)\.mlp\.experts\.(\d+)$'), 38 | moe_attn_name_filter = RegexFilter(r'.*layers\.([1-9]\d*)\.self_attn$'), 39 | moe_mlp_name_filter = RegexFilter(r'.*layers\.([1-9]\d*)\.mlp$'), 40 | moe_layer_name_filter = RegexFilter(r'.*layers\.([1-9]\d*)$') 41 | ) 42 | return asdict(ret) if return_dict else ret 43 | def build_deepseek_moe_simulate(return_dict = True): 44 | def parse_moe_layer_id(name): 45 | match = re.match(r'.*layers\.(\d+).*', name) 46 | if not match: 47 | return None 48 | return int(match.group(1)) 49 | def parse_expert_id(name): 50 | match = re.match(r'.*layers\.\d+\.experts\.(\d+).*$', name) 51 | if not match: 52 | return None 53 | return int(match.group(1)) 54 | 55 | def parse_expert_meta_from_name(name): 56 | return (parse_moe_layer_id(name), parse_expert_id(name)) 57 | ret = ModelMetas( 58 | num_moe_layer = 27, 59 | num_expert_per_layer = 64, 60 | num_expert_per_token = 6, 61 | expert_meta_parser = parse_expert_meta_from_name, 62 | expert_name_filter = RegexFilter(r'.*layers\.(\d+)\.experts\.(\d+)$'), 63 | moe_attn_name_filter = RegexFilter(r'.*layers\.([1-9]\d*)\.self_attn$'), 64 | moe_mlp_name_filter = RegexFilter(r'.*layers\.(\d+).mlp$'), 65 | moe_layer_name_filter = RegexFilter(r'.*layers\.(\d+)$') 66 | ) 67 | return asdict(ret) if return_dict else ret 68 | 69 | def build_deepseek_v2_lite(return_dict = True): 70 | def parse_moe_layer_id(name): 71 | match = re.match(r'.*layers\.(\d+).*', name) 72 | if not match: 73 | return None 74 | return int(match.group(1)) - 1 # first layer is not moe layer 75 | def parse_expert_id(name): 76 | match = re.match(r'.*layers\.\d+\.mlp\.experts\.(\d+).*$', name) 77 | if not match: 78 | return None 79 | return int(match.group(1)) 80 | 81 | def parse_expert_meta_from_name(name): 82 | return (parse_moe_layer_id(name), parse_expert_id(name)) 83 | ret = ModelMetas( 84 | num_moe_layer = 26, 85 | num_expert_per_layer = 64, 86 | num_expert_per_token = 6, 87 | expert_meta_parser = parse_expert_meta_from_name, 88 | expert_name_filter = RegexFilter(r'.*layers\.(\d+)\.mlp\.experts\.(\d+)$'), 89 | moe_attn_name_filter = RegexFilter(r'.*layers\.([1-9]\d*)\.self_attn$'), 90 | moe_mlp_name_filter = RegexFilter(r'.*layers\.([1-9]\d*)\.mlp$'), 91 | moe_layer_name_filter = RegexFilter(r'.*layers\.([1-9]\d*)$') 92 | ) 93 | return asdict(ret) if return_dict else ret 94 | 95 | @staticmethod 96 | def build_qwen_moe(return_dict = True): 97 | def parse_moe_layer_id(name): 98 | match = re.match(r'.*layers\.(\d+).*', name) 99 | if not match: 100 | return None 101 | return int(match.group(1)) 
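        # Note: unlike the deepseek parsers above, no "- 1" offset is applied here; this
        # assumes every Qwen1.5-MoE decoder layer is an MoE layer (consistent with
        # num_moe_layer = 24 below).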
102 | def parse_expert_id(name): 103 | match = re.match(r'.*layers\.\d+\.mlp\.experts\.(\d+).*$', name) 104 | if not match: 105 | return None 106 | return int(match.group(1)) 107 | 108 | def parse_expert_meta_from_name(name): 109 | return (parse_moe_layer_id(name), parse_expert_id(name)) 110 | ret = ModelMetas( 111 | num_moe_layer = 24, 112 | num_expert_per_layer = 60, 113 | num_expert_per_token = 4, 114 | expert_meta_parser = parse_expert_meta_from_name, 115 | expert_name_filter = RegexFilter(r'.*layers\.(\d+)\.mlp\.experts\.(\d+)$'), 116 | moe_attn_name_filter = RegexFilter(r'.*layers\.(\d+)\.self_attn$'), 117 | moe_mlp_name_filter = RegexFilter(r'.*layers\.(\d+)\.mlp$'), 118 | moe_layer_name_filter = RegexFilter(r'.*layers\.(\d+)$') 119 | ) 120 | return asdict(ret) if return_dict else ret 121 | 122 | @staticmethod 123 | def build_qwen2_57b_a14b(return_dict = True): 124 | def parse_moe_layer_id(name): 125 | match = re.match(r'.*layers\.(\d+).*', name) 126 | if not match: 127 | return None 128 | return int(match.group(1)) 129 | def parse_expert_id(name): 130 | match = re.match(r'.*layers\.\d+\.mlp\.experts\.(\d+).*$', name) 131 | if not match: 132 | return None 133 | return int(match.group(1)) 134 | 135 | def parse_expert_meta_from_name(name): 136 | return (parse_moe_layer_id(name), parse_expert_id(name)) 137 | ret = ModelMetas( 138 | num_moe_layer = 28, 139 | num_expert_per_layer = 64, 140 | num_expert_per_token = 8, 141 | expert_meta_parser = parse_expert_meta_from_name, 142 | expert_name_filter = RegexFilter(r'.*layers\.(\d+)\.mlp\.experts\.(\d+)$'), 143 | moe_attn_name_filter = RegexFilter(r'.*layers\.(\d+)\.self_attn$'), 144 | moe_mlp_name_filter = RegexFilter(r'.*layers\.(\d+)\.mlp$'), 145 | moe_layer_name_filter = RegexFilter(r'.*layers\.(\d+)$') 146 | ) 147 | return asdict(ret) if return_dict else ret 148 | 149 | @staticmethod 150 | def build_mixtral(return_dict = True): 151 | def parse_moe_layer_id(name): 152 | match = re.match(r'.*layers\.(\d+).*', name) 153 | if not match: 154 | return None 155 | return int(match.group(1)) 156 | def parse_expert_id(name): 157 | match = re.match(r'.*layers\.\d+\.block_sparse_moe\.experts\.(\d+).*$', name) 158 | if not match: 159 | return None 160 | return int(match.group(1)) 161 | 162 | def parse_expert_meta_from_name(name): 163 | return (parse_moe_layer_id(name), parse_expert_id(name)) 164 | ret = ModelMetas( 165 | num_moe_layer = 32, 166 | num_expert_per_layer = 8, 167 | num_expert_per_token = 2, 168 | expert_meta_parser = parse_expert_meta_from_name, 169 | expert_name_filter = RegexFilter(r'.*layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)$'), 170 | moe_attn_name_filter = RegexFilter(r'.*layers\.(\d+)\.self_attn$'), 171 | moe_mlp_name_filter = RegexFilter(r'.*layers\.(\d+)\.block_sparse_moe$'), 172 | moe_layer_name_filter = RegexFilter(r'.*layers\.(\d+)$') 173 | ) 174 | return asdict(ret) if return_dict else ret 175 | 176 | 177 | predefined_metas = { 178 | 'deepseek-ai/deepseek-moe-16b-chat' : ModelMetas.build_deepseek_moe, 179 | 'deepseek-ai/deepseek-moe-16b-chat-simulate' : ModelMetas.build_deepseek_moe_simulate, 180 | 'deepseek-ai/DeepSeek-V2-Lite-Chat' : ModelMetas.build_deepseek_v2_lite, 181 | 'Qwen/Qwen1.5-MoE-A2.7B-Chat' : ModelMetas.build_qwen_moe, 182 | 'Qwen/Qwen2-57B-A14B-Instruct' : ModelMetas.build_qwen2_57b_a14b, 183 | 'Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4' : ModelMetas.build_qwen2_57b_a14b, 184 | 'mistralai/Mixtral-8x7B-Instruct-v0.1' : ModelMetas.build_mixtral, 185 | 'TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ' : 
ModelMetas.build_mixtral, 186 | } 187 | 188 | def auto_infer_model_metas(model_id, return_dict = True) -> ModelMetas|dict: 189 | if model_id in predefined_metas: 190 | return predefined_metas[model_id](return_dict) 191 | raise ValueError(f"model {model_id} is not supported") -------------------------------------------------------------------------------- /src/sparse_llm_cache/utils/filter.py: -------------------------------------------------------------------------------- 1 | import re 2 | class Filter: 3 | def __init__(self): 4 | pass 5 | def __call__(self, *args, **kwds): 6 | return True 7 | def reverse(self): 8 | return ReverseFilter(self) 9 | 10 | class ReverseFilter(Filter): 11 | def __init__(self, filter : Filter): 12 | self.filter = filter 13 | def __call__(self, *args, **kwds): 14 | return not self.filter(*args, **kwds) 15 | 16 | class RegexFilter(Filter): 17 | def __init__(self, pattern): 18 | self.pattern = pattern 19 | def __call__(self, name, *args, **kwds): 20 | return re.match(self.pattern, name) is not None 21 | -------------------------------------------------------------------------------- /src/sparse_llm_cache/utils/runner_util.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import argparse 3 | 4 | class CustomBooleanAction(argparse.Action): 5 | def __init__(self, option_strings, dest, nargs=None, **kwargs): 6 | if nargs is not None: 7 | raise ValueError("nargs not allowed") 8 | super().__init__(option_strings, dest, **kwargs) 9 | def __call__(self, parser, namespace, values, option_string=None): 10 | values = str(values).lower() 11 | if values in ['true', '1', 'on']: 12 | setattr(namespace, self.dest, True) 13 | elif values in ['false', '0', 'off']: 14 | setattr(namespace, self.dest, False) 15 | else: 16 | raise ValueError("invalid boolean value {}".format(values)) 17 | 18 | def prepare_argparser(parser = None): 19 | if parser is None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--model_id", type=str) 22 | parser.add_argument("--model_revision", type=str) 23 | 24 | parser.add_argument("--cache_rate", type=float) 25 | # parser.add_argument("--cache_len", type=int, default=None) 26 | parser.add_argument("--num_predict_expert_per_layer", type=int) 27 | parser.add_argument("--early_preempt", action=CustomBooleanAction) 28 | parser.add_argument("--chunk_prefetch", action=CustomBooleanAction) 29 | parser.add_argument("--reorder_experts", action=CustomBooleanAction) 30 | 31 | parser.add_argument("--predict_input_mode", type=str, choices=["one_token", "decode_cumsum", "last_use_distance", "weighted_decode_cumsum", "first_moe_attn_input_logits", "moe_attn_input_logits", "moe_layer_logits"]) 32 | parser.add_argument("--predictor_type", type=str, choices=["legacy", "sep"]) 33 | 34 | parser.add_argument("--predictor_model_path", type=str) 35 | parser.add_argument("--layer_predict_interval", type=int) 36 | parser.add_argument("--layer_predict_max_window", type=int) 37 | parser.add_argument("--layer_predict_use_last_output", action=CustomBooleanAction, dest='layer_predict_replace_first_input_with_last_output') 38 | 39 | parser.add_argument("--limit_layer_0_window", type=int) 40 | parser.add_argument("--limit_layer_0_num_predict", type=int) 41 | 42 | parser.add_argument("--max_prefetch_layer_distance", type=int) 43 | parser.add_argument("--cache_only", action=CustomBooleanAction) 44 | parser.add_argument("--per_layer_cache", action=CustomBooleanAction) 45 | parser.add_argument("--promote_hit_in_prefetch", 
action=CustomBooleanAction) 46 | parser.add_argument("--cache_policy", type=str, choices=["lru", "fifo", "nn", "min", "static-1", "static-2"]) 47 | 48 | parser.add_argument("--trace_event", action=CustomBooleanAction) 49 | parser.add_argument("--module_trace_event", action=CustomBooleanAction) 50 | parser.add_argument("--cache_trace_path", type=str) 51 | 52 | parser.add_argument("--max_num_batch", type=int, default=20) 53 | parser.add_argument("--batch_size", type=int, default=1) 54 | parser.add_argument("--dataset", type=str, default='chatgpt-prompts-small') 55 | return parser 56 | 57 | def parse_args(args = None, parser = None): 58 | if parser is None: 59 | parser = prepare_argparser() 60 | args = parser.parse_args(args) 61 | 62 | return vars(args) 63 | -------------------------------------------------------------------------------- /tests/batch-generate-small-prompt/runner.py: -------------------------------------------------------------------------------- 1 | from eval_helper.config import RunConfigBase, OptionCmdLine, OptionEnv, OptionApp, ConfigList, ResultFloat 2 | import os 3 | 4 | my_app = RunConfigBase() 5 | my_app.app = OptionApp('python transformers-app.py', False, False) 6 | my_app.logdir = 'run-logs' 7 | my_app.config_dict = { 8 | 'model_id' : OptionCmdLine('model_id'), 9 | 'model_revision' : OptionCmdLine('model_revision'), 10 | 'dataset' : OptionCmdLine('dataset'), 11 | 'batch_size' : OptionCmdLine('batch_size'), 12 | 'max_num_batch' : OptionCmdLine('max_num_batch'), 13 | 'num_predict_expert_per_layer' : OptionCmdLine('num_predict_expert_per_layer', readable_name='predict', logname='predict'), 14 | 'cache_rate' : OptionCmdLine('cache_rate'), 15 | 'cache_policy' : OptionCmdLine('cache_policy', readable_name='policy', logname='policy'), 16 | 'per_layer_cache' : OptionCmdLine('per_layer_cache', logname=False), 17 | 'reorder_experts' : OptionCmdLine('reorder_experts', logname='reorder'), 18 | 'early_preempt' : OptionCmdLine('early_preempt', logname='early'), 19 | 'max_prefetch_layer_distance' : OptionCmdLine('max_prefetch_layer_distance'), 20 | 'predict_input_mode' : OptionCmdLine('predict_input_mode', logname=False), 21 | 'layer_predict_interval' : OptionCmdLine('layer_predict_interval', logname='p_int'), 22 | 'layer_predict_max_window' : OptionCmdLine('layer_predict_max_window', logname='p_win'), 23 | 'layer_predict_use_last_output' : OptionCmdLine('layer_predict_use_last_output', logname='p_last'), 24 | 'predictor_model_path' : OptionCmdLine('predictor_model_path', readable_name=False, logname=False), 25 | 'trace_event' : OptionCmdLine('trace_event', readable_name=False, logname=False), 26 | 'module_trace_event' : OptionCmdLine('module_trace_event', readable_name=False, logname=False), 27 | 'log_level' : OptionEnv('SPARSE_CACHE_LOG_LEVEL', readable_name=False, logname=False), 28 | 'physical_impl' : OptionEnv('SPARSE_CACHE_PHYSICAL_MEM_IMPL', readable_name='physical_impl', logname=False), 29 | 'logical_impl' : OptionEnv('SPARSE_CACHE_LOGICAL_MEM_IMPL', readable_name='logical_impl', logname=False), 30 | } 31 | 32 | my_app.result_dict = { 33 | 'decode_stage_forward_time' : ResultFloat('decode_stage_forward_time'), 34 | 'prefill_stage_forward_time' : ResultFloat('prefill_stage_forward_time'), 35 | 'decode_stage_hit_rate' : ResultFloat('decode_stage_hit_rate'), 36 | 'prefill_stage_hit_rate' : ResultFloat('prefill_stage_hit_rate'), 37 | } 38 | my_app['per_layer_cache'] = True 39 | base_cfg_list = ConfigList.MakeList(my_app) 40 | 41 | full_list = ConfigList.Empty() 42 | 43 | ### 
options to control: prefetch, reorder, early_preempt 44 | 45 | template_cfg_list = (base_cfg_list.copy() 46 | .override('cache_policy', [ 47 | # 'nn', 48 | 'lru', 49 | ]) 50 | # .override('batch_size', [1,2,4,8]) 51 | .override('batch_size', [1]) 52 | .override('max_num_batch', [3]) 53 | .override('per_layer_cache', [True]) 54 | # .override('reorder_experts', [True, False]) 55 | # .override('early_preempt', [True, False]) 56 | .override('predict_input_mode', ['moe_layer_logits']) 57 | .override('layer_predict_interval', [ 58 | 1, 59 | # 2, 60 | ]) 61 | .override('layer_predict_max_window', [3]) 62 | .override('layer_predict_use_last_output', [ 63 | True, 64 | # False, 65 | ]) 66 | # .override('trace_event', [True]) 67 | # .override('module_trace_event', [True]) 68 | .override('dataset', ['chatgpt-prompts-small']) 69 | ) 70 | 71 | # full_list.concat(template_cfg_list.copy() 72 | # .override('model_id', ['deepseek-ai/deepseek-moe-16b-chat',]) 73 | # # .override('num_predict_expert_per_layer', [0,6,8,]) 74 | # # .override('num_predict_expert_per_layer', [0,6,]) 75 | # # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32, 40]]) 76 | # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32]]) 77 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat/moe-layer-logits']) 78 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 79 | # [0, False, False], ## weak baseline 80 | # [0, False, True], ## storng baseline 81 | # [0, True, True], ## +opt 82 | # # [6, False, False], 83 | # [6, False, True], ## +p 84 | # [6, True, True], ## +p+opt 85 | # ]) 86 | # ) 87 | # full_list.concat(template_cfg_list.copy() 88 | # .override('model_id', ['deepseek-ai/DeepSeek-V2-Lite-Chat',]) 89 | # # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32]]) 90 | # .override('cache_rate', [cache_item/64 for cache_item in [16]]) 91 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--deepseek-ai--DeepSeek-V2-Lite-Chat/moe-layer-logits']) 92 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 93 | # [0, False, False], ## weak baseline 94 | # [0, False, True], ## storng baseline 95 | # # [0, True, True], ## +opt 96 | # # [6, False, False], 97 | # # [6, False, True], ## +p 98 | # [6, True, True], ## +p+opt 99 | # ]) 100 | # ) 101 | # full_list.concat(template_cfg_list.copy() 102 | # .override('model_id', ['Qwen/Qwen1.5-MoE-A2.7B-Chat',]) 103 | # # .override('num_predict_expert_per_layer', [0,4,6,]) 104 | # # .override('num_predict_expert_per_layer', [0,4,]) 105 | # # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36, 42]]) 106 | # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36]]) 107 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--Qwen--Qwen1.5-MoE-A2.7B-Chat/moe-layer-logits']) 108 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 109 | # [0, False, False], ## weak baseline 110 | # [0, False, True], ## storng baseline 111 | # [0, True, True], ## +opt 112 | # # [4, False, False], 113 | # [4, False, True], ## +p 114 | # [4, True, True], ## +p+opt 115 | # ]) 116 | # ) 117 | full_list.concat(template_cfg_list.copy() 118 | .override('model_id', ['Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4',]) 119 | # 
.override('num_predict_expert_per_layer', [0,4,6,]) 120 | # .override('num_predict_expert_per_layer', [0,4,]) 121 | # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36, 42]]) 122 | .override('layer_predict_use_last_output', [False]) 123 | # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36]]) 124 | .override('cache_rate', [cache_item/64 for cache_item in [16]]) 125 | .override('predictor_model_path', ['/code/moe/moe-predict-models/models--Qwen--Qwen2-57B-A14B-Instruct/moe-layer-logits']) 126 | .override('physical_impl', ['cudriver_unified']) 127 | .override('logical_impl', ['cudriver_unified']) 128 | .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 129 | [0, False, False], ## weak baseline 130 | [0, False, True], ## storng baseline 131 | # [0, True, True], ## +opt 132 | # [4, False, False], 133 | # [8, False, True], ## +p 134 | [8, True, True], ## +p+opt 135 | ]) 136 | ) 137 | # full_list.concat(template_cfg_list.copy() 138 | # .override('model_id', ['TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ',]) 139 | # # .override('num_predict_expert_per_layer', [0,2,3,]) 140 | # # .override('num_predict_expert_per_layer', [0,2,]) 141 | # .override('cache_rate', [cache_item/8 for cache_item in [1, 2, 3, 4, 5, 6]]) 142 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--mistralai--Mixtral-8x7B-Instruct-v0.1/moe-layer-logits']) 143 | # .override('physical_impl', ['cudriver_unified']) 144 | # .override('logical_impl', ['cudriver_unified']) 145 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 146 | # [0, False, False], ## weak baseline 147 | # [0, False, True], ## storng baseline 148 | # [0, True, True], ## +opt 149 | # # [2, False, False], 150 | # [2, False, True], ## +p 151 | # [2, True, True], ## +p+opt 152 | # ]) 153 | # ) 154 | 155 | if __name__ == '__main__': 156 | from eval_helper.runner_args import parse_args 157 | args = parse_args() 158 | def call_back_fn(cfg : RunConfigBase): 159 | if cfg['trace_event']: 160 | os.system(f'mv trace.json {cfg.get_log_fname()}.json') 161 | os.system(f'mv trace-cuda.json {cfg.get_log_fname()}-cuda.json') 162 | if 'run' in args.commands: 163 | # full_list.run(mock=args.mock, durable_log=args.durable_log, fail_only=args.fail_only) 164 | full_list.run(mock=args.mock, durable_log=args.durable_log, fail_only=args.fail_only, callback=call_back_fn) 165 | if 'parse' in args.commands: 166 | full_list.override('logdir', [args.logdir]) 167 | full_list.parse() 168 | full_list.to_pdframe([ 169 | 'model_id', 170 | 'batch_size', 171 | # 'per_layer_cache', 172 | 'reorder_experts', 173 | 'early_preempt', 174 | 'num_predict_expert_per_layer', 175 | # 'layer_predict_interval', 176 | 'cache_rate', 177 | # 'layer_predict_use_last_output', 178 | # 'cache_policy', 179 | 'decode_stage_hit_rate', 180 | 'prefill_stage_hit_rate', 181 | 'decode_stage_forward_time', 182 | 'prefill_stage_forward_time', 183 | ]).to_csv(args.parse_output, index=False) 184 | -------------------------------------------------------------------------------- /tests/batch-generate-small-prompt/transformers-app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # os.environ['CUDA_VISIBLE_DEVICES'] = '1' 4 | # os.environ['CUDA_VISIBLE_DEVICES'] = '3' 5 | # os.environ['SPARSE_CACHE_LOG_LEVEL'] = 'TRACE' 6 | # os.environ['SPARSE_CACHE_ENABLE_TRACE'] = '1' 7 | os.environ['HF_HUB_OFFLINE'] = 
"1" 8 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 9 | 10 | from transformers.utils import logging 11 | import torch 12 | from transformers import AutoModelForCausalLM, AutoTokenizer 13 | 14 | import sparse_llm_cache 15 | import time 16 | 17 | from sparse_llm_cache.utils.runner_util import parse_args 18 | cache_configs = parse_args() 19 | for k, v in cache_configs.items(): print(k,v) 20 | 21 | sparse_llm_cache.utils.hack_transformers(**cache_configs, pin_memory=True, enable_model_timer=True) 22 | 23 | print("loading model...") 24 | load_time_start = time.time() 25 | logging.disable_progress_bar() 26 | model_id = cache_configs['model_id'] 27 | torch_dtype = 'auto' 28 | if 'Mixtral' in model_id or 'GPTQ' in model_id: 29 | torch_dtype = None 30 | print("dtype is", torch_dtype) 31 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 32 | tokenizer.pad_token = tokenizer.eos_token 33 | model = AutoModelForCausalLM.from_pretrained( 34 | model_id, 35 | # torch_dtype=torch.float16, 36 | torch_dtype=torch_dtype, 37 | # use_flash_attention_2=True, 38 | local_files_only=True, 39 | device_map=0, 40 | trust_remote_code=True, 41 | revision=cache_configs['model_revision'], 42 | ) 43 | if 'Mixtral' in model_id: 44 | import auto_gptq 45 | model = auto_gptq.exllama_set_max_input_length(model, 7200) 46 | print("loading model...done", time.time() - load_time_start) 47 | 48 | def gen_long(text, do_print=False, max_new_tokens=100): 49 | inputs = tokenizer(text, return_tensors="pt").input_ids.to(f"cuda") 50 | outputs = model.generate(inputs, max_new_tokens=max_new_tokens) 51 | output_str = tokenizer.batch_decode(outputs) 52 | if do_print: 53 | print(output_str, flush=True) 54 | return inputs[0].nelement() 55 | 56 | def gen_batch(text_list, do_print=False, max_new_tokens=100): 57 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 58 | input_len = inputs['input_ids'].shape[1] 59 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 60 | output_len = outputs.shape[1] - input_len 61 | outputs = outputs[:, input_len:] 62 | output_len = outputs.shape[1] 63 | output_str = tokenizer.batch_decode(outputs) 64 | if do_print: 65 | print(text_list, output_str, flush=True) 66 | return input_len, output_len 67 | 68 | 69 | # import json 70 | # # dataset_path = "/nvme/sxn/moe/datasets/vllm-benchmark/shareGPT/ShareGPT_V3_unfiltered_cleaned_split.json" 71 | # # dataset_path = "/nvme/sxn/moe/datasets/vllm-benchmark/shareGPT/small-dataset.json" 72 | # dataset_path = "/code/moe/datasets/vllm-benchmark/shareGPT/small-dataset.json" 73 | # with open(dataset_path) as f: 74 | # dataset = json.load(f) 75 | 76 | # # # Filter out the conversations with less than 2 turns. 77 | # # dataset = [data for data in dataset if len(data["conversations"]) >= 2] 78 | # # # Only keep the first two turns of each conversation. 79 | # # dataset = [(data["conversations"][0]["value"], 80 | # # data["conversations"][1]["value"]) for data in dataset] 81 | 82 | # # Tokenize the prompts and completions. 
83 | # prompts = [prompt for prompt, _ in dataset] 84 | # # completions = [completion for _, completion in dataset] 85 | # # completion_token_ids = tokenizer(completions).input_ids 86 | # # tokenized_dataset = [] 87 | # # for i in range(len(dataset)): 88 | # # output_len = len(completion_token_ids[i]) 89 | # # tokenized_dataset.append((prompts[i], output_len)) 90 | 91 | dataset_path = f'/code/moe/datasets/{cache_configs["dataset"]}/prompt_list.pt' 92 | print(dataset_path) 93 | prompts = torch.load(dataset_path) 94 | 95 | from torch.utils.data import Dataset 96 | class StringListDataset(Dataset): 97 | def __init__(self, string_list): 98 | self.string_list = string_list 99 | def __len__(self): 100 | return len(self.string_list) 101 | def __getitem__(self, idx): 102 | return self.string_list[idx] 103 | ds = StringListDataset(prompts) 104 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 105 | 106 | for seq_id,text_list in enumerate(dl): 107 | if seq_id > cache_configs['max_num_batch']: 108 | print("max_num_batch reached") 109 | break 110 | start_time = time.time() 111 | input_len, output_len = gen_batch(text_list, max_new_tokens=20, do_print=True) 112 | print(input_len, output_len, time.time() - start_time, flush=True) -------------------------------------------------------------------------------- /tests/batch-generate/runner.py: -------------------------------------------------------------------------------- 1 | from eval_helper.config import RunConfigBase, OptionCmdLine, OptionEnv, OptionApp, ConfigList, ResultFloat 2 | import os 3 | 4 | my_app = RunConfigBase() 5 | my_app.app = OptionApp('python transformers-app.py', False, False) 6 | my_app.logdir = 'run-logs' 7 | my_app.config_dict = { 8 | 'model_id' : OptionCmdLine('model_id'), 9 | 'model_revision' : OptionCmdLine('model_revision'), 10 | 'dataset' : OptionCmdLine('dataset'), 11 | 'batch_size' : OptionCmdLine('batch_size'), 12 | 'max_num_batch' : OptionCmdLine('max_num_batch'), 13 | 'num_predict_expert_per_layer' : OptionCmdLine('num_predict_expert_per_layer', readable_name='predict', logname='predict'), 14 | 'cache_rate' : OptionCmdLine('cache_rate'), 15 | 'cache_policy' : OptionCmdLine('cache_policy', readable_name='policy', logname='policy'), 16 | 'per_layer_cache' : OptionCmdLine('per_layer_cache', logname=False), 17 | 'reorder_experts' : OptionCmdLine('reorder_experts', logname='reorder'), 18 | 'early_preempt' : OptionCmdLine('early_preempt', logname='early'), 19 | 'max_prefetch_layer_distance' : OptionCmdLine('max_prefetch_layer_distance'), 20 | 'predict_input_mode' : OptionCmdLine('predict_input_mode', logname=False), 21 | 'layer_predict_interval' : OptionCmdLine('layer_predict_interval', logname='p_int'), 22 | 'layer_predict_max_window' : OptionCmdLine('layer_predict_max_window', logname='p_win'), 23 | 'layer_predict_use_last_output' : OptionCmdLine('layer_predict_use_last_output', logname='p_last'), 24 | 'predictor_model_path' : OptionCmdLine('predictor_model_path', readable_name=False, logname=False), 25 | 'trace_event' : OptionCmdLine('trace_event', readable_name=False, logname=False), 26 | 'module_trace_event' : OptionCmdLine('module_trace_event', readable_name=False, logname=False), 27 | 'log_level' : OptionEnv('SPARSE_CACHE_LOG_LEVEL', readable_name=False, logname=False), 28 | 'physical_impl' : OptionEnv('SPARSE_CACHE_PHYSICAL_MEM_IMPL', readable_name='physical_impl', logname=False), 29 | 'logical_impl' : OptionEnv('SPARSE_CACHE_LOGICAL_MEM_IMPL', readable_name='logical_impl', 
logname=False), 30 | } 31 | 32 | my_app.result_dict = { 33 | 'decode_stage_forward_time' : ResultFloat('decode_stage_forward_time'), 34 | 'prefill_stage_forward_time' : ResultFloat('prefill_stage_forward_time'), 35 | 'decode_stage_hit_rate' : ResultFloat('decode_stage_hit_rate'), 36 | 'prefill_stage_hit_rate' : ResultFloat('prefill_stage_hit_rate'), 37 | } 38 | my_app['per_layer_cache'] = True 39 | base_cfg_list = ConfigList.MakeList(my_app) 40 | 41 | full_list = ConfigList.Empty() 42 | 43 | ### options to control: prefetch, reorder, early_preempt 44 | 45 | template_cfg_list = (base_cfg_list.copy() 46 | .override('cache_policy', [ 47 | # 'nn', 48 | 'lru', 49 | ]) 50 | # .override('batch_size', [1,2,4,8]) 51 | .override('batch_size', [1]) 52 | # .override('max_num_batch', [3]) 53 | .override('per_layer_cache', [True]) 54 | # .override('reorder_experts', [True, False]) 55 | # .override('early_preempt', [True, False]) 56 | .override('predict_input_mode', ['moe_layer_logits']) 57 | .override('layer_predict_interval', [ 58 | 1, 59 | # 2, 60 | ]) 61 | .override('layer_predict_max_window', [3]) 62 | .override('layer_predict_use_last_output', [ 63 | True, 64 | # False, 65 | ]) 66 | # .override('trace_event', [True]) 67 | # .override('module_trace_event', [True]) 68 | .override('dataset', ['chatgpt-prompts-small']) 69 | ) 70 | 71 | # full_list.concat(template_cfg_list.copy() 72 | # .override('model_id', ['deepseek-ai/deepseek-moe-16b-chat',]) 73 | # # .override('num_predict_expert_per_layer', [0,6,8,]) 74 | # # .override('num_predict_expert_per_layer', [0,6,]) 75 | # # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32, 40]]) 76 | # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32]]) 77 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat/moe-layer-logits']) 78 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 79 | # [0, False, False], ## weak baseline 80 | # [0, False, True], ## storng baseline 81 | # [0, True, True], ## +opt 82 | # # [6, False, False], 83 | # [6, False, True], ## +p 84 | # [6, True, True], ## +p+opt 85 | # ]) 86 | # ) 87 | # full_list.concat(template_cfg_list.copy() 88 | # .override('model_id', ['deepseek-ai/DeepSeek-V2-Lite-Chat',]) 89 | # # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32]]) 90 | # .override('cache_rate', [cache_item/64 for cache_item in [16]]) 91 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--deepseek-ai--DeepSeek-V2-Lite-Chat/moe-layer-logits']) 92 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 93 | # [0, False, False], ## weak baseline 94 | # [0, False, True], ## storng baseline 95 | # # [0, True, True], ## +opt 96 | # # [6, False, False], 97 | # # [6, False, True], ## +p 98 | # [6, True, True], ## +p+opt 99 | # ]) 100 | # ) 101 | # full_list.concat(template_cfg_list.copy() 102 | # .override('model_id', ['Qwen/Qwen1.5-MoE-A2.7B-Chat',]) 103 | # # .override('num_predict_expert_per_layer', [0,4,6,]) 104 | # # .override('num_predict_expert_per_layer', [0,4,]) 105 | # # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36, 42]]) 106 | # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36]]) 107 | # .override('predictor_model_path', 
['/code/moe/moe-predict-models/models--Qwen--Qwen1.5-MoE-A2.7B-Chat/moe-layer-logits']) 108 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 109 | # [0, False, False], ## weak baseline 110 | # [0, False, True], ## storng baseline 111 | # [0, True, True], ## +opt 112 | # # [4, False, False], 113 | # [4, False, True], ## +p 114 | # [4, True, True], ## +p+opt 115 | # ]) 116 | # ) 117 | full_list.concat(template_cfg_list.copy() 118 | .override('model_id', ['Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4',]) 119 | # .override('num_predict_expert_per_layer', [0,4,6,]) 120 | # .override('num_predict_expert_per_layer', [0,4,]) 121 | # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36, 42]]) 122 | .override('layer_predict_use_last_output', [False]) 123 | # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36]]) 124 | .override('cache_rate', [cache_item/64 for cache_item in [16]]) 125 | .override('predictor_model_path', ['/code/moe/moe-predict-models/models--Qwen--Qwen2-57B-A14B-Instruct/moe-layer-logits']) 126 | .override('physical_impl', ['cudriver_unified']) 127 | .override('logical_impl', ['cudriver_unified']) 128 | .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 129 | [0, False, False], ## weak baseline 130 | [0, False, True], ## storng baseline 131 | # [0, True, True], ## +opt 132 | # [4, False, False], 133 | # [8, False, True], ## +p 134 | [8, True, True], ## +p+opt 135 | ]) 136 | ) 137 | # full_list.concat(template_cfg_list.copy() 138 | # .override('model_id', ['TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ',]) 139 | # # .override('num_predict_expert_per_layer', [0,2,3,]) 140 | # # .override('num_predict_expert_per_layer', [0,2,]) 141 | # .override('cache_rate', [cache_item/8 for cache_item in [1, 2, 3, 4, 5, 6]]) 142 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--mistralai--Mixtral-8x7B-Instruct-v0.1/moe-layer-logits']) 143 | # .override('physical_impl', ['cudriver_unified']) 144 | # .override('logical_impl', ['cudriver_unified']) 145 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 146 | # [0, False, False], ## weak baseline 147 | # [0, False, True], ## storng baseline 148 | # [0, True, True], ## +opt 149 | # # [2, False, False], 150 | # [2, False, True], ## +p 151 | # [2, True, True], ## +p+opt 152 | # ]) 153 | # ) 154 | 155 | if __name__ == '__main__': 156 | from eval_helper.runner_args import parse_args 157 | args = parse_args() 158 | def call_back_fn(cfg : RunConfigBase): 159 | if cfg['trace_event']: 160 | os.system(f'mv trace.json {cfg.get_log_fname()}.json') 161 | os.system(f'mv trace-cuda.json {cfg.get_log_fname()}-cuda.json') 162 | if 'run' in args.commands: 163 | # full_list.run(mock=args.mock, durable_log=args.durable_log, fail_only=args.fail_only) 164 | full_list.run(mock=args.mock, durable_log=args.durable_log, fail_only=args.fail_only, callback=call_back_fn) 165 | if 'parse' in args.commands: 166 | full_list.override('logdir', [args.logdir]) 167 | full_list.parse() 168 | full_list.to_pdframe([ 169 | 'model_id', 170 | 'batch_size', 171 | # 'per_layer_cache', 172 | 'reorder_experts', 173 | 'early_preempt', 174 | 'num_predict_expert_per_layer', 175 | # 'layer_predict_interval', 176 | 'cache_rate', 177 | # 'layer_predict_use_last_output', 178 | # 'cache_policy', 179 | 'decode_stage_hit_rate', 180 | 'prefill_stage_hit_rate', 181 | 'decode_stage_forward_time', 182 | 
'prefill_stage_forward_time', 183 | ]).to_csv(args.parse_output, index=False) 184 | -------------------------------------------------------------------------------- /tests/batch-generate/transformers-app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # os.environ['CUDA_VISIBLE_DEVICES'] = '1' 4 | # os.environ['CUDA_VISIBLE_DEVICES'] = '3' 5 | # os.environ['SPARSE_CACHE_LOG_LEVEL'] = 'TRACE' 6 | # os.environ['SPARSE_CACHE_ENABLE_TRACE'] = '1' 7 | os.environ['HF_HUB_OFFLINE'] = "1" 8 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 9 | 10 | from transformers.utils import logging 11 | import torch 12 | from transformers import AutoModelForCausalLM, AutoTokenizer 13 | 14 | import sparse_llm_cache 15 | import time 16 | 17 | from sparse_llm_cache.utils.runner_util import parse_args 18 | cache_configs = parse_args() 19 | for k, v in cache_configs.items(): print(k,v) 20 | 21 | sparse_llm_cache.utils.hack_transformers(**cache_configs, pin_memory=True, enable_model_timer=True) 22 | 23 | print("loading model...") 24 | load_time_start = time.time() 25 | logging.disable_progress_bar() 26 | model_id = cache_configs['model_id'] 27 | torch_dtype = 'auto' 28 | if 'Mixtral' in model_id or 'GPTQ' in model_id: 29 | torch_dtype = None 30 | print("dtype is", torch_dtype) 31 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 32 | tokenizer.pad_token = tokenizer.eos_token 33 | model = AutoModelForCausalLM.from_pretrained( 34 | model_id, 35 | # torch_dtype=torch.float16, 36 | torch_dtype=torch_dtype, 37 | # use_flash_attention_2=True, 38 | local_files_only=True, 39 | device_map=0, 40 | trust_remote_code=True, 41 | revision=cache_configs['model_revision'], 42 | ) 43 | if 'Mixtral' in model_id: 44 | import auto_gptq 45 | model = auto_gptq.exllama_set_max_input_length(model, 7200) 46 | print("loading model...done", time.time() - load_time_start) 47 | 48 | def gen_long(text, do_print=False, max_new_tokens=100): 49 | inputs = tokenizer(text, return_tensors="pt").input_ids.to(f"cuda") 50 | outputs = model.generate(inputs, max_new_tokens=max_new_tokens) 51 | output_str = tokenizer.batch_decode(outputs) 52 | if do_print: 53 | print(output_str, flush=True) 54 | return inputs[0].nelement() 55 | 56 | def gen_batch(text_list, do_print=False, max_new_tokens=100): 57 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 58 | input_len = inputs['input_ids'].shape[1] 59 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens) 60 | output_len = outputs.shape[1] - input_len 61 | outputs = outputs[:, input_len:] 62 | output_len = outputs.shape[1] 63 | output_str = tokenizer.batch_decode(outputs) 64 | if do_print: 65 | print(text_list, output_str, flush=True) 66 | return input_len, output_len 67 | 68 | 69 | # import json 70 | # # dataset_path = "/nvme/sxn/moe/datasets/vllm-benchmark/shareGPT/ShareGPT_V3_unfiltered_cleaned_split.json" 71 | # # dataset_path = "/nvme/sxn/moe/datasets/vllm-benchmark/shareGPT/small-dataset.json" 72 | # dataset_path = "/code/moe/datasets/vllm-benchmark/shareGPT/small-dataset.json" 73 | # with open(dataset_path) as f: 74 | # dataset = json.load(f) 75 | 76 | # # # Filter out the conversations with less than 2 turns. 77 | # # dataset = [data for data in dataset if len(data["conversations"]) >= 2] 78 | # # # Only keep the first two turns of each conversation. 
79 | # # dataset = [(data["conversations"][0]["value"], 80 | # # data["conversations"][1]["value"]) for data in dataset] 81 | 82 | # # Tokenize the prompts and completions. 83 | # prompts = [prompt for prompt, _ in dataset] 84 | # # completions = [completion for _, completion in dataset] 85 | # # completion_token_ids = tokenizer(completions).input_ids 86 | # # tokenized_dataset = [] 87 | # # for i in range(len(dataset)): 88 | # # output_len = len(completion_token_ids[i]) 89 | # # tokenized_dataset.append((prompts[i], output_len)) 90 | 91 | dataset_path = f'/code/moe/datasets/{cache_configs["dataset"]}/prompt_list.pt' 92 | print(dataset_path) 93 | prompts = torch.load(dataset_path) 94 | 95 | from torch.utils.data import Dataset 96 | class StringListDataset(Dataset): 97 | def __init__(self, string_list): 98 | self.string_list = string_list 99 | def __len__(self): 100 | return len(self.string_list) 101 | def __getitem__(self, idx): 102 | return self.string_list[idx] 103 | ds = StringListDataset(prompts) 104 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 105 | 106 | for seq_id,text_list in enumerate(dl): 107 | if seq_id > cache_configs['max_num_batch']: 108 | print("max_num_batch reached") 109 | break 110 | start_time = time.time() 111 | try: 112 | input_len, output_len = gen_batch(text_list, max_new_tokens=128, do_print=True) 113 | except Exception as e: 114 | print(f"error at seq {seq_id}") 115 | print(str(e)) 116 | print(input_len, output_len, time.time() - start_time, flush=True) -------------------------------------------------------------------------------- /tests/min-cache/runner.py: -------------------------------------------------------------------------------- 1 | from eval_helper.config import RunConfigBase, OptionCmdLine, OptionEnv, OptionApp, ConfigList, ResultFloat 2 | import os 3 | 4 | my_app = RunConfigBase() 5 | my_app.app = OptionApp('python transformers-app.py', False, False) 6 | my_app.logdir = 'run-logs' 7 | my_app.config_dict = { 8 | 'model_id' : OptionCmdLine('model_id'), 9 | 'model_revision' : OptionCmdLine('model_revision'), 10 | 'dataset' : OptionCmdLine('dataset'), 11 | 'batch_size' : OptionCmdLine('batch_size'), 12 | 'max_num_batch' : OptionCmdLine('max_num_batch'), 13 | 'num_predict_expert_per_layer' : OptionCmdLine('num_predict_expert_per_layer', readable_name='predict', logname='predict'), 14 | 'cache_rate' : OptionCmdLine('cache_rate'), 15 | 'cache_policy' : OptionCmdLine('cache_policy', readable_name='policy', logname='policy'), 16 | 'per_layer_cache' : OptionCmdLine('per_layer_cache', logname=False), 17 | 'reorder_experts' : OptionCmdLine('reorder_experts', logname='reorder'), 18 | 'early_preempt' : OptionCmdLine('early_preempt', logname='early'), 19 | 'max_prefetch_layer_distance' : OptionCmdLine('max_prefetch_layer_distance'), 20 | 'predict_input_mode' : OptionCmdLine('predict_input_mode', logname=False), 21 | 'layer_predict_interval' : OptionCmdLine('layer_predict_interval', logname='p_int'), 22 | 'layer_predict_max_window' : OptionCmdLine('layer_predict_max_window', logname='p_win'), 23 | 'layer_predict_use_last_output' : OptionCmdLine('layer_predict_use_last_output', logname='p_last'), 24 | 'predictor_model_path' : OptionCmdLine('predictor_model_path', readable_name=False, logname=False), 25 | 'cache_trace_path' : OptionCmdLine('cache_trace_path', readable_name=False, logname=False), 26 | 'trace_event' : OptionCmdLine('trace_event', readable_name=False, logname=False), 27 | 'module_trace_event' : 
OptionCmdLine('module_trace_event', readable_name=False, logname=False), 28 | 'log_level' : OptionEnv('SPARSE_CACHE_LOG_LEVEL', readable_name=False, logname=False), 29 | 'physical_impl' : OptionEnv('SPARSE_CACHE_PHYSICAL_MEM_IMPL', readable_name='physical_impl', logname=False), 30 | 'logical_impl' : OptionEnv('SPARSE_CACHE_LOGICAL_MEM_IMPL', readable_name='logical_impl', logname=False), 31 | } 32 | 33 | my_app.result_dict = { 34 | 'decode_stage_forward_time' : ResultFloat('decode_stage_forward_time'), 35 | 'prefill_stage_forward_time' : ResultFloat('prefill_stage_forward_time'), 36 | 'decode_stage_hit_rate' : ResultFloat('decode_stage_hit_rate'), 37 | 'prefill_stage_hit_rate' : ResultFloat('prefill_stage_hit_rate'), 38 | } 39 | my_app['per_layer_cache'] = True 40 | base_cfg_list = ConfigList.MakeList(my_app) 41 | 42 | full_list = ConfigList.Empty() 43 | 44 | ### options to control: prefetch, reorder, early_preempt 45 | 46 | template_cfg_list = (base_cfg_list.copy() 47 | .override('cache_policy', [ 48 | # 'nn', 49 | # 'lru', 50 | 'min', 51 | ]) 52 | # .override('batch_size', [1,2,4,8]) 53 | .override('batch_size', [1]) 54 | # .override('max_num_batch', [3]) 55 | .override('per_layer_cache', [True]) 56 | # .override('reorder_experts', [True, False]) 57 | # .override('early_preempt', [True, False]) 58 | .override('predict_input_mode', ['moe_layer_logits']) 59 | .override('layer_predict_interval', [ 60 | 1, 61 | # 2, 62 | ]) 63 | .override('layer_predict_max_window', [3]) 64 | .override('layer_predict_use_last_output', [ 65 | # True, 66 | False, 67 | ]) 68 | # .override('trace_event', [True]) 69 | # .override('module_trace_event', [True]) 70 | .override('dataset', ['chatgpt-prompts-small']) 71 | ) 72 | 73 | full_list.concat(template_cfg_list.copy() 74 | .override('model_id', ['deepseek-ai/deepseek-moe-16b-chat',]) 75 | # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32]]) 76 | .override('cache_rate', [cache_item/64 for cache_item in [16]]) 77 | .override('predictor_model_path', ['/code/moe/moe-predict-models/models--deepseek-ai--deepseek-moe-16b-chat/moe-layer-logits']) 78 | .override('cache_trace_path', ['/code/moe/moe-traces/min-cache-trace/deepseek-moe-chatgpt-prompt-small-0812']) 79 | .hyper_override(['cache_policy', 'num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 80 | # [0, False, False], ## weak baseline 81 | # [0, False, True], ## storng baseline 82 | # ['min', 0, True, True], ## +opt 83 | # ['lru', 0, True, True], ## +opt 84 | # [6, False, True], ## +p 85 | ['min', 6, True, True], ## +p+opt 86 | # ['lru', 6, True, True], ## +p+opt 87 | ]) 88 | ) 89 | # full_list.concat(template_cfg_list.copy() 90 | # .override('model_id', ['deepseek-ai/DeepSeek-V2-Lite-Chat',]) 91 | # # .override('cache_rate', [cache_item/64 for cache_item in [1, 2, 4, 8, 12, 16, 24, 32]]) 92 | # .override('cache_rate', [cache_item/64 for cache_item in [16]]) 93 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--deepseek-ai--DeepSeek-V2-Lite-Chat/moe-layer-logits']) 94 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 95 | # [0, False, False], ## weak baseline 96 | # [0, False, True], ## storng baseline 97 | # # [0, True, True], ## +opt 98 | # # [6, False, False], 99 | # # [6, False, True], ## +p 100 | # [6, True, True], ## +p+opt 101 | # ]) 102 | # ) 103 | # full_list.concat(template_cfg_list.copy() 104 | # .override('model_id', ['Qwen/Qwen1.5-MoE-A2.7B-Chat',]) 105 | # # 
.override('num_predict_expert_per_layer', [0,4,6,]) 106 | # # .override('num_predict_expert_per_layer', [0,4,]) 107 | # # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36, 42]]) 108 | # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36]]) 109 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--Qwen--Qwen1.5-MoE-A2.7B-Chat/moe-layer-logits']) 110 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 111 | # [0, False, False], ## weak baseline 112 | # [0, False, True], ## storng baseline 113 | # [0, True, True], ## +opt 114 | # # [4, False, False], 115 | # [4, False, True], ## +p 116 | # [4, True, True], ## +p+opt 117 | # ]) 118 | # ) 119 | # full_list.concat(template_cfg_list.copy() 120 | # .override('model_id', ['Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4',]) 121 | # # .override('num_predict_expert_per_layer', [0,4,6,]) 122 | # # .override('num_predict_expert_per_layer', [0,4,]) 123 | # # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36, 42]]) 124 | # .override('layer_predict_use_last_output', [False]) 125 | # # .override('cache_rate', [cache_item/60 for cache_item in [1, 2, 4, 8, 12, 16, 24, 30, 36]]) 126 | # .override('cache_rate', [cache_item/64 for cache_item in [16]]) 127 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--Qwen--Qwen2-57B-A14B-Instruct/moe-layer-logits']) 128 | # .override('physical_impl', ['cudriver_unified']) 129 | # .override('logical_impl', ['cudriver_unified']) 130 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 131 | # [0, False, False], ## weak baseline 132 | # [0, False, True], ## storng baseline 133 | # # [0, True, True], ## +opt 134 | # # [4, False, False], 135 | # # [8, False, True], ## +p 136 | # [8, True, True], ## +p+opt 137 | # ]) 138 | # ) 139 | # full_list.concat(template_cfg_list.copy() 140 | # .override('model_id', ['TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ',]) 141 | # # .override('num_predict_expert_per_layer', [0,2,3,]) 142 | # # .override('num_predict_expert_per_layer', [0,2,]) 143 | # .override('cache_rate', [cache_item/8 for cache_item in [1, 2, 3, 4, 5, 6]]) 144 | # .override('predictor_model_path', ['/code/moe/moe-predict-models/models--mistralai--Mixtral-8x7B-Instruct-v0.1/moe-layer-logits']) 145 | # .override('physical_impl', ['cudriver_unified']) 146 | # .override('logical_impl', ['cudriver_unified']) 147 | # .hyper_override(['num_predict_expert_per_layer', 'reorder_experts', 'early_preempt'], [ 148 | # [0, False, False], ## weak baseline 149 | # [0, False, True], ## storng baseline 150 | # [0, True, True], ## +opt 151 | # # [2, False, False], 152 | # [2, False, True], ## +p 153 | # [2, True, True], ## +p+opt 154 | # ]) 155 | # ) 156 | 157 | if __name__ == '__main__': 158 | from eval_helper.runner_args import parse_args 159 | args = parse_args() 160 | def call_back_fn(cfg : RunConfigBase): 161 | if cfg['trace_event']: 162 | os.system(f'mv trace.json {cfg.get_log_fname()}.json') 163 | os.system(f'mv trace-cuda.json {cfg.get_log_fname()}-cuda.json') 164 | if 'run' in args.commands: 165 | # full_list.run(mock=args.mock, durable_log=args.durable_log, fail_only=args.fail_only) 166 | full_list.run(mock=args.mock, durable_log=args.durable_log, fail_only=args.fail_only, callback=call_back_fn) 167 | if 'parse' in args.commands: 168 | full_list.override('logdir', [args.logdir]) 169 | full_list.parse() 170 | 
full_list.to_pdframe([ 171 | 'model_id', 172 | 'batch_size', 173 | # 'per_layer_cache', 174 | 'reorder_experts', 175 | 'early_preempt', 176 | 'num_predict_expert_per_layer', 177 | # 'layer_predict_interval', 178 | 'cache_rate', 179 | # 'layer_predict_use_last_output', 180 | # 'cache_policy', 181 | 'decode_stage_hit_rate', 182 | 'prefill_stage_hit_rate', 183 | 'decode_stage_forward_time', 184 | 'prefill_stage_forward_time', 185 | ]).to_csv(args.parse_output, index=False) 186 | -------------------------------------------------------------------------------- /tests/min-cache/transformers-app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # os.environ['CUDA_VISIBLE_DEVICES'] = '1' 4 | # os.environ['CUDA_VISIBLE_DEVICES'] = '3' 5 | # os.environ['SPARSE_CACHE_LOG_LEVEL'] = 'TRACE' 6 | # os.environ['SPARSE_CACHE_ENABLE_TRACE'] = '1' 7 | os.environ['HF_HUB_OFFLINE'] = "1" 8 | os.environ['HUGGINGFACE_OFFLINE'] = "1" 9 | 10 | from transformers.utils import logging 11 | import torch 12 | from transformers import AutoModelForCausalLM, AutoTokenizer 13 | 14 | import sparse_llm_cache 15 | import time 16 | 17 | from sparse_llm_cache.utils.runner_util import parse_args 18 | cache_configs = parse_args() 19 | for k, v in cache_configs.items(): print(k,v) 20 | 21 | sparse_llm_cache.utils.hack_transformers(**cache_configs, pin_memory=True, enable_model_timer=True) 22 | 23 | print("loading model...") 24 | load_time_start = time.time() 25 | logging.disable_progress_bar() 26 | model_id = cache_configs['model_id'] 27 | torch_dtype = 'auto' 28 | if 'Mixtral' in model_id or 'GPTQ' in model_id: 29 | torch_dtype = None 30 | print("dtype is", torch_dtype) 31 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 32 | tokenizer.pad_token = tokenizer.eos_token 33 | model = AutoModelForCausalLM.from_pretrained( 34 | model_id, 35 | # torch_dtype=torch.float16, 36 | torch_dtype=torch_dtype, 37 | # use_flash_attention_2=True, 38 | local_files_only=True, 39 | device_map=0, 40 | trust_remote_code=True, 41 | revision=cache_configs['model_revision'], 42 | ) 43 | 44 | if cache_configs['cache_policy'] == 'min': 45 | from sparse_llm_cache.utils import RegexFilter, recursive_traverse_childrens 46 | gate_score_map = torch.load(f"{cache_configs['cache_trace_path']}/gate_score_map.pt") 47 | for gate_score_list in gate_score_map.values(): 48 | for i in range(len(gate_score_list)): 49 | gate_score_list[i] = gate_score_list[i].to("cuda") 50 | gate_score_list = None 51 | 52 | next_idx = 0 53 | def replace_gate_report(model, filter): 54 | def f(module, name): 55 | def new_report_gate_scores(_): 56 | global next_idx 57 | ret = gate_score_list[next_idx] 58 | next_idx += 1 59 | return ret 60 | module.report_gate_scores = new_report_gate_scores 61 | recursive_traverse_childrens(model, f, filter) 62 | 63 | replace_gate_report(model, filter = RegexFilter(r'.*layers\.([1-9]\d*)\.mlp.gate$')) 64 | 65 | if 'Mixtral' in model_id: 66 | import auto_gptq 67 | model = auto_gptq.exllama_set_max_input_length(model, 7200) 68 | print("loading model...done", time.time() - load_time_start) 69 | 70 | def gen_long(text, do_print=False, max_new_tokens=100): 71 | inputs = tokenizer(text, return_tensors="pt").input_ids.to(f"cuda") 72 | outputs = model.generate(inputs, max_new_tokens=max_new_tokens) 73 | output_str = tokenizer.batch_decode(outputs) 74 | if do_print: 75 | print(output_str, flush=True) 76 | return inputs[0].nelement() 77 | 78 | def gen_batch(text_list, do_print=False, 
max_new_tokens=100): 79 | inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(f"cuda") # input_ids, attention_mask 80 | input_len = inputs['input_ids'].shape[1] 81 | outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, num_beams=1) 82 | output_len = outputs.shape[1] - input_len 83 | outputs = outputs[:, input_len:] 84 | output_len = outputs.shape[1] 85 | output_str = tokenizer.batch_decode(outputs) 86 | if do_print: 87 | print(text_list, output_str, flush=True) 88 | return input_len, output_len 89 | 90 | 91 | # import json 92 | # # dataset_path = "/nvme/sxn/moe/datasets/vllm-benchmark/shareGPT/ShareGPT_V3_unfiltered_cleaned_split.json" 93 | # # dataset_path = "/nvme/sxn/moe/datasets/vllm-benchmark/shareGPT/small-dataset.json" 94 | # dataset_path = "/code/moe/datasets/vllm-benchmark/shareGPT/small-dataset.json" 95 | # with open(dataset_path) as f: 96 | # dataset = json.load(f) 97 | 98 | # # # Filter out the conversations with less than 2 turns. 99 | # # dataset = [data for data in dataset if len(data["conversations"]) >= 2] 100 | # # # Only keep the first two turns of each conversation. 101 | # # dataset = [(data["conversations"][0]["value"], 102 | # # data["conversations"][1]["value"]) for data in dataset] 103 | 104 | # # Tokenize the prompts and completions. 105 | # prompts = [prompt for prompt, _ in dataset] 106 | # # completions = [completion for _, completion in dataset] 107 | # # completion_token_ids = tokenizer(completions).input_ids 108 | # # tokenized_dataset = [] 109 | # # for i in range(len(dataset)): 110 | # # output_len = len(completion_token_ids[i]) 111 | # # tokenized_dataset.append((prompts[i], output_len)) 112 | 113 | dataset_path = f'/code/moe/datasets/{cache_configs["dataset"]}/prompt_list.pt' 114 | print(dataset_path) 115 | prompts = torch.load(dataset_path) 116 | 117 | from torch.utils.data import Dataset 118 | class StringListDataset(Dataset): 119 | def __init__(self, string_list): 120 | self.string_list = string_list 121 | def __len__(self): 122 | return len(self.string_list) 123 | def __getitem__(self, idx): 124 | return self.string_list[idx] 125 | ds = StringListDataset(prompts) 126 | dl = torch.utils.data.DataLoader(ds, batch_size=cache_configs['batch_size'], shuffle=False) 127 | 128 | for seq_id,text_list in enumerate(dl): 129 | if seq_id >= cache_configs['max_num_batch']: 130 | print("max_num_batch reached") 131 | break 132 | start_time = time.time() 133 | if cache_configs['cache_policy'] == 'min': 134 | model._prefetch_mngr.cache.set_cur_seq(seq_id) 135 | gate_score_list = gate_score_map[seq_id] 136 | next_idx = 0 137 | input_len, output_len = gen_batch(text_list, max_new_tokens=len(gate_score_list)//model._prefetch_mngr.metas.num_layer, do_print=True) 138 | print(input_len, output_len, time.time() - start_time, flush=True) --------------------------------------------------------------------------------
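The listing above is easiest to sanity-check with a small driver. The sketch below is illustrative only: the model id and module name are assumptions for the example, not values required by the library. It shows how `auto_infer_model_metas` (src/sparse_llm_cache/utils/common_metas.py) resolves per-model metadata and how the returned `RegexFilter`s and `expert_meta_parser` classify module names, which is what the runner scripts rely on before handing their configs to `sparse_llm_cache.utils.hack_transformers`.

```python
# Minimal sketch (assumed model id and module name, for illustration only).
from sparse_llm_cache.utils.common_metas import auto_infer_model_metas

metas = auto_infer_model_metas('deepseek-ai/deepseek-moe-16b-chat')  # returns a dict by default

name = 'model.layers.3.mlp.experts.17'    # an expert sub-module name
print(metas['expert_name_filter'](name))  # True: matches the expert regex
print(metas['expert_meta_parser'](name))  # (2, 17): (moe layer id, expert id); layer 0 is dense, hence the -1 offset
print(metas['num_moe_layer'], metas['num_expert_per_layer'], metas['num_expert_per_token'])  # 27 64 6
```

Similarly, the global profiler in src/sparse_llm_cache/profile/profiler.py is a thin wrapper around `torch.profiler`; a minimal sketch of its intended call order (the workload and output filename are placeholders) is:

```python
# Minimal sketch of the create/start/stop/export call order.
import torch
from sparse_llm_cache.profile.profiler import create_profile, start_profile, stop_profile, export_trace

create_profile(with_stack=False)  # builds the module-level profiler (CPU + CUDA activities by default)
start_profile()
_ = torch.randn(1024, 1024) @ torch.randn(1024, 1024)  # placeholder workload to be profiled
stop_profile()
export_trace("trace.json")        # Chrome-trace JSON, viewable in chrome://tracing or Perfetto
```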